Fix for warning: 'res' may be used uninitialized in this function
iscsi-scst/kernel/nthread.c
1 /*
2  *  Network threads.
3  *
4  *  Copyright (C) 2004 - 2005 FUJITA Tomonori <tomof@acm.org>
5  *  Copyright (C) 2007 - 2009 Vladislav Bolkhovitin
6  *  Copyright (C) 2007 - 2009 ID7 Ltd.
7  *
8  *  This program is free software; you can redistribute it and/or
9  *  modify it under the terms of the GNU General Public License
10  *  as published by the Free Software Foundation.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  */
17
18 #include <linux/sched.h>
19 #include <linux/file.h>
20 #include <linux/kthread.h>
21 #include <asm/ioctls.h>
22 #include <linux/delay.h>
23 #include <net/tcp.h>
24
25 #include "iscsi.h"
26 #include "digest.h"
27
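/*
 * Receive-side state machine, driven by process_read_io() below: a PDU is
 * read as BHS -> optional AHS -> optional header digest -> command
 * start/continue -> data -> optional padding -> optional data digest -> end.
 */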
28 enum rx_state {
29         RX_INIT_BHS, /* Must be zero for better "switch" optimization. */
30         RX_BHS,
31         RX_CMD_START,
32         RX_DATA,
33         RX_END,
34
35         RX_CMD_CONTINUE,
36         RX_INIT_HDIGEST,
37         RX_CHECK_HDIGEST,
38         RX_INIT_DDIGEST,
39         RX_CHECK_DDIGEST,
40         RX_AHS,
41         RX_PADDING,
42 };
43
44 enum tx_state {
45         TX_INIT = 0, /* Must be zero for better "switch" optimization. */
46         TX_BHS_DATA,
47         TX_INIT_PADDING,
48         TX_PADDING,
49         TX_INIT_DDIGEST,
50         TX_DDIGEST,
51         TX_END,
52 };
53
54 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
55 static void iscsi_check_closewait(struct iscsi_conn *conn)
56 {
57         struct iscsi_cmnd *cmnd;
58
59         TRACE_ENTRY();
60
61         TRACE_CONN_CLOSE_DBG("conn %p, sk_state %d", conn,
62                 conn->sock->sk->sk_state);
63
64         if (conn->sock->sk->sk_state != TCP_CLOSE) {
65                 TRACE_CONN_CLOSE_DBG("conn %p, skipping", conn);
66                 goto out;
67         }
68
69         /*
70          * No data are going to be sent, so all queued buffers can be freed
71          * now. In many cases TCP does that only in close(), but we can't rely
72          * on user space to call it.
73          */
74
75 again:
76         spin_lock_bh(&conn->cmd_list_lock);
77         list_for_each_entry(cmnd, &conn->cmd_list, cmd_list_entry) {
78                 struct iscsi_cmnd *rsp;
79                 int restart = 0;
80
81                 TRACE_CONN_CLOSE_DBG("cmd %p, scst_state %x, data_waiting %d, "
82                         "ref_cnt %d, parent_req %p, net_ref_cnt %d, sg %p",
83                         cmnd, cmnd->scst_state, cmnd->data_waiting,
84                         atomic_read(&cmnd->ref_cnt), cmnd->parent_req,
85                         atomic_read(&cmnd->net_ref_cnt), cmnd->sg);
86
87                 sBUG_ON(cmnd->parent_req != NULL);
88
89                 if (cmnd->sg != NULL) {
90                         int i;
91
92                         if (cmnd_get_check(cmnd))
93                                 continue;
94
95                         for (i = 0; i < cmnd->sg_cnt; i++) {
96                                 struct page *page = sg_page(&cmnd->sg[i]);
97                                 TRACE_CONN_CLOSE_DBG("page %p, net_priv %p, "
98                                         "_count %d", page, page->net_priv,
99                                         atomic_read(&page->_count));
100
101                                 if (page->net_priv != NULL) {
102                                         if (restart == 0) {
103                                                 spin_unlock_bh(&conn->cmd_list_lock);
104                                                 restart = 1;
105                                         }
106                                         while (page->net_priv != NULL)
107                                                 iscsi_put_page_callback(page);
108                                 }
109                         }
110                         cmnd_put(cmnd);
111
112                         if (restart)
113                                 goto again;
114                 }
115
116                 spin_lock_bh(&cmnd->rsp_cmd_lock);
117                 list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
118                                 rsp_cmd_list_entry) {
119                         TRACE_CONN_CLOSE_DBG("  rsp %p, ref_cnt %d, "
120                                 "net_ref_cnt %d, sg %p",
121                                 rsp, atomic_read(&rsp->ref_cnt),
122                                 atomic_read(&rsp->net_ref_cnt), rsp->sg);
123
124                         if ((rsp->sg != cmnd->sg) && (rsp->sg != NULL)) {
125                                 int i;
126
127                                 if (cmnd_get_check(rsp))
128                                         continue;
129
130                                 for (i = 0; i < rsp->sg_cnt; i++) {
131                                         struct page *page =
132                                                 sg_page(&rsp->sg[i]);
133                                         TRACE_CONN_CLOSE_DBG(
134                                                 "    page %p, net_priv %p, "
135                                                 "_count %d",
136                                                 page, page->net_priv,
137                                                 atomic_read(&page->_count));
138
139                                         if (page->net_priv != NULL) {
140                                                 if (restart == 0) {
141                                                         spin_unlock_bh(&cmnd->rsp_cmd_lock);
142                                                         spin_unlock_bh(&conn->cmd_list_lock);
143                                                         restart = 1;
144                                                 }
145                                                 while (page->net_priv != NULL)
146                                                         iscsi_put_page_callback(page);
147                                         }
148                                 }
149                                 cmnd_put(rsp);
150
151                                 if (restart)
152                                         goto again;
153                         }
154                 }
155                 spin_unlock_bh(&cmnd->rsp_cmd_lock);
156         }
157         spin_unlock_bh(&conn->cmd_list_lock);
158
159 out:
160         TRACE_EXIT();
161         return;
162 }
163 #else
164 static inline void iscsi_check_closewait(struct iscsi_conn *conn) {}
165 #endif
166
167 static void free_pending_commands(struct iscsi_conn *conn)
168 {
169         struct iscsi_session *session = conn->session;
170         struct list_head *pending_list = &session->pending_list;
171         int req_freed;
172         struct iscsi_cmnd *cmnd;
173
174         spin_lock(&session->sn_lock);
175         do {
176                 req_freed = 0;
177                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
178                         TRACE_CONN_CLOSE_DBG("Pending cmd %p "
179                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
180                                 cmnd, conn, cmnd->pdu.bhs.sn,
181                                 session->exp_cmd_sn);
182                         if ((cmnd->conn == conn) &&
183                             (session->exp_cmd_sn == cmnd->pdu.bhs.sn)) {
184                                 TRACE_CONN_CLOSE_DBG("Freeing pending cmd %p",
185                                         cmnd);
186
187                                 list_del(&cmnd->pending_list_entry);
188                                 cmnd->pending = 0;
189
190                                 session->exp_cmd_sn++;
191
192                                 spin_unlock(&session->sn_lock);
193
194                                 req_cmnd_release_force(cmnd, 0);
195
196                                 req_freed = 1;
197                                 spin_lock(&session->sn_lock);
198                                 break;
199                         }
200                 }
201         } while (req_freed);
202         spin_unlock(&session->sn_lock);
203
204         return;
205 }
206
207 static void free_orphaned_pending_commands(struct iscsi_conn *conn)
208 {
209         struct iscsi_session *session = conn->session;
210         struct list_head *pending_list = &session->pending_list;
211         int req_freed;
212         struct iscsi_cmnd *cmnd;
213
214         spin_lock(&session->sn_lock);
215         do {
216                 req_freed = 0;
217                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
218                         TRACE_CONN_CLOSE_DBG("Pending cmd %p "
219                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
220                                 cmnd, conn, cmnd->pdu.bhs.sn,
221                                 session->exp_cmd_sn);
222                         if (cmnd->conn == conn) {
223                                 PRINT_ERROR("Freeing orphaned pending cmd %p",
224                                             cmnd);
225
226                                 list_del(&cmnd->pending_list_entry);
227                                 cmnd->pending = 0;
228
229                                 if (session->exp_cmd_sn == cmnd->pdu.bhs.sn)
230                                         session->exp_cmd_sn++;
231
232                                 spin_unlock(&session->sn_lock);
233
234                                 req_cmnd_release_force(cmnd, 0);
235
236                                 req_freed = 1;
237                                 spin_lock(&session->sn_lock);
238                                 break;
239                         }
240                 }
241         } while (req_freed);
242         spin_unlock(&session->sn_lock);
243
244         return;
245 }
246
247 #ifdef CONFIG_SCST_DEBUG
248 static void trace_conn_close(struct iscsi_conn *conn)
249 {
250         struct iscsi_cmnd *cmnd;
251 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
252         struct iscsi_cmnd *rsp;
253 #endif
254
255 #if 0
256         if (time_after(jiffies, start_waiting + 10*HZ))
257                 trace_flag |= TRACE_CONN_OC_DBG;
258 #endif
259
260         spin_lock_bh(&conn->cmd_list_lock);
261         list_for_each_entry(cmnd, &conn->cmd_list,
262                         cmd_list_entry) {
263                 TRACE_CONN_CLOSE_DBG(
264                         "cmd %p, scst_state %x, scst_cmd state %d, "
265                         "data_waiting %d, ref_cnt %d, sn %u, "
266                         "parent_req %p, pending %d",
267                         cmnd, cmnd->scst_state,
268                         (cmnd->parent_req && cmnd->scst_cmd) ?
269                                 cmnd->scst_cmd->state : -1,
270                         cmnd->data_waiting, atomic_read(&cmnd->ref_cnt),
271                         cmnd->pdu.bhs.sn, cmnd->parent_req, cmnd->pending);
272 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
273                 TRACE_CONN_CLOSE_DBG("net_ref_cnt %d, sg %p",
274                         atomic_read(&cmnd->net_ref_cnt),
275                         cmnd->sg);
276                 if (cmnd->sg != NULL) {
277                         int i;
278                         for (i = 0; i < cmnd->sg_cnt; i++) {
279                                 struct page *page = sg_page(&cmnd->sg[i]);
280                                 TRACE_CONN_CLOSE_DBG("page %p, "
281                                         "net_priv %p, _count %d",
282                                         page, page->net_priv,
283                                         atomic_read(&page->_count));
284                         }
285                 }
286
287                 sBUG_ON(cmnd->parent_req != NULL);
288
289                 spin_lock_bh(&cmnd->rsp_cmd_lock);
290                 list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
291                                 rsp_cmd_list_entry) {
292                         TRACE_CONN_CLOSE_DBG("  rsp %p, "
293                             "ref_cnt %d, net_ref_cnt %d, sg %p",
294                             rsp, atomic_read(&rsp->ref_cnt),
295                             atomic_read(&rsp->net_ref_cnt), rsp->sg);
296                         if (rsp->sg != cmnd->sg && rsp->sg) {
297                                 int i;
298                                 for (i = 0; i < rsp->sg_cnt; i++) {
299                                         TRACE_CONN_CLOSE_DBG("    page %p, "
300                                           "net_priv %p, _count %d",
301                                           sg_page(&rsp->sg[i]),
302                                           sg_page(&rsp->sg[i])->net_priv,
303                                           atomic_read(&sg_page(&rsp->sg[i])->
304                                                 _count));
305                                 }
306                         }
307                 }
308                 spin_unlock_bh(&cmnd->rsp_cmd_lock);
309 #endif /* CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION */
310         }
311         spin_unlock_bh(&conn->cmd_list_lock);
312         return;
313 }
314 #else /* CONFIG_SCST_DEBUG */
315 static void trace_conn_close(struct iscsi_conn *conn) {}
316 #endif /* CONFIG_SCST_DEBUG */
317
318 void iscsi_task_mgmt_affected_cmds_done(struct scst_mgmt_cmd *scst_mcmd)
319 {
320         int fn = scst_mgmt_cmd_get_fn(scst_mcmd);
321         void *priv = scst_mgmt_cmd_get_tgt_priv(scst_mcmd);
322
323         TRACE_MGMT_DBG("scst_mcmd %p, fn %d, priv %p", scst_mcmd, fn, priv);
324
325         switch (fn) {
326         case SCST_NEXUS_LOSS_SESS:
327         case SCST_ABORT_ALL_TASKS_SESS:
328         {
329                 struct iscsi_conn *conn = (struct iscsi_conn *)priv;
330                 struct iscsi_session *sess = conn->session;
331                 struct iscsi_conn *c;
332
333                 mutex_lock(&sess->target->target_mutex);
334
335                 /*
336                  * We can't mark sess as shutting down earlier, because until
337                  * now it might have pending commands. Otherwise, in case of
338                  * reinstatement it might lead to data corruption, because
339                  * commands in the session being reinstated can be executed
340                  * after commands in the new session.
341                  */
342                 sess->sess_shutting_down = 1;
343                 list_for_each_entry(c, &sess->conn_list, conn_list_entry) {
344                         if (!test_bit(ISCSI_CONN_SHUTTINGDOWN, &c->conn_aflags)) {
345                                 sess->sess_shutting_down = 0;
346                                 break;
347                         }
348                 }
349
350                 if (conn->conn_reinst_successor != NULL) {
351                         sBUG_ON(!test_bit(ISCSI_CONN_REINSTATING,
352                                   &conn->conn_reinst_successor->conn_aflags));
353                         conn_reinst_finished(conn->conn_reinst_successor);
354                         conn->conn_reinst_successor = NULL;
355                 } else if (sess->sess_reinst_successor != NULL) {
356                         sess_reinst_finished(sess->sess_reinst_successor);
357                         sess->sess_reinst_successor = NULL;
358                 }
359                 mutex_unlock(&sess->target->target_mutex);
360
361                 complete_all(&conn->ready_to_free);
362                 break;
363         }
364         default:
365                 /* Nothing to do */
366                 break;
367         }
368
369         return;
370 }
371
372 /* No locks */
373 static void close_conn(struct iscsi_conn *conn)
374 {
375         struct iscsi_session *session = conn->session;
376         struct iscsi_target *target = conn->target;
377         typeof(jiffies) start_waiting = jiffies;
378         typeof(jiffies) shut_start_waiting = start_waiting;
379         bool pending_reported = 0, wait_expired = 0, shut_expired = 0;
380         bool reinst;
381
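        /*
         * Waiting stages used below: after CONN_PENDING_TIMEOUT orphaned
         * pending commands are force-freed, after CONN_WAIT_TIMEOUT the send
         * side is shut down, and after CONN_REG_SHUT_TIMEOUT (or
         * CONN_DEL_SHUT_TIMEOUT if the connection is being deleted) the
         * socket is forcibly disconnected.
         */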
382 #define CONN_PENDING_TIMEOUT    ((typeof(jiffies))10*HZ)
383 #define CONN_WAIT_TIMEOUT       ((typeof(jiffies))10*HZ)
384 #define CONN_REG_SHUT_TIMEOUT   ((typeof(jiffies))125*HZ)
385 #define CONN_DEL_SHUT_TIMEOUT   ((typeof(jiffies))10*HZ)
386
387         TRACE_ENTRY();
388
389         TRACE_CONN_CLOSE("Closing connection %p (conn_ref_cnt=%d)", conn,
390                 atomic_read(&conn->conn_ref_cnt));
391
392         iscsi_extracheck_is_rd_thread(conn);
393
394         sBUG_ON(!conn->closing);
395
396         if (conn->active_close) {
397                 /* We want all our already-sent operations to complete */
398                 conn->sock->ops->shutdown(conn->sock, RCV_SHUTDOWN);
399         } else {
400                 conn->sock->ops->shutdown(conn->sock,
401                         RCV_SHUTDOWN|SEND_SHUTDOWN);
402         }
403
404         mutex_lock(&session->target->target_mutex);
405
406         set_bit(ISCSI_CONN_SHUTTINGDOWN, &conn->conn_aflags);
407         reinst = (conn->conn_reinst_successor != NULL);
408
409         mutex_unlock(&session->target->target_mutex);
410
411         if (reinst) {
412                 int rc;
413                 int lun = 0;
414
415                 /* Abort all outstanding commands */
416                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
417                         SCST_ABORT_ALL_TASKS_SESS, (uint8_t *)&lun, sizeof(lun),
418                         SCST_NON_ATOMIC, conn);
419                 if (rc != 0)
420                         PRINT_ERROR("SCST_ABORT_ALL_TASKS_SESS failed %d", rc);
421         } else {
422                 int rc;
423                 int lun = 0;
424
425                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
426                         SCST_NEXUS_LOSS_SESS, (uint8_t *)&lun, sizeof(lun),
427                         SCST_NON_ATOMIC, conn);
428                 if (rc != 0)
429                         PRINT_ERROR("SCST_NEXUS_LOSS_SESS failed %d", rc);
430         }
431
432         if (conn->read_state != RX_INIT_BHS) {
433                 struct iscsi_cmnd *cmnd = conn->read_cmnd;
434
435                 if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
436                         TRACE_DBG("Going to wait for cmnd %p to change state "
437                                 "from RX_CMD", cmnd);
438                 }
439                 wait_event(conn->read_state_waitQ,
440                         cmnd->scst_state != ISCSI_CMD_STATE_RX_CMD);
441
442                 conn->read_cmnd = NULL;
443                 conn->read_state = RX_INIT_BHS;
444                 req_cmnd_release_force(cmnd, 0);
445         }
446
447         conn_abort(conn);
448
449         /* ToDo: not the best way to wait */
450         while (atomic_read(&conn->conn_ref_cnt) != 0) {
451                 mutex_lock(&target->target_mutex);
452                 spin_lock(&session->sn_lock);
453                 if (session->tm_rsp && session->tm_rsp->conn == conn) {
454                         struct iscsi_cmnd *tm_rsp = session->tm_rsp;
455                         TRACE(TRACE_MGMT_MINOR, "Dropping delayed TM rsp %p",
456                                 tm_rsp);
457                         session->tm_rsp = NULL;
458                         session->tm_active--;
459                         WARN_ON(session->tm_active < 0);
460                         spin_unlock(&session->sn_lock);
461                         mutex_unlock(&target->target_mutex);
462
463                         rsp_cmnd_release(tm_rsp);
464                 } else {
465                         spin_unlock(&session->sn_lock);
466                         mutex_unlock(&target->target_mutex);
467                 }
468
469                 /* It's safe to check it without sn_lock */
470                 if (!list_empty(&session->pending_list)) {
471                         TRACE_CONN_CLOSE_DBG("Disposing pending commands on "
472                                 "connection %p (conn_ref_cnt=%d)", conn,
473                                 atomic_read(&conn->conn_ref_cnt));
474
475                         free_pending_commands(conn);
476
477                         if (time_after(jiffies,
478                                 start_waiting + CONN_PENDING_TIMEOUT)) {
479                                 if (!pending_reported) {
480                                         TRACE_CONN_CLOSE("%s",
481                                                 "Pending wait time expired");
482                                         pending_reported = 1;
483                                 }
484                                 free_orphaned_pending_commands(conn);
485                         }
486                 }
487
488                 iscsi_make_conn_wr_active(conn);
489
490                 /* That's for active close only, actually */
491                 if (time_after(jiffies, start_waiting + CONN_WAIT_TIMEOUT) &&
492                     !wait_expired) {
493                         TRACE_CONN_CLOSE("Wait time expired (conn %p, "
494                                 "sk_state %d)",
495                                 conn, conn->sock->sk->sk_state);
496                         conn->sock->ops->shutdown(conn->sock, SEND_SHUTDOWN);
497                         wait_expired = 1;
498                         shut_start_waiting = jiffies;
499                 }
500
501                 if (wait_expired && !shut_expired &&
502                     time_after(jiffies, shut_start_waiting +
503                                 (conn->deleting ? CONN_DEL_SHUT_TIMEOUT :
504                                                   CONN_REG_SHUT_TIMEOUT))) {
505                         TRACE_CONN_CLOSE("Wait time after shutdown expired "
506                                 "(conn %p, sk_state %d)", conn,
507                                 conn->sock->sk->sk_state);
508                         conn->sock->sk->sk_prot->disconnect(conn->sock->sk, 0);
509                         shut_expired = 1;
510                 }
511
512                 if (conn->deleting)
513                         msleep(200);
514                 else
515                         msleep(1000);
516
517                 TRACE_CONN_CLOSE_DBG("conn %p, conn_ref_cnt %d left, "
518                         "wr_state %d, exp_cmd_sn %u",
519                         conn, atomic_read(&conn->conn_ref_cnt),
520                         conn->wr_state, session->exp_cmd_sn);
521
522                 trace_conn_close(conn);
523
524                 iscsi_check_closewait(conn);
525         }
526
527         write_lock_bh(&conn->sock->sk->sk_callback_lock);
528         conn->sock->sk->sk_state_change = conn->old_state_change;
529         conn->sock->sk->sk_data_ready = conn->old_data_ready;
530         conn->sock->sk->sk_write_space = conn->old_write_space;
531         write_unlock_bh(&conn->sock->sk->sk_callback_lock);
532
533         while (1) {
534                 bool t;
535
536                 spin_lock_bh(&iscsi_wr_lock);
537                 t = (conn->wr_state == ISCSI_CONN_WR_STATE_IDLE);
538                 spin_unlock_bh(&iscsi_wr_lock);
539
540                 if (t && (atomic_read(&conn->conn_ref_cnt) == 0))
541                         break;
542
543                 TRACE_CONN_CLOSE_DBG("Waiting for wr thread (conn %p), "
544                         "wr_state %x", conn, conn->wr_state);
545                 msleep(50);
546         }
547
548         wait_for_completion(&conn->ready_to_free);
549
550         TRACE_CONN_CLOSE("Notifying user space about closing connection %p",
551                          conn);
552         event_send(target->tid, session->sid, conn->cid, E_CONN_CLOSE);
553
554 #ifdef CONFIG_SCST_PROC
555         mutex_lock(&target->target_mutex);
556         conn_free(conn);
557         mutex_unlock(&target->target_mutex);
558 #else
559         kobject_put(&conn->iscsi_conn_kobj);
560 #endif
561
562         TRACE_EXIT();
563         return;
564 }
565
566 static int close_conn_thr(void *arg)
567 {
568         struct iscsi_conn *conn = (struct iscsi_conn *)arg;
569
570         TRACE_ENTRY();
571
572 #ifdef CONFIG_SCST_EXTRACHECKS
573         /*
574          * To satisfy iscsi_extracheck_is_rd_thread() in functions called
575          * during connection close. This is safe because at this point conn
576          * can't be used by any other thread.
577          */
578         conn->rd_task = current;
579 #endif
580         close_conn(conn);
581
582         TRACE_EXIT();
583         return 0;
584 }
585
586 /* No locks */
587 static void start_close_conn(struct iscsi_conn *conn)
588 {
589         struct task_struct *t;
590
591         TRACE_ENTRY();
592
593         t = kthread_run(close_conn_thr, conn, "iscsi_conn_cleanup");
594         if (IS_ERR(t)) {
595                 PRINT_ERROR("kthread_run() failed (%ld), closing conn %p "
596                         "directly", PTR_ERR(t), conn);
597                 close_conn(conn);
598         }
599
600         TRACE_EXIT();
601         return;
602 }
603
604 static inline void iscsi_conn_init_read(struct iscsi_conn *conn,
605         void __user *data, size_t len)
606 {
607         conn->read_iov[0].iov_base = data;
608         conn->read_iov[0].iov_len = len;
609         conn->read_msg.msg_iov = conn->read_iov;
610         conn->read_msg.msg_iovlen = 1;
611         conn->read_size = len;
612         return;
613 }
614
615 static void iscsi_conn_prepare_read_ahs(struct iscsi_conn *conn,
616         struct iscsi_cmnd *cmnd)
617 {
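        /*
         * The AHS is received padded to a 4-byte boundary:
         * (ahssize + 3) & -4 rounds up to the next multiple of 4,
         * e.g. an ahssize of 5 gives asize 8.
         */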
618         int asize = (cmnd->pdu.ahssize + 3) & -4;
619
620         /* ToDo: __GFP_NOFAIL ?? */
621         cmnd->pdu.ahs = kmalloc(asize, __GFP_NOFAIL|GFP_KERNEL);
622         sBUG_ON(cmnd->pdu.ahs == NULL);
623         iscsi_conn_init_read(conn, (void __force __user *)cmnd->pdu.ahs, asize);
624         return;
625 }
626
627 static struct iscsi_cmnd *iscsi_get_send_cmnd(struct iscsi_conn *conn)
628 {
629         struct iscsi_cmnd *cmnd = NULL;
630
631         spin_lock_bh(&conn->write_list_lock);
632         if (!list_empty(&conn->write_list)) {
633                 cmnd = list_entry(conn->write_list.next, struct iscsi_cmnd,
634                                 write_list_entry);
635                 cmd_del_from_write_list(cmnd);
636                 cmnd->write_processing_started = 1;
637         }
638         spin_unlock_bh(&conn->write_list_lock);
639
640         return cmnd;
641 }
642
643 /* Returns number of bytes left to receive or <0 for error */
644 static int do_recv(struct iscsi_conn *conn)
645 {
646         int res;
647         mm_segment_t oldfs;
648         struct msghdr msg;
649         int first_len;
650
651         EXTRACHECKS_BUG_ON(conn->read_cmnd == NULL);
652
653         if (unlikely(conn->closing)) {
654                 res = -EIO;
655                 goto out;
656         }
657
658         /*
659          * We suppose that if sock_recvmsg() returned less data than requested,
660          * then next time it will return -EAGAIN, so there's no point in calling
661          * it again.
662          */
663
664 restart:
665         memset(&msg, 0, sizeof(msg));
666         msg.msg_iov = conn->read_msg.msg_iov;
667         msg.msg_iovlen = conn->read_msg.msg_iovlen;
668         first_len = msg.msg_iov->iov_len;
669
670         oldfs = get_fs();
671         set_fs(get_ds());
672         res = sock_recvmsg(conn->sock, &msg, conn->read_size,
673                            MSG_DONTWAIT | MSG_NOSIGNAL);
674         set_fs(oldfs);
675
676         if (res > 0) {
677                 /*
678                  * To save considerable effort and CPU power we assume
679                  * that the TCP functions adjust conn->read_msg.msg_iov
680                  * and conn->read_msg.msg_iovlen by the amount of copied
681                  * data. This BUG_ON is intended to catch it if that
682                  * behavior changes in the future.
683                  */
684                 sBUG_ON((res >= first_len) &&
685                         (conn->read_msg.msg_iov->iov_len != 0));
686                 conn->read_size -= res;
687                 if (conn->read_size != 0) {
688                         if (res >= first_len) {
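                                /*
                                 * Advance past the iov entries that were
                                 * fully consumed. The first entry held
                                 * first_len bytes; the shift by PAGE_SHIFT
                                 * assumes the remaining entries are each one
                                 * page long.
                                 */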
689                                 int done = 1 + ((res - first_len) >> PAGE_SHIFT);
690                                 conn->read_msg.msg_iov += done;
691                                 conn->read_msg.msg_iovlen -= done;
692                         }
693                 }
694                 res = conn->read_size;
695         } else {
696                 switch (res) {
697                 case -EAGAIN:
698                         TRACE_DBG("EAGAIN received for conn %p", conn);
699                         res = conn->read_size;
700                         break;
701                 case -ERESTARTSYS:
702                         TRACE_DBG("ERESTARTSYS received for conn %p", conn);
703                         goto restart;
704                 default:
705                         PRINT_ERROR("sock_recvmsg() failed: %d", res);
706                         mark_conn_closed(conn);
707                         if (res == 0)
708                                 res = -EIO;
709                         break;
710                 }
711         }
712
713 out:
714         TRACE_EXIT_RES(res);
715         return res;
716 }
717
718 static int iscsi_rx_check_ddigest(struct iscsi_conn *conn)
719 {
720         struct iscsi_cmnd *cmnd = conn->read_cmnd;
721         int res;
722
723         res = do_recv(conn);
724         if (res == 0) {
725                 conn->read_state = RX_END;
726
727                 if (cmnd->pdu.datasize <= 16*1024) {
728                         /*
729                          * It's cache hot, so let's compute it inline. The
730                          * choice here is about what will expose more latency:
731                          * possible cache misses or the digest calculation.
732                          */
733                         TRACE_DBG("cmnd %p, opcode %x: checking RX "
734                                 "ddigest inline", cmnd, cmnd_opcode(cmnd));
735                         cmnd->ddigest_checked = 1;
736                         res = digest_rx_data(cmnd);
737                         if (unlikely(res != 0)) {
738                                 mark_conn_closed(conn);
739                                 goto out;
740                         }
741                 } else if (cmnd_opcode(cmnd) == ISCSI_OP_SCSI_CMD) {
742                         cmd_add_on_rx_ddigest_list(cmnd, cmnd);
743                         cmnd_get(cmnd);
744                 } else if (cmnd_opcode(cmnd) != ISCSI_OP_SCSI_DATA_OUT) {
745                         /*
746                          * We could get here only for NOP-Out. The iSCSI RFC
747                          * doesn't specify how to deal with digest errors in
748                          * this case. Is closing the connection correct?
749                          */
750                         TRACE_DBG("cmnd %p, opcode %x: checking NOP RX "
751                                 "ddigest", cmnd, cmnd_opcode(cmnd));
752                         res = digest_rx_data(cmnd);
753                         if (unlikely(res != 0)) {
754                                 mark_conn_closed(conn);
755                                 goto out;
756                         }
757                 }
758         }
759
760 out:
761         return res;
762 }
763
764 /* No locks, conn is rd processing */
765 static void process_read_io(struct iscsi_conn *conn, int *closed)
766 {
767         struct iscsi_cmnd *cmnd = conn->read_cmnd;
768         int res;
769
770         TRACE_ENTRY();
771
772         /* In case of error cmnd will be freed in close_conn() */
773
774         do {
775                 switch (conn->read_state) {
776                 case RX_INIT_BHS:
777                         EXTRACHECKS_BUG_ON(conn->read_cmnd != NULL);
778                         cmnd = cmnd_alloc(conn, NULL);
779                         conn->read_cmnd = cmnd;
780                         iscsi_conn_init_read(cmnd->conn,
781                                 (void __force __user *)&cmnd->pdu.bhs,
782                                 sizeof(cmnd->pdu.bhs));
783                         conn->read_state = RX_BHS;
784                         /* fall through */
785
786                 case RX_BHS:
787                         res = do_recv(conn);
788                         if (res == 0) {
789                                 iscsi_cmnd_get_length(&cmnd->pdu);
790                                 if (cmnd->pdu.ahssize == 0) {
791                                         if ((conn->hdigest_type & DIGEST_NONE) == 0)
792                                                 conn->read_state = RX_INIT_HDIGEST;
793                                         else
794                                                 conn->read_state = RX_CMD_START;
795                                 } else {
796                                         iscsi_conn_prepare_read_ahs(conn, cmnd);
797                                         conn->read_state = RX_AHS;
798                                 }
799                         }
800                         break;
801
802                 case RX_CMD_START:
803                         res = cmnd_rx_start(cmnd);
804                         if (res == 0) {
805                                 if (cmnd->pdu.datasize == 0)
806                                         conn->read_state = RX_END;
807                                 else
808                                         conn->read_state = RX_DATA;
809                         } else if (res > 0)
810                                 conn->read_state = RX_CMD_CONTINUE;
811                         else
812                                 sBUG_ON(!conn->closing);
813                         break;
814
815                 case RX_CMD_CONTINUE:
816                         if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
817                                 TRACE_DBG("cmnd %p is still in RX_CMD state",
818                                         cmnd);
819                                 res = 1;
820                                 break;
821                         }
822                         res = cmnd_rx_continue(cmnd);
823                         if (unlikely(res != 0))
824                                 sBUG_ON(!conn->closing);
825                         else {
826                                 if (cmnd->pdu.datasize == 0)
827                                         conn->read_state = RX_END;
828                                 else
829                                         conn->read_state = RX_DATA;
830                         }
831                         break;
832
833                 case RX_DATA:
834                         res = do_recv(conn);
835                         if (res == 0) {
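                                /*
                                 * The data segment is padded to a 4-byte
                                 * boundary; psz is the number of pad bytes
                                 * still to be read into the dummy rpadding
                                 * buffer.
                                 */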
836                                 int psz = ((cmnd->pdu.datasize + 3) & -4) - cmnd->pdu.datasize;
837                                 if (psz != 0) {
838                                         TRACE_DBG("padding %d bytes", psz);
839                                         iscsi_conn_init_read(conn,
840                                                 (void __force __user *)&conn->rpadding, psz);
841                                         conn->read_state = RX_PADDING;
842                                 } else if ((conn->ddigest_type & DIGEST_NONE) != 0)
843                                         conn->read_state = RX_END;
844                                 else
845                                         conn->read_state = RX_INIT_DDIGEST;
846                         }
847                         break;
848
849                 case RX_END:
850                         res = 0;
851                         if (unlikely(conn->read_size != 0)) {
852                                 PRINT_CRIT_ERROR("%d %x %d", res,
853                                         cmnd_opcode(cmnd), conn->read_size);
854                                 sBUG();
855                         }
856                         conn->read_cmnd = NULL;
857                         conn->read_state = RX_INIT_BHS;
858
859                         cmnd_rx_end(cmnd);
860
861                         EXTRACHECKS_BUG_ON(conn->read_size != 0);
862                         break;
863
864                 case RX_INIT_HDIGEST:
865                         iscsi_conn_init_read(conn,
866                                 (void __force __user *)&cmnd->hdigest, sizeof(u32));
867                         conn->read_state = RX_CHECK_HDIGEST;
868                         /* fall through */
869
870                 case RX_CHECK_HDIGEST:
871                         res = do_recv(conn);
872                         if (res == 0) {
873                                 res = digest_rx_header(cmnd);
874                                 if (unlikely(res != 0)) {
875                                         PRINT_ERROR("rx header digest for "
876                                                 "initiator %s failed (%d)",
877                                                 conn->session->initiator_name,
878                                                 res);
879                                         mark_conn_closed(conn);
880                                 } else
881                                         conn->read_state = RX_CMD_START;
882                         }
883                         break;
884
885                 case RX_INIT_DDIGEST:
886                         iscsi_conn_init_read(conn,
887                                 (void __force __user *)&cmnd->ddigest,
888                                 sizeof(u32));
889                         conn->read_state = RX_CHECK_DDIGEST;
890                         /* fall through */
891
892                 case RX_CHECK_DDIGEST:
893                         res = iscsi_rx_check_ddigest(conn);
894                         break;
895
896                 case RX_AHS:
897                         res = do_recv(conn);
898                         if (res == 0) {
899                                 if ((conn->hdigest_type & DIGEST_NONE) == 0)
900                                         conn->read_state = RX_INIT_HDIGEST;
901                                 else
902                                         conn->read_state = RX_CMD_START;
903                         }
904                         break;
905
906                 case RX_PADDING:
907                         res = do_recv(conn);
908                         if (res == 0) {
909                                 if ((conn->ddigest_type & DIGEST_NONE) == 0)
910                                         conn->read_state = RX_INIT_DDIGEST;
911                                 else
912                                         conn->read_state = RX_END;
913                         }
914                         break;
915
916                 default:
917                         PRINT_CRIT_ERROR("%d %x", conn->read_state, cmnd_opcode(cmnd));
918                         res = -1; /* to keep compiler happy */
919                         sBUG();
920                 }
921         } while (res == 0);
922
923         if (unlikely(conn->closing)) {
924                 start_close_conn(conn);
925                 *closed = 1;
926         }
927
928         TRACE_EXIT();
929         return;
930 }
931
932 /*
933  * Called under iscsi_rd_lock and BHs disabled, but will drop it inside,
934  * then reacquire it.
935  */
936 static void scst_do_job_rd(void)
937         __acquires(&iscsi_rd_lock)
938         __releases(&iscsi_rd_lock)
939 {
940         TRACE_ENTRY();
941
942         /*
943          * We delete/add to tail connections to maintain fairness between them.
944          */
945
946         while (!list_empty(&iscsi_rd_list)) {
947                 int closed = 0;
948                 struct iscsi_conn *conn = list_entry(iscsi_rd_list.next,
949                         typeof(*conn), rd_list_entry);
950
951                 list_del(&conn->rd_list_entry);
952
953                 sBUG_ON(conn->rd_state == ISCSI_CONN_RD_STATE_PROCESSING);
954                 conn->rd_data_ready = 0;
955                 conn->rd_state = ISCSI_CONN_RD_STATE_PROCESSING;
956 #ifdef CONFIG_SCST_EXTRACHECKS
957                 conn->rd_task = current;
958 #endif
959                 spin_unlock_bh(&iscsi_rd_lock);
960
961                 process_read_io(conn, &closed);
962
963                 spin_lock_bh(&iscsi_rd_lock);
964
965                 if (closed)
966                         continue;
967
968 #ifdef CONFIG_SCST_EXTRACHECKS
969                 conn->rd_task = NULL;
970 #endif
971                 if (conn->rd_data_ready) {
972                         list_add_tail(&conn->rd_list_entry, &iscsi_rd_list);
973                         conn->rd_state = ISCSI_CONN_RD_STATE_IN_LIST;
974                 } else
975                         conn->rd_state = ISCSI_CONN_RD_STATE_IDLE;
976         }
977
978         TRACE_EXIT();
979         return;
980 }
981
982 static inline int test_rd_list(void)
983 {
984         int res = !list_empty(&iscsi_rd_list) ||
985                   unlikely(kthread_should_stop());
986         return res;
987 }
988
989 int istrd(void *arg)
990 {
991         TRACE_ENTRY();
992
993         PRINT_INFO("Read thread started, PID %d", current->pid);
994
995         current->flags |= PF_NOFREEZE;
996
997         spin_lock_bh(&iscsi_rd_lock);
998         while (!kthread_should_stop()) {
999                 wait_queue_t wait;
1000                 init_waitqueue_entry(&wait, current);
1001
1002                 if (!test_rd_list()) {
1003                         add_wait_queue_exclusive_head(&iscsi_rd_waitQ, &wait);
1004                         for (;;) {
1005                                 set_current_state(TASK_INTERRUPTIBLE);
1006                                 if (test_rd_list())
1007                                         break;
1008                                 spin_unlock_bh(&iscsi_rd_lock);
1009                                 schedule();
1010                                 spin_lock_bh(&iscsi_rd_lock);
1011                         }
1012                         set_current_state(TASK_RUNNING);
1013                         remove_wait_queue(&iscsi_rd_waitQ, &wait);
1014                 }
1015                 scst_do_job_rd();
1016         }
1017         spin_unlock_bh(&iscsi_rd_lock);
1018
1019         /*
1020          * If kthread_should_stop() is true, we are guaranteed to be
1021          * in the module unload path, so iscsi_rd_list must be empty.
1022          */
1023         sBUG_ON(!list_empty(&iscsi_rd_list));
1024
1025         PRINT_INFO("Read thread PID %d finished", current->pid);
1026
1027         TRACE_EXIT();
1028         return 0;
1029 }
1030
1031 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
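/*
 * Zero-copy transmit accounting: each page handed to sendpage() has its
 * net_priv set to the owning command, and net_ref_cnt counts the outstanding
 * network references to that command. The command is only put once the
 * network stack has dropped all of its page references.
 */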
1032 static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd)
1033 {
1034         int v;
1035
1036         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d",
1037                 cmd, atomic_read(&cmd->net_ref_cnt)+1);
1038
1039         v = atomic_inc_return(&cmd->net_ref_cnt);
1040         if (v == 1) {
1041                 TRACE_NET_PAGE("getting cmd %p", cmd);
1042                 cmnd_get(cmd);
1043         }
1044         return;
1045 }
1046
1047 void iscsi_get_page_callback(struct page *page)
1048 {
1049         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1050
1051         TRACE_NET_PAGE("page %p, _count %d", page,
1052                 atomic_read(&page->_count));
1053
1054         __iscsi_get_page_callback(cmd);
1055         return;
1056 }
1057
1058 static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd)
1059 {
1060         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d", cmd,
1061                 atomic_read(&cmd->net_ref_cnt)-1);
1062
1063         if (atomic_dec_and_test(&cmd->net_ref_cnt)) {
1064                 int i, sg_cnt = cmd->sg_cnt;
1065                 for (i = 0; i < sg_cnt; i++) {
1066                         struct page *page = sg_page(&cmd->sg[i]);
1067                         TRACE_NET_PAGE("Clearing page %p", page);
1068                         if (page->net_priv == cmd)
1069                                 page->net_priv = NULL;
1070                 }
1071                 cmnd_put(cmd);
1072         }
1073         return;
1074 }
1075
1076 void iscsi_put_page_callback(struct page *page)
1077 {
1078         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1079
1080         TRACE_NET_PAGE("page %p, _count %d", page,
1081                 atomic_read(&page->_count));
1082
1083         __iscsi_put_page_callback(cmd);
1084         return;
1085 }
1086
1087 static void check_net_priv(struct iscsi_cmnd *cmd, struct page *page)
1088 {
1089         if ((atomic_read(&cmd->net_ref_cnt) == 1) && (page->net_priv == cmd)) {
1090                 TRACE_DBG("sendpage() did not call get_page(), zeroing net_priv "
1091                         "%p (page %p)", page->net_priv, page);
1092                 page->net_priv = NULL;
1093         }
1094         return;
1095 }
1096 #else
1097 static inline void check_net_priv(struct iscsi_cmnd *cmd, struct page *page) {}
1098 static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd) {}
1099 static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd) {}
1100 #endif
1101
1102 /* This is partially taken from the Ardis code. */
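/*
 * write_data() first flushes any queued header iovecs with vfs_writev(),
 * then transmits the payload scatterlist page by page via sendpage()
 * (falling back to sock_no_sendpage() where zero-copy isn't safe).
 */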
1103 static int write_data(struct iscsi_conn *conn)
1104 {
1105         mm_segment_t oldfs;
1106         struct file *file;
1107         struct iovec *iop;
1108         struct socket *sock;
1109         ssize_t (*sock_sendpage)(struct socket *, struct page *, int, size_t,
1110                                  int);
1111         ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
1112         struct iscsi_cmnd *write_cmnd = conn->write_cmnd;
1113         struct iscsi_cmnd *ref_cmd;
1114         struct page *page;
1115         struct scatterlist *sg;
1116         int saved_size, size, sendsize;
1117         int length, offset, idx;
1118         int flags, res, count, sg_size;
1119         bool do_put = false, ref_cmd_to_parent;
1120
1121         TRACE_ENTRY();
1122
1123         iscsi_extracheck_is_wr_thread(conn);
1124
1125         if (write_cmnd->own_sg == 0) {
1126                 ref_cmd = write_cmnd->parent_req;
1127                 ref_cmd_to_parent = true;
1128         } else {
1129                 ref_cmd = write_cmnd;
1130                 ref_cmd_to_parent = false;
1131         }
1132
1133         if (!ref_cmd->on_written_list) {
1134                 TRACE_DBG("Adding cmd %p to conn %p written_list", ref_cmd,
1135                         conn);
1136                 spin_lock_bh(&conn->write_list_lock);
1137                 ref_cmd->on_written_list = 1;
1138                 ref_cmd->write_timeout = jiffies + ISCSI_RSP_TIMEOUT;
1139                 list_add_tail(&ref_cmd->written_list_entry,
1140                         &conn->written_list);
1141                 spin_unlock_bh(&conn->write_list_lock);
1142         }
1143
1144         if (!timer_pending(&conn->rsp_timer)) {
1145                 sBUG_ON(!ref_cmd->on_written_list);
1146                 spin_lock_bh(&conn->write_list_lock);
1147                 if (likely(!timer_pending(&conn->rsp_timer))) {
1148                         TRACE_DBG("Starting timer on %ld (conn %p)",
1149                                 ref_cmd->write_timeout, conn);
1150                         conn->rsp_timer.expires = ref_cmd->write_timeout;
1151                         add_timer(&conn->rsp_timer);
1152                 }
1153                 spin_unlock_bh(&conn->write_list_lock);
1154         }
1155
1156         file = conn->file;
1157         size = conn->write_size;
1158         saved_size = size;
1159         iop = conn->write_iop;
1160         count = conn->write_iop_used;
1161
1162         if (iop) {
1163                 while (1) {
1164                         loff_t off = 0;
1165                         int rest;
1166
1167                         sBUG_ON(count > (signed)(sizeof(conn->write_iov) /
1168                                                 sizeof(conn->write_iov[0])));
1169 retry:
1170                         oldfs = get_fs();
1171                         set_fs(KERNEL_DS);
1172                         res = vfs_writev(file,
1173                                          (struct iovec __force __user *)iop,
1174                                          count, &off);
1175                         set_fs(oldfs);
1176                         TRACE_WRITE("sid %#Lx, cid %u, res %d, iov_len %ld",
1177                                     (long long unsigned int)conn->session->sid,
1178                                     conn->cid, res, (long)iop->iov_len);
1179                         if (unlikely(res <= 0)) {
1180                                 if (res == -EAGAIN) {
1181                                         conn->write_iop = iop;
1182                                         conn->write_iop_used = count;
1183                                         goto out_iov;
1184                                 } else if (res == -EINTR)
1185                                         goto retry;
1186                                 goto out_err;
1187                         }
1188
1189                         rest = res;
1190                         size -= res;
1191                         while ((typeof(rest))iop->iov_len <= rest && rest) {
1192                                 rest -= iop->iov_len;
1193                                 iop++;
1194                                 count--;
1195                         }
1196                         if (count == 0) {
1197                                 conn->write_iop = NULL;
1198                                 conn->write_iop_used = 0;
1199                                 if (size)
1200                                         break;
1201                                 goto out_iov;
1202                         }
1203                         sBUG_ON(iop > conn->write_iov + sizeof(conn->write_iov)
1204                                                   /sizeof(conn->write_iov[0]));
1205                         iop->iov_base += rest;
1206                         iop->iov_len -= rest;
1207                 }
1208         }
1209
1210         sg = write_cmnd->sg;
1211         if (unlikely(sg == NULL)) {
1212                 PRINT_INFO("WARNING: Data missed (cmd %p)!", write_cmnd);
1213                 res = 0;
1214                 goto out;
1215         }
1216
1217         /* To protect against a too-early transfer completion race */
1218         __iscsi_get_page_callback(ref_cmd);
1219         do_put = true;
1220
1221         sock = conn->sock;
1222
1223 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1224         sock_sendpage = sock->ops->sendpage;
1225 #else
1226         if ((write_cmnd->parent_req->scst_cmd != NULL) &&
1227             scst_cmd_get_dh_data_buff_alloced(write_cmnd->parent_req->scst_cmd))
1228                 sock_sendpage = sock_no_sendpage;
1229         else
1230                 sock_sendpage = sock->ops->sendpage;
1231 #endif
1232
1233         flags = MSG_DONTWAIT;
1234         sg_size = size;
1235
1236         if (sg != write_cmnd->rsp_sg) {
1237                 offset = conn->write_offset + sg[0].offset;
1238                 idx = offset >> PAGE_SHIFT;
1239                 offset &= ~PAGE_MASK;
1240                 length = min(size, (int)PAGE_SIZE - offset);
1241                 TRACE_WRITE("write_offset %d, sg_size %d, idx %d, offset %d, "
1242                         "length %d", conn->write_offset, sg_size, idx, offset,
1243                         length);
1244         } else {
1245                 idx = 0;
1246                 offset = conn->write_offset;
1247                 while (offset >= sg[idx].length) {
1248                         offset -= sg[idx].length;
1249                         idx++;
1250                 }
1251                 length = sg[idx].length - offset;
1252                 offset += sg[idx].offset;
1253                 sock_sendpage = sock_no_sendpage;
1254                 TRACE_WRITE("rsp_sg: write_offset %d, sg_size %d, idx %d, "
1255                         "offset %d, length %d", conn->write_offset, sg_size,
1256                         idx, offset, length);
1257         }
1258         page = sg_page(&sg[idx]);
1259
1260         while (1) {
1261                 sendpage = sock_sendpage;
1262
1263 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1264                 {
1265                         static DEFINE_SPINLOCK(net_priv_lock);
1266                         spin_lock(&net_priv_lock);
1267                         if (unlikely(page->net_priv != NULL)) {
1268                                 if (page->net_priv != ref_cmd) {
1269                                         /*
1270                                          * This might happen if user space
1271                                          * supplies to scst_user the same
1272                                          * pages in different commands or in
1273                                          * case of zero-copy FILEIO, when
1274                                          * several initiators request the same
1275                                          * data simultaneously.
1276                                          */
1277                                         TRACE_DBG("net_priv isn't NULL and != "
1278                                             "ref_cmd (write_cmnd %p, ref_cmd "
1279                                             "%p, sg %p, idx %d, page %p, "
1280                                             "net_priv %p)",
1281                                             write_cmnd, ref_cmd, sg, idx,
1282                                             page, page->net_priv);
1283                                         sendpage = sock_no_sendpage;
1284                                 }
1285                         } else
1286                                 page->net_priv = ref_cmd;
1287                         spin_unlock(&net_priv_lock);
1288                 }
1289 #endif
1290                 sendsize = min(size, length);
1291                 if (size <= sendsize) {
1292 retry2:
1293                         res = sendpage(sock, page, offset, size, flags);
1294                         TRACE_WRITE("Final %s sid %#Lx, cid %u, res %d (page "
1295                                 "index %lu, offset %u, size %u, cmd %p, "
1296                                 "page %p)", (sendpage != sock_no_sendpage) ?
1297                                                 "sendpage" : "sock_no_sendpage",
1298                                 (long long unsigned int)conn->session->sid,
1299                                 conn->cid, res, page->index,
1300                                 offset, size, write_cmnd, page);
1301                         if (unlikely(res <= 0)) {
1302                                 if (res == -EINTR)
1303                                         goto retry2;
1304                                 else
1305                                         goto out_res;
1306                         }
1307
1308                         check_net_priv(ref_cmd, page);
1309                         if (res == size) {
1310                                 conn->write_size = 0;
1311                                 res = saved_size;
1312                                 goto out_put;
1313                         }
1314
1315                         offset += res;
1316                         size -= res;
1317                         goto retry2;
1318                 }
1319
1320 retry1:
1321                 res = sendpage(sock, page, offset, sendsize, flags | MSG_MORE);
1322                 TRACE_WRITE("%s sid %#Lx, cid %u, res %d (page index %lu, "
1323                         "offset %u, sendsize %u, size %u, cmd %p, page %p)",
1324                         (sendpage != sock_no_sendpage) ? "sendpage" :
1325                                                          "sock_no_sendpage",
1326                         (unsigned long long)conn->session->sid, conn->cid,
1327                         res, page->index, offset, sendsize, size,
1328                         write_cmnd, page);
1329                 if (unlikely(res <= 0)) {
1330                         if (res == -EINTR)
1331                                 goto retry1;
1332                         else
1333                                 goto out_res;
1334                 }
1335
1336                 check_net_priv(ref_cmd, page);
1337
1338                 size -= res;
1339
1340                 if (res == sendsize) {
1341                         idx++;
1342                         EXTRACHECKS_BUG_ON(idx >= ref_cmd->sg_cnt);
1343                         page = sg_page(&sg[idx]);
1344                         length = sg[idx].length;
1345                         offset = sg[idx].offset;
1346                 } else {
1347                         offset += res;
1348                         sendsize -= res;
1349                         goto retry1;
1350                 }
1351         }
1352
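/*
 * Record the progress made: conn->write_offset is advanced by what was
 * consumed and conn->write_size is set to what is still left of this PDU.
 * Return the number of bytes consumed, or -EAGAIN if nothing at all could
 * be sent.
 */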
1353 out_off:
1354         conn->write_offset += sg_size - size;
1355
1356 out_iov:
1357         conn->write_size = size;
1358         if ((saved_size == size) && res == -EAGAIN)
1359                 goto out_put;
1360
1361         res = saved_size - size;
1362
1363 out_put:
1364         if (do_put)
1365                 __iscsi_put_page_callback(ref_cmd);
1366
1367 out:
1368         TRACE_EXIT_RES(res);
1369         return res;
1370
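/*
 * Send failed: -EAGAIN only means the socket buffer is full, so save the
 * progress and let the caller retry later; any other error falls through
 * to out_err, which reports delivery failure to SCST.
 */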
1371 out_res:
1372         check_net_priv(ref_cmd, page);
1373         if (res == -EAGAIN)
1374                 goto out_off;
1375                         /* else fall through to out_err */
1376
1377 out_err:
1378 #ifndef CONFIG_SCST_DEBUG
1379         if (!conn->closing)
1380 #endif
1381         {
1382                 PRINT_ERROR("error %d at sid:cid %#Lx:%u, cmnd %p", res,
1383                             (long long unsigned int)conn->session->sid,
1384                             conn->cid, conn->write_cmnd);
1385         }
1386         if (ref_cmd_to_parent &&
1387             ((ref_cmd->scst_cmd != NULL) || (ref_cmd->scst_aen != NULL))) {
1388                 if (ref_cmd->scst_state == ISCSI_CMD_STATE_AEN)
1389                         scst_set_aen_delivery_status(ref_cmd->scst_aen,
1390                                 SCST_AEN_RES_FAILED);
1391                 else
1392                         scst_set_delivery_status(ref_cmd->scst_cmd,
1393                                 SCST_CMD_DELIVERY_FAILED);
1394         }
1395         goto out_put;
1396 }
1397
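/*
 * Translate a send error for the write state machine: -EAGAIN and
 * -ERESTARTSYS are transient, so the transfer is simply paused (0 is
 * returned); any other error closes the connection.
 */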
1398 static int exit_tx(struct iscsi_conn *conn, int res)
1399 {
1400         iscsi_extracheck_is_wr_thread(conn);
1401
1402         switch (res) {
1403         case -EAGAIN:
1404         case -ERESTARTSYS:
1405                 res = 0;
1406                 break;
1407         default:
1408 #ifndef CONFIG_SCST_DEBUG
1409                 if (!conn->closing)
1410 #endif
1411                 {
1412                         PRINT_ERROR("Sending data failed: initiator %s, "
1413                                 "write_size %d, write_state %d, res %d",
1414                                 conn->session->initiator_name,
1415                                 conn->write_size,
1416                                 conn->write_state, res);
1417                 }
1418                 conn->write_state = TX_END;
1419                 conn->write_size = 0;
1420                 mark_conn_closed(conn);
1421                 break;
1422         }
1423         return res;
1424 }
1425
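/*
 * Send the 4-byte data digest. conn->write_size holds the number of digest
 * bytes still unsent, so the iovec starts at the not yet sent tail of
 * cmnd->ddigest; partial sends just shrink write_size and are retried later.
 */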
1426 static int tx_ddigest(struct iscsi_cmnd *cmnd, int state)
1427 {
1428         int res, rest = cmnd->conn->write_size;
1429         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1430         struct kvec iov;
1431
1432         iscsi_extracheck_is_wr_thread(cmnd->conn);
1433
1434         TRACE_DBG("Sending data digest %x (cmd %p)", cmnd->ddigest, cmnd);
1435
1436         iov.iov_base = (char *)(&cmnd->ddigest) + (sizeof(u32) - rest);
1437         iov.iov_len = rest;
1438
1439         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1440         if (res > 0) {
1441                 cmnd->conn->write_size -= res;
1442                 if (!cmnd->conn->write_size)
1443                         cmnd->conn->write_state = state;
1444         } else
1445                 res = exit_tx(cmnd->conn, res);
1446
1447         return res;
1448 }
1449
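/*
 * Compute the header digest and append it as one more entry to the write
 * iovec array, bumping conn->write_size so that the regular send path
 * transmits it together with the PDU header.
 */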
1450 static void init_tx_hdigest(struct iscsi_cmnd *cmnd)
1451 {
1452         struct iscsi_conn *conn = cmnd->conn;
1453         struct iovec *iop;
1454
1455         iscsi_extracheck_is_wr_thread(conn);
1456
1457         digest_tx_header(cmnd);
1458
1459         sBUG_ON(conn->write_iop_used >=
1460                 (signed)ARRAY_SIZE(conn->write_iov));
1461
1462         iop = &conn->write_iop[conn->write_iop_used];
1463         conn->write_iop_used++;
1464         iop->iov_base = (void __force __user *)&(cmnd->hdigest);
1465         iop->iov_len = sizeof(u32);
1466         conn->write_size += sizeof(u32);
1467
1468         return;
1469 }
1470
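/*
 * Send the zero pad bytes (at most 3) that round the data segment up to a
 * 4-byte boundary; partial sends are handled the same way as in
 * tx_ddigest().
 */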
1471 static int tx_padding(struct iscsi_cmnd *cmnd, int state)
1472 {
1473         int res, rest = cmnd->conn->write_size;
1474         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1475         struct kvec iov;
1476         static const uint32_t padding;
1477
1478         iscsi_extracheck_is_wr_thread(cmnd->conn);
1479
1480         TRACE_DBG("Sending %d padding bytes (cmd %p)", rest, cmnd);
1481
1482         iov.iov_base = (char *)(&padding) + (sizeof(uint32_t) - rest);
1483         iov.iov_len = rest;
1484
1485         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1486         if (res > 0) {
1487                 cmnd->conn->write_size -= res;
1488                 if (!cmnd->conn->write_size)
1489                         cmnd->conn->write_state = state;
1490         } else
1491                 res = exit_tx(cmnd->conn, res);
1492
1493         return res;
1494 }
1495
1496 static int iscsi_do_send(struct iscsi_conn *conn, int state)
1497 {
1498         int res;
1499
1500         iscsi_extracheck_is_wr_thread(conn);
1501
1502         res = write_data(conn);
1503         if (res > 0) {
1504                 if (!conn->write_size)
1505                         conn->write_state = state;
1506         } else
1507                 res = exit_tx(conn, res);
1508
1509         return res;
1510 }
1511
1512 /*
1513  * No locks, conn is wr processing.
1514  *
1515  * IMPORTANT! Connection conn must be protected by additional conn_get()
1516  * upon entrance in this function, because otherwise it could be destroyed
1517  * inside as a result of cmnd release.
1518  */
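/*
 * The reference caller is scst_do_job_wr(), which takes conn_get() before
 * calling this function via process_write_queue() and drops the reference
 * with conn_put() afterwards.
 */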
1519 int iscsi_send(struct iscsi_conn *conn)
1520 {
1521         struct iscsi_cmnd *cmnd = conn->write_cmnd;
1522         int ddigest, res = 0;
1523
1524         TRACE_ENTRY();
1525
1526         TRACE_DBG("conn %p, write_cmnd %p", conn, cmnd);
1527
1528         iscsi_extracheck_is_wr_thread(conn);
1529
1530         ddigest = conn->ddigest_type != DIGEST_NONE ? 1 : 0;
1531
1532         switch (conn->write_state) {
1533         case TX_INIT:
1534                 sBUG_ON(cmnd != NULL);
1535                 cmnd = conn->write_cmnd = iscsi_get_send_cmnd(conn);
1536                 if (!cmnd)
1537                         goto out;
1538                 cmnd_tx_start(cmnd);
1539                 if (!(conn->hdigest_type & DIGEST_NONE))
1540                         init_tx_hdigest(cmnd);
1541                 conn->write_state = TX_BHS_DATA;
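                /* fall through to start sending the just-prepared PDU */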
1542         case TX_BHS_DATA:
1543                 res = iscsi_do_send(conn, cmnd->pdu.datasize ?
1544                                         TX_INIT_PADDING : TX_END);
1545                 if (res <= 0 || conn->write_state != TX_INIT_PADDING)
1546                         break;
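                /* else fall through to set up the padding */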
1547         case TX_INIT_PADDING:
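                /*
                 * Data segments are padded to a 4-byte boundary; compute how
                 * many pad bytes (0-3) still need to be sent.
                 */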
1548                 cmnd->conn->write_size = ((cmnd->pdu.datasize + 3) & -4) -
1549                                                 cmnd->pdu.datasize;
1550                 if (cmnd->conn->write_size != 0)
1551                         conn->write_state = TX_PADDING;
1552                 else if (ddigest)
1553                         conn->write_state = TX_INIT_DDIGEST;
1554                 else
1555                         conn->write_state = TX_END;
1556                 break;
1557         case TX_PADDING:
1558                 res = tx_padding(cmnd, ddigest ? TX_INIT_DDIGEST : TX_END);
1559                 if (res <= 0 || conn->write_state != TX_INIT_DDIGEST)
1560                         break;
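                /* else fall through to send the data digest */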
1561         case TX_INIT_DDIGEST:
1562                 cmnd->conn->write_size = sizeof(u32);
1563                 conn->write_state = TX_DDIGEST;
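                /* fall through */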
1564         case TX_DDIGEST:
1565                 res = tx_ddigest(cmnd, TX_END);
1566                 break;
1567         default:
1568                 PRINT_CRIT_ERROR("Unexpected write state: res %d, "
1569                         "write_state %d, opcode %x", res, conn->write_state, cmnd_opcode(cmnd));
1570                 sBUG();
1571         }
1572
1573         if (res == 0)
1574                 goto out;
1575
1576         if (conn->write_state != TX_END)
1577                 goto out;
1578
1579         if (unlikely(conn->write_size)) {
1580                 PRINT_CRIT_ERROR("Residual write_size at TX_END: res %d, "
1581                         "opcode %x, write_size %u", res, cmnd_opcode(cmnd), conn->write_size);
1582                 sBUG();
1583         }
1584         cmnd_tx_end(cmnd);
1585
1586         rsp_cmnd_release(cmnd);
1587
1588         conn->write_cmnd = NULL;
1589         conn->write_state = TX_INIT;
1590
1591 out:
1592         TRACE_EXIT_RES(res);
1593         return res;
1594 }
1595
1596 /* No locks, conn is wr processing.
1597  *
1598  * IMPORTANT! Connection conn must be protected by additional conn_get()
1599  * upon entrance in this function, because otherwise it could be destroyed
1600  * inside as a result of iscsi_send(), which releases sent commands.
1601  */
1602 static int process_write_queue(struct iscsi_conn *conn)
1603 {
1604         int res = 0;
1605
1606         TRACE_ENTRY();
1607
1608         if (likely(test_write_ready(conn)))
1609                 res = iscsi_send(conn);
1610
1611         TRACE_EXIT_RES(res);
1612         return res;
1613 }
1614
1615 /*
1616  * Called under iscsi_wr_lock and BHs disabled, but will drop it inside,
1617  * then reacquire it.
1618  */
1619 static void scst_do_job_wr(void)
1620         __acquires(&iscsi_wr_lock)
1621         __releases(&iscsi_wr_lock)
1622 {
1623         TRACE_ENTRY();
1624
1625         /*
1626          * Connections are taken from the head and re-queued at the tail for fairness.
1627          */
1628
1629         while (!list_empty(&iscsi_wr_list)) {
1630                 int rc;
1631                 struct iscsi_conn *conn = list_entry(iscsi_wr_list.next,
1632                         typeof(*conn), wr_list_entry);
1633
1634                 TRACE_DBG("conn %p, wr_state %x, wr_space_ready %d, "
1635                         "write ready %d", conn, conn->wr_state,
1636                         conn->wr_space_ready, test_write_ready(conn));
1637
1638                 list_del(&conn->wr_list_entry);
1639
1640                 sBUG_ON(conn->wr_state == ISCSI_CONN_WR_STATE_PROCESSING);
1641
1642                 conn->wr_state = ISCSI_CONN_WR_STATE_PROCESSING;
1643                 conn->wr_space_ready = 0;
1644 #ifdef CONFIG_SCST_EXTRACHECKS
1645                 conn->wr_task = current;
1646 #endif
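                /*
                 * Drop iscsi_wr_lock (and re-enable BHs) for the actual send:
                 * the socket send path can sleep. The lock is re-taken below.
                 */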
1647                 spin_unlock_bh(&iscsi_wr_lock);
1648
1649                 conn_get(conn);
1650
1651                 rc = process_write_queue(conn);
1652
1653                 spin_lock_bh(&iscsi_wr_lock);
1654 #ifdef CONFIG_SCST_EXTRACHECKS
1655                 conn->wr_task = NULL;
1656 #endif
1657                 if ((rc == -EAGAIN) && !conn->wr_space_ready) {
1658                         conn->wr_state = ISCSI_CONN_WR_STATE_SPACE_WAIT;
1659                         goto cont;
1660                 }
1661
1662                 if (test_write_ready(conn)) {
1663                         list_add_tail(&conn->wr_list_entry, &iscsi_wr_list);
1664                         conn->wr_state = ISCSI_CONN_WR_STATE_IN_LIST;
1665                 } else
1666                         conn->wr_state = ISCSI_CONN_WR_STATE_IDLE;
1667
1668 cont:
1669                 conn_put(conn);
1670         }
1671
1672         TRACE_EXIT();
1673         return;
1674 }
1675
1676 static inline int test_wr_list(void)
1677 {
1678         int res = !list_empty(&iscsi_wr_list) ||
1679                   unlikely(kthread_should_stop());
1680         return res;
1681 }
1682
1683 int istwr(void *arg)
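/*
 * Main loop of a write thread: sleep on iscsi_wr_waitQ until a connection
 * is queued on iscsi_wr_list or the thread is asked to stop, then process
 * the queued connections in scst_do_job_wr().
 */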
1684 {
1685         TRACE_ENTRY();
1686
1687         PRINT_INFO("Write thread started, PID %d", current->pid);
1688
1689         current->flags |= PF_NOFREEZE;
1690
1691         spin_lock_bh(&iscsi_wr_lock);
1692         while (!kthread_should_stop()) {
1693                 wait_queue_t wait;
1694                 init_waitqueue_entry(&wait, current);
1695
1696                 if (!test_wr_list()) {
1697                         add_wait_queue_exclusive_head(&iscsi_wr_waitQ, &wait);
1698                         for (;;) {
1699                                 set_current_state(TASK_INTERRUPTIBLE);
1700                                 if (test_wr_list())
1701                                         break;
1702                                 spin_unlock_bh(&iscsi_wr_lock);
1703                                 schedule();
1704                                 spin_lock_bh(&iscsi_wr_lock);
1705                         }
1706                         set_current_state(TASK_RUNNING);
1707                         remove_wait_queue(&iscsi_wr_waitQ, &wait);
1708                 }
1709                 scst_do_job_wr();
1710         }
1711         spin_unlock_bh(&iscsi_wr_lock);
1712
1713         /*
1714          * If kthread_should_stop() is true, we are guaranteed to be
1715  * in the middle of module unload, so iscsi_wr_list must be empty.
1716          */
1717         sBUG_ON(!list_empty(&iscsi_wr_list));
1718
1719         PRINT_INFO("Write thread PID %d finished", current->pid);
1720
1721         TRACE_EXIT();
1722         return 0;
1723 }