9f4aab2c3ec5be57e059baf074413ff251bfba5b
[mirror/scst/.git] / iscsi-scst / kernel / nthread.c
1 /*
2  *  Network threads.
3  *
4  *  Copyright (C) 2004 - 2005 FUJITA Tomonori <tomof@acm.org>
5  *  Copyright (C) 2007 - 2009 Vladislav Bolkhovitin
6  *  Copyright (C) 2007 - 2009 ID7 Ltd.
7  *
8  *  This program is free software; you can redistribute it and/or
9  *  modify it under the terms of the GNU General Public License
10  *  as published by the Free Software Foundation.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  */
17
18 #include <linux/sched.h>
19 #include <linux/file.h>
20 #include <linux/kthread.h>
21 #include <asm/ioctls.h>
22 #include <linux/delay.h>
23 #include <net/tcp.h>
24
25 #include "iscsi.h"
26 #include "digest.h"
27
/*
 * States of the per-connection receive (read) state machine, stored in
 * conn->read_state.
 */
enum rx_state {
        /* Must be zero for better "switch" optimization. */
        RX_INIT_BHS = 0,
        RX_BHS,
        RX_CMD_START,
        RX_DATA,
        RX_END,

        RX_CMD_CONTINUE,
        RX_INIT_HDIGEST,
        RX_CHECK_HDIGEST,
        RX_INIT_DDIGEST,
        RX_CHECK_DDIGEST,
        RX_AHS,
        RX_PADDING,
};
43
/*
 * States of the transmit (write) state machine.
 */
enum tx_state {
        /* Must be zero for better "switch" optimization. */
        TX_INIT = 0,
        TX_BHS_DATA,
        TX_INIT_PADDING,
        TX_PADDING,
        TX_INIT_DDIGEST,
        TX_DDIGEST,
        TX_END,
};
53
54 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
55 static void iscsi_check_closewait(struct iscsi_conn *conn)
56 {
57         struct iscsi_cmnd *cmnd;
58
59         TRACE_ENTRY();
60
61         TRACE_CONN_CLOSE_DBG("conn %p, sk_state %d", conn,
62                 conn->sock->sk->sk_state);
63
64         if (conn->sock->sk->sk_state != TCP_CLOSE) {
65                 TRACE_CONN_CLOSE_DBG("conn %p, skipping", conn);
66                 goto out;
67         }
68
69         /*
70          * No data are going to be sent, so all queued buffers can be freed
71          * now. In many cases TCP does that only in close(), but we can't rely
72          * on user space on calling it.
73          */
74
75 again:
76         spin_lock_bh(&conn->cmd_list_lock);
77         list_for_each_entry(cmnd, &conn->cmd_list, cmd_list_entry) {
78                 struct iscsi_cmnd *rsp;
79                 int restart = 0;
80
81                 TRACE_CONN_CLOSE_DBG("cmd %p, scst_state %x, data_waiting %d, "
82                         "ref_cnt %d, parent_req %p, net_ref_cnt %d, sg %p",
83                         cmnd, cmnd->scst_state, cmnd->data_waiting,
84                         atomic_read(&cmnd->ref_cnt), cmnd->parent_req,
85                         atomic_read(&cmnd->net_ref_cnt), cmnd->sg);
86
87                 sBUG_ON(cmnd->parent_req != NULL);
88
89                 if (cmnd->sg != NULL) {
90                         int i;
91
92                         if (cmnd_get_check(cmnd))
93                                 continue;
94
95                         for (i = 0; i < cmnd->sg_cnt; i++) {
96                                 struct page *page = sg_page(&cmnd->sg[i]);
97                                 TRACE_CONN_CLOSE_DBG("page %p, net_priv %p, "
98                                         "_count %d", page, page->net_priv,
99                                         atomic_read(&page->_count));
100
101                                 if (page->net_priv != NULL) {
102                                         if (restart == 0) {
103                                                 spin_unlock_bh(&conn->cmd_list_lock);
104                                                 restart = 1;
105                                         }
106                                         while (page->net_priv != NULL)
107                                                 iscsi_put_page_callback(page);
108                                 }
109                         }
110                         cmnd_put(cmnd);
111
112                         if (restart)
113                                 goto again;
114                 }
115
116                 spin_lock_bh(&cmnd->rsp_cmd_lock);
117                 list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
118                                 rsp_cmd_list_entry) {
119                         TRACE_CONN_CLOSE_DBG("  rsp %p, ref_cnt %d, "
120                                 "net_ref_cnt %d, sg %p",
121                                 rsp, atomic_read(&rsp->ref_cnt),
122                                 atomic_read(&rsp->net_ref_cnt), rsp->sg);
123
124                         if ((rsp->sg != cmnd->sg) && (rsp->sg != NULL)) {
125                                 int i;
126
127                                 if (cmnd_get_check(rsp))
128                                         continue;
129
130                                 for (i = 0; i < rsp->sg_cnt; i++) {
131                                         struct page *page =
132                                                 sg_page(&rsp->sg[i]);
133                                         TRACE_CONN_CLOSE_DBG(
134                                                 "    page %p, net_priv %p, "
135                                                 "_count %d",
136                                                 page, page->net_priv,
137                                                 atomic_read(&page->_count));
138
139                                         if (page->net_priv != NULL) {
140                                                 if (restart == 0) {
141                                                         spin_unlock_bh(&cmnd->rsp_cmd_lock);
142                                                         spin_unlock_bh(&conn->cmd_list_lock);
143                                                         restart = 1;
144                                                 }
145                                                 while (page->net_priv != NULL)
146                                                         iscsi_put_page_callback(page);
147                                         }
148                                 }
149                                 cmnd_put(rsp);
150
151                                 if (restart)
152                                         goto again;
153                         }
154                 }
155                 spin_unlock_bh(&cmnd->rsp_cmd_lock);
156         }
157         spin_unlock_bh(&conn->cmd_list_lock);
158
159 out:
160         TRACE_EXIT();
161         return;
162 }
163 #else
/*
 * No-op variant used when
 * CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION is not defined.
 */
static inline void iscsi_check_closewait(struct iscsi_conn *conn) {}
165 #endif
166
167 static void free_pending_commands(struct iscsi_conn *conn)
168 {
169         struct iscsi_session *session = conn->session;
170         struct list_head *pending_list = &session->pending_list;
171         int req_freed;
172         struct iscsi_cmnd *cmnd;
173
174         spin_lock(&session->sn_lock);
175         do {
176                 req_freed = 0;
177                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
178                         TRACE_CONN_CLOSE_DBG("Pending cmd %p"
179                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
180                                 cmnd, conn, cmnd->pdu.bhs.sn,
181                                 session->exp_cmd_sn);
182                         if ((cmnd->conn == conn) &&
183                             (session->exp_cmd_sn == cmnd->pdu.bhs.sn)) {
184                                 TRACE_CONN_CLOSE_DBG("Freeing pending cmd %p",
185                                         cmnd);
186
187                                 list_del(&cmnd->pending_list_entry);
188                                 cmnd->pending = 0;
189
190                                 session->exp_cmd_sn++;
191
192                                 spin_unlock(&session->sn_lock);
193
194                                 req_cmnd_release_force(cmnd, 0);
195
196                                 req_freed = 1;
197                                 spin_lock(&session->sn_lock);
198                                 break;
199                         }
200                 }
201         } while (req_freed);
202         spin_unlock(&session->sn_lock);
203
204         return;
205 }
206
207 static void free_orphaned_pending_commands(struct iscsi_conn *conn)
208 {
209         struct iscsi_session *session = conn->session;
210         struct list_head *pending_list = &session->pending_list;
211         int req_freed;
212         struct iscsi_cmnd *cmnd;
213
214         spin_lock(&session->sn_lock);
215         do {
216                 req_freed = 0;
217                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
218                         TRACE_CONN_CLOSE_DBG("Pending cmd %p"
219                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
220                                 cmnd, conn, cmnd->pdu.bhs.sn,
221                                 session->exp_cmd_sn);
222                         if (cmnd->conn == conn) {
223                                 PRINT_ERROR("Freeing orphaned pending cmd %p",
224                                             cmnd);
225
226                                 list_del(&cmnd->pending_list_entry);
227                                 cmnd->pending = 0;
228
229                                 if (session->exp_cmd_sn == cmnd->pdu.bhs.sn)
230                                         session->exp_cmd_sn++;
231
232                                 spin_unlock(&session->sn_lock);
233
234                                 req_cmnd_release_force(cmnd, 0);
235
236                                 req_freed = 1;
237                                 spin_lock(&session->sn_lock);
238                                 break;
239                         }
240                 }
241         } while (req_freed);
242         spin_unlock(&session->sn_lock);
243
244         return;
245 }
246
247 #ifdef CONFIG_SCST_DEBUG
248 static void trace_conn_close(struct iscsi_conn *conn)
249 {
250         struct iscsi_cmnd *cmnd;
251 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
252         struct iscsi_cmnd *rsp;
253 #endif
254
255 #if 0
256         if (time_after(jiffies, start_waiting + 10*HZ))
257                 trace_flag |= TRACE_CONN_OC_DBG;
258 #endif
259
260         spin_lock_bh(&conn->cmd_list_lock);
261         list_for_each_entry(cmnd, &conn->cmd_list,
262                         cmd_list_entry) {
263                 TRACE_CONN_CLOSE_DBG(
264                         "cmd %p, scst_state %x, scst_cmd state %d, "
265                         "data_waiting %d, ref_cnt %d, sn %u, "
266                         "parent_req %p, pending %d",
267                         cmnd, cmnd->scst_state,
268                         (cmnd->parent_req && cmnd->scst_cmd) ?
269                                 cmnd->scst_cmd->state : -1,
270                         cmnd->data_waiting, atomic_read(&cmnd->ref_cnt),
271                         cmnd->pdu.bhs.sn, cmnd->parent_req, cmnd->pending);
272 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
273                 TRACE_CONN_CLOSE_DBG("net_ref_cnt %d, sg %p",
274                         atomic_read(&cmnd->net_ref_cnt),
275                         cmnd->sg);
276                 if (cmnd->sg != NULL) {
277                         int i;
278                         for (i = 0; i < cmnd->sg_cnt; i++) {
279                                 struct page *page = sg_page(&cmnd->sg[i]);
280                                 TRACE_CONN_CLOSE_DBG("page %p, "
281                                         "net_priv %p, _count %d",
282                                         page, page->net_priv,
283                                         atomic_read(&page->_count));
284                         }
285                 }
286
287                 sBUG_ON(cmnd->parent_req != NULL);
288
289                 spin_lock_bh(&cmnd->rsp_cmd_lock);
290                 list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
291                                 rsp_cmd_list_entry) {
292                         TRACE_CONN_CLOSE_DBG("  rsp %p, "
293                             "ref_cnt %d, net_ref_cnt %d, sg %p",
294                             rsp, atomic_read(&rsp->ref_cnt),
295                             atomic_read(&rsp->net_ref_cnt), rsp->sg);
296                         if (rsp->sg != cmnd->sg && rsp->sg) {
297                                 int i;
298                                 for (i = 0; i < rsp->sg_cnt; i++) {
299                                         TRACE_CONN_CLOSE_DBG("    page %p, "
300                                           "net_priv %p, _count %d",
301                                           sg_page(&rsp->sg[i]),
302                                           sg_page(&rsp->sg[i])->net_priv,
303                                           atomic_read(&sg_page(&rsp->sg[i])->
304                                                 _count));
305                                 }
306                         }
307                 }
308                 spin_unlock_bh(&cmnd->rsp_cmd_lock);
309 #endif /* CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION */
310         }
311         spin_unlock_bh(&conn->cmd_list_lock);
312         return;
313 }
314 #else /* CONFIG_SCST_DEBUG */
/* No-op variant used when CONFIG_SCST_DEBUG is not defined. */
static void trace_conn_close(struct iscsi_conn *conn)
{
}
316 #endif /* CONFIG_SCST_DEBUG */
317
318 void iscsi_task_mgmt_affected_cmds_done(struct scst_mgmt_cmd *scst_mcmd)
319 {
320         int fn = scst_mgmt_cmd_get_fn(scst_mcmd);
321         void *priv = scst_mgmt_cmd_get_tgt_priv(scst_mcmd);
322
323         TRACE_MGMT_DBG("scst_mcmd %p, fn %d, priv %p", scst_mcmd, fn, priv);
324
325         switch (fn) {
326         case SCST_NEXUS_LOSS_SESS:
327         case SCST_ABORT_ALL_TASKS_SESS:
328         {
329                 struct iscsi_conn *conn = (struct iscsi_conn *)priv;
330                 struct iscsi_session *sess = conn->session;
331                 struct iscsi_conn *c;
332
333                 mutex_lock(&sess->target->target_mutex);
334
335                 /*
336                  * We can't mark sess as shutting down earlier, because until
337                  * now it might have pending commands. Otherwise, in case of
338                  * reinstatement it might lead to data corruption, because
339                  * commands in being reinstated session can be executed
340                  * after commands in the new session.
341                  */
342                 sess->sess_shutting_down = 1;
343                 list_for_each_entry(c, &sess->conn_list, conn_list_entry) {
344                         if (!test_bit(ISCSI_CONN_SHUTTINGDOWN, &c->conn_aflags)) {
345                                 sess->sess_shutting_down = 0;
346                                 break;
347                         }
348                 }
349
350                 if (conn->conn_reinst_successor != NULL) {
351                         sBUG_ON(!test_bit(ISCSI_CONN_REINSTATING,
352                                   &conn->conn_reinst_successor->conn_aflags));
353                         conn_reinst_finished(conn->conn_reinst_successor);
354                         conn->conn_reinst_successor = NULL;
355                 } else if (sess->sess_reinst_successor != NULL) {
356                         sess_reinst_finished(sess->sess_reinst_successor);
357                         sess->sess_reinst_successor = NULL;
358                 }
359                 mutex_unlock(&sess->target->target_mutex);
360
361                 complete_all(&conn->ready_to_free);
362                 break;
363         }
364         default:
365                 /* Nothing to do */
366                 break;
367         }
368
369         return;
370 }
371
372 /* No locks */
373 static void close_conn(struct iscsi_conn *conn)
374 {
375         struct iscsi_session *session = conn->session;
376         struct iscsi_target *target = conn->target;
377         typeof(jiffies) start_waiting = jiffies;
378         typeof(jiffies) shut_start_waiting = start_waiting;
379         bool pending_reported = 0, wait_expired = 0, shut_expired = 0;
380         bool reinst;
381
382 #define CONN_PENDING_TIMEOUT    ((typeof(jiffies))10*HZ)
383 #define CONN_WAIT_TIMEOUT       ((typeof(jiffies))10*HZ)
384 #define CONN_REG_SHUT_TIMEOUT   ((typeof(jiffies))125*HZ)
385 #define CONN_DEL_SHUT_TIMEOUT   ((typeof(jiffies))10*HZ)
386
387         TRACE_ENTRY();
388
389         TRACE_CONN_CLOSE("Closing connection %p (conn_ref_cnt=%d)", conn,
390                 atomic_read(&conn->conn_ref_cnt));
391
392         iscsi_extracheck_is_rd_thread(conn);
393
394         sBUG_ON(!conn->closing);
395
396         if (conn->active_close) {
397                 /* We want all our already send operations to complete */
398                 conn->sock->ops->shutdown(conn->sock, RCV_SHUTDOWN);
399         } else {
400                 conn->sock->ops->shutdown(conn->sock,
401                         RCV_SHUTDOWN|SEND_SHUTDOWN);
402         }
403
404         mutex_lock(&session->target->target_mutex);
405
406         set_bit(ISCSI_CONN_SHUTTINGDOWN, &conn->conn_aflags);
407         reinst = (conn->conn_reinst_successor != NULL);
408
409         mutex_unlock(&session->target->target_mutex);
410
411         if (reinst) {
412                 int rc;
413                 int lun = 0;
414
415                 /* Abort all outstanding commands */
416                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
417                         SCST_ABORT_ALL_TASKS_SESS, (uint8_t *)&lun, sizeof(lun),
418                         SCST_NON_ATOMIC, conn);
419                 if (rc != 0)
420                         PRINT_ERROR("SCST_ABORT_ALL_TASKS_SESS failed %d", rc);
421         } else {
422                 int rc;
423                 int lun = 0;
424
425                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
426                         SCST_NEXUS_LOSS_SESS, (uint8_t *)&lun, sizeof(lun),
427                         SCST_NON_ATOMIC, conn);
428                 if (rc != 0)
429                         PRINT_ERROR("SCST_NEXUS_LOSS_SESS failed %d", rc);
430         }
431
432         if (conn->read_state != RX_INIT_BHS) {
433                 struct iscsi_cmnd *cmnd = conn->read_cmnd;
434
435                 if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
436                         TRACE_DBG("Going to wait for cmnd %p to change state "
437                                 "from RX_CMD", cmnd);
438                 }
439                 wait_event(conn->read_state_waitQ,
440                         cmnd->scst_state != ISCSI_CMD_STATE_RX_CMD);
441
442                 conn->read_cmnd = NULL;
443                 conn->read_state = RX_INIT_BHS;
444                 req_cmnd_release_force(cmnd, 0);
445         }
446
447         conn_abort(conn);
448
449         /* ToDo: not the best way to wait */
450         while (atomic_read(&conn->conn_ref_cnt) != 0) {
451                 mutex_lock(&target->target_mutex);
452                 spin_lock(&session->sn_lock);
453                 if (session->tm_rsp && session->tm_rsp->conn == conn) {
454                         struct iscsi_cmnd *tm_rsp = session->tm_rsp;
455                         TRACE(TRACE_MGMT_MINOR, "Dropping delayed TM rsp %p",
456                                 tm_rsp);
457                         session->tm_rsp = NULL;
458                         session->tm_active--;
459                         WARN_ON(session->tm_active < 0);
460                         spin_unlock(&session->sn_lock);
461                         mutex_unlock(&target->target_mutex);
462
463                         rsp_cmnd_release(tm_rsp);
464                 } else {
465                         spin_unlock(&session->sn_lock);
466                         mutex_unlock(&target->target_mutex);
467                 }
468
469                 /* It's safe to check it without sn_lock */
470                 if (!list_empty(&session->pending_list)) {
471                         TRACE_CONN_CLOSE_DBG("Disposing pending commands on "
472                                 "connection %p (conn_ref_cnt=%d)", conn,
473                                 atomic_read(&conn->conn_ref_cnt));
474
475                         free_pending_commands(conn);
476
477                         if (time_after(jiffies,
478                                 start_waiting + CONN_PENDING_TIMEOUT)) {
479                                 if (!pending_reported) {
480                                         TRACE_CONN_CLOSE("%s",
481                                                 "Pending wait time expired");
482                                         pending_reported = 1;
483                                 }
484                                 free_orphaned_pending_commands(conn);
485                         }
486                 }
487
488                 iscsi_make_conn_wr_active(conn);
489
490                 /* That's for active close only, actually */
491                 if (time_after(jiffies, start_waiting + CONN_WAIT_TIMEOUT) &&
492                     !wait_expired) {
493                         TRACE_CONN_CLOSE("Wait time expired (conn %p, "
494                                 "sk_state %d)",
495                                 conn, conn->sock->sk->sk_state);
496                         conn->sock->ops->shutdown(conn->sock, SEND_SHUTDOWN);
497                         wait_expired = 1;
498                         shut_start_waiting = jiffies;
499                 }
500
501                 if (wait_expired && !shut_expired &&
502                     time_after(jiffies, shut_start_waiting +
503                                 conn->deleting ? CONN_DEL_SHUT_TIMEOUT :
504                                                  CONN_REG_SHUT_TIMEOUT)) {
505                         TRACE_CONN_CLOSE("Wait time after shutdown expired "
506                                 "(conn %p, sk_state %d)", conn,
507                                 conn->sock->sk->sk_state);
508                         conn->sock->sk->sk_prot->disconnect(conn->sock->sk, 0);
509                         shut_expired = 1;
510                 }
511
512                 if (conn->deleting)
513                         msleep(200);
514                 else
515                         msleep(1000);
516
517                 TRACE_CONN_CLOSE_DBG("conn %p, conn_ref_cnt %d left, "
518                         "wr_state %d, exp_cmd_sn %u",
519                         conn, atomic_read(&conn->conn_ref_cnt),
520                         conn->wr_state, session->exp_cmd_sn);
521
522                 trace_conn_close(conn);
523
524                 iscsi_check_closewait(conn);
525         }
526
527         write_lock_bh(&conn->sock->sk->sk_callback_lock);
528         conn->sock->sk->sk_state_change = conn->old_state_change;
529         conn->sock->sk->sk_data_ready = conn->old_data_ready;
530         conn->sock->sk->sk_write_space = conn->old_write_space;
531         write_unlock_bh(&conn->sock->sk->sk_callback_lock);
532
533         while (1) {
534                 bool t;
535
536                 spin_lock_bh(&iscsi_wr_lock);
537                 t = (conn->wr_state == ISCSI_CONN_WR_STATE_IDLE);
538                 spin_unlock_bh(&iscsi_wr_lock);
539
540                 if (t && (atomic_read(&conn->conn_ref_cnt) == 0))
541                         break;
542
543                 TRACE_CONN_CLOSE_DBG("Waiting for wr thread (conn %p), "
544                         "wr_state %x", conn, conn->wr_state);
545                 msleep(50);
546         }
547
548         wait_for_completion(&conn->ready_to_free);
549
550         TRACE_CONN_CLOSE("Notifying user space about closing connection %p",
551                          conn);
552         event_send(target->tid, session->sid, conn->cid, E_CONN_CLOSE, 0);
553
554         mutex_lock(&target->target_mutex);
555
556         conn_free(conn);
557
558         if (list_empty(&session->conn_list)) {
559                 sBUG_ON(session->sess_reinst_successor != NULL);
560                 session_free(session, true);
561         }
562
563         mutex_unlock(&target->target_mutex);
564
565         TRACE_EXIT();
566         return;
567 }
568
/*
 * Thread function that closes the connection passed in arg by calling
 * close_conn(). Always returns 0.
 */
static int close_conn_thr(void *arg)
{
        struct iscsi_conn *conn = arg;

        TRACE_ENTRY();

#ifdef CONFIG_SCST_EXTRACHECKS
        /*
         * Pretend to be the read thread so that
         * iscsi_extracheck_is_rd_thread() is satisfied in functions called
         * during the connection close. It is safe, because at this point
         * conn can't be used by any other thread.
         */
        conn->rd_task = current;
#endif
        close_conn(conn);

        TRACE_EXIT();
        return 0;
}
588
589 /* No locks */
590 static void start_close_conn(struct iscsi_conn *conn)
591 {
592         struct task_struct *t;
593
594         TRACE_ENTRY();
595
596         t = kthread_run(close_conn_thr, conn, "iscsi_conn_cleanup");
597         if (IS_ERR(t)) {
598                 PRINT_ERROR("kthread_run() failed (%ld), closing conn %p "
599                         "directly", PTR_ERR(t), conn);
600                 close_conn(conn);
601         }
602
603         TRACE_EXIT();
604         return;
605 }
606
607 static inline void iscsi_conn_init_read(struct iscsi_conn *conn,
608         void __user *data, size_t len)
609 {
610         conn->read_iov[0].iov_base = data;
611         conn->read_iov[0].iov_len = len;
612         conn->read_msg.msg_iov = conn->read_iov;
613         conn->read_msg.msg_iovlen = 1;
614         conn->read_size = len;
615         return;
616 }
617
618 static void iscsi_conn_prepare_read_ahs(struct iscsi_conn *conn,
619         struct iscsi_cmnd *cmnd)
620 {
621         int asize = (cmnd->pdu.ahssize + 3) & -4;
622
623         /* ToDo: __GFP_NOFAIL ?? */
624         cmnd->pdu.ahs = kmalloc(asize, __GFP_NOFAIL|GFP_KERNEL);
625         sBUG_ON(cmnd->pdu.ahs == NULL);
626         iscsi_conn_init_read(conn, (void __force __user *)cmnd->pdu.ahs, asize);
627         return;
628 }
629
630 static struct iscsi_cmnd *iscsi_get_send_cmnd(struct iscsi_conn *conn)
631 {
632         struct iscsi_cmnd *cmnd = NULL;
633
634         spin_lock_bh(&conn->write_list_lock);
635         if (!list_empty(&conn->write_list)) {
636                 cmnd = list_entry(conn->write_list.next, struct iscsi_cmnd,
637                                 write_list_entry);
638                 cmd_del_from_write_list(cmnd);
639                 cmnd->write_processing_started = 1;
640         }
641         spin_unlock_bh(&conn->write_list_lock);
642
643         return cmnd;
644 }
645
646 /* Returns number of bytes left to receive or <0 for error */
647 static int do_recv(struct iscsi_conn *conn)
648 {
649         int res;
650         mm_segment_t oldfs;
651         struct msghdr msg;
652         int first_len;
653
654         EXTRACHECKS_BUG_ON(conn->read_cmnd == NULL);
655
656         if (unlikely(conn->closing)) {
657                 res = -EIO;
658                 goto out;
659         }
660
661         /*
662          * We suppose that if sock_recvmsg() returned less data than requested,
663          * then next time it will return -EAGAIN, so there's no point to call
664          * it again.
665          */
666
667 restart:
668         memset(&msg, 0, sizeof(msg));
669         msg.msg_iov = conn->read_msg.msg_iov;
670         msg.msg_iovlen = conn->read_msg.msg_iovlen;
671         first_len = msg.msg_iov->iov_len;
672
673         oldfs = get_fs();
674         set_fs(get_ds());
675         res = sock_recvmsg(conn->sock, &msg, conn->read_size,
676                            MSG_DONTWAIT | MSG_NOSIGNAL);
677         set_fs(oldfs);
678
679         if (res > 0) {
680                 /*
681                  * To save some considerable effort and CPU power we
682                  * suppose that TCP functions adjust
683                  * conn->read_msg.msg_iov and conn->read_msg.msg_iovlen
684                  * on amount of copied data. This BUG_ON is intended
685                  * to catch if it is changed in the future.
686                  */
687                 sBUG_ON((res >= first_len) &&
688                         (conn->read_msg.msg_iov->iov_len != 0));
689                 conn->read_size -= res;
690                 if (conn->read_size != 0) {
691                         if (res >= first_len) {
692                                 int done = 1 + ((res - first_len) >> PAGE_SHIFT);
693                                 conn->read_msg.msg_iov += done;
694                                 conn->read_msg.msg_iovlen -= done;
695                         }
696                 }
697                 res = conn->read_size;
698         } else {
699                 switch (res) {
700                 case -EAGAIN:
701                         TRACE_DBG("EAGAIN received for conn %p", conn);
702                         res = conn->read_size;
703                         break;
704                 case -ERESTARTSYS:
705                         TRACE_DBG("ERESTARTSYS received for conn %p", conn);
706                         goto restart;
707                 default:
708                         PRINT_ERROR("sock_recvmsg() failed: %d", res);
709                         mark_conn_closed(conn);
710                         if (res == 0)
711                                 res = -EIO;
712                         break;
713                 }
714         }
715
716 out:
717         TRACE_EXIT_RES(res);
718         return res;
719 }
720
721 static int iscsi_rx_check_ddigest(struct iscsi_conn *conn)
722 {
723         struct iscsi_cmnd *cmnd = conn->read_cmnd;
724         int res;
725
726         res = do_recv(conn);
727         if (res == 0) {
728                 conn->read_state = RX_END;
729
730                 if (cmnd->pdu.datasize <= 16*1024) {
731                         /*
732                          * It's cache hot, so let's compute it inline. The
733                          * choice here about what will expose more latency:
734                          * possible cache misses or the digest calculation.
735                          */
736                         TRACE_DBG("cmnd %p, opcode %x: checking RX "
737                                 "ddigest inline", cmnd, cmnd_opcode(cmnd));
738                         cmnd->ddigest_checked = 1;
739                         res = digest_rx_data(cmnd);
740                         if (unlikely(res != 0)) {
741                                 mark_conn_closed(conn);
742                                 goto out;
743                         }
744                 } else if (cmnd_opcode(cmnd) == ISCSI_OP_SCSI_CMD) {
745                         cmd_add_on_rx_ddigest_list(cmnd, cmnd);
746                         cmnd_get(cmnd);
747                 } else if (cmnd_opcode(cmnd) != ISCSI_OP_SCSI_DATA_OUT) {
748                         /*
749                          * We could get here only for NOP-Out. ISCSI RFC
750                          * doesn't specify how to deal with digest errors in
751                          * this case. Is closing connection correct?
752                          */
753                         TRACE_DBG("cmnd %p, opcode %x: checking NOP RX "
754                                 "ddigest", cmnd, cmnd_opcode(cmnd));
755                         res = digest_rx_data(cmnd);
756                         if (unlikely(res != 0)) {
757                                 mark_conn_closed(conn);
758                                 goto out;
759                         }
760                 }
761         }               
762
763 out:
764         return res;
765 }
766
767 /* No locks, conn is rd processing */
768 static void process_read_io(struct iscsi_conn *conn, int *closed)
769 {
770         struct iscsi_cmnd *cmnd = conn->read_cmnd;
771         int res;
772
773         TRACE_ENTRY();
774
775         /* In case of error cmnd will be freed in close_conn() */
776
777         do {
778                 switch (conn->read_state) {
779                 case RX_INIT_BHS:
780                         EXTRACHECKS_BUG_ON(conn->read_cmnd != NULL);
781                         cmnd = cmnd_alloc(conn, NULL);
782                         conn->read_cmnd = cmnd;
783                         iscsi_conn_init_read(cmnd->conn,
784                                 (void __force __user *)&cmnd->pdu.bhs,
785                                 sizeof(cmnd->pdu.bhs));
786                         conn->read_state = RX_BHS;
787                         /* go through */
788
789                 case RX_BHS:
790                         res = do_recv(conn);
791                         if (res == 0) {
792                                 iscsi_cmnd_get_length(&cmnd->pdu);
793                                 if (cmnd->pdu.ahssize == 0) {
794                                         if ((conn->hdigest_type & DIGEST_NONE) == 0)
795                                                 conn->read_state = RX_INIT_HDIGEST;
796                                         else
797                                                 conn->read_state = RX_CMD_START;
798                                 } else {
799                                         iscsi_conn_prepare_read_ahs(conn, cmnd);
800                                         conn->read_state = RX_AHS;
801                                 } 
802                         }
803                         break;
804
805                 case RX_CMD_START:
806                         res = cmnd_rx_start(cmnd);
807                         if (res == 0) {
808                                 if (cmnd->pdu.datasize == 0)
809                                         conn->read_state = RX_END;
810                                 else
811                                         conn->read_state = RX_DATA;
812                         } else if (res > 0)
813                                 conn->read_state = RX_CMD_CONTINUE;
814                         else
815                                 sBUG_ON(!conn->closing);
816                         break;
817
818                 case RX_CMD_CONTINUE:
819                         if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
820                                 TRACE_DBG("cmnd %p is still in RX_CMD state",
821                                         cmnd);
822                                 res = 1;
823                                 break;
824                         }
825                         res = cmnd_rx_continue(cmnd);
826                         if (unlikely(res != 0))
827                                 sBUG_ON(!conn->closing);
828                         else {
829                                 if (cmnd->pdu.datasize == 0)
830                                         conn->read_state = RX_END;
831                                 else
832                                         conn->read_state = RX_DATA;
833                         }
834                         break;
835
836                 case RX_DATA:
837                         res = do_recv(conn);
838                         if (res == 0) {
839                                 int psz = ((cmnd->pdu.datasize + 3) & -4) - cmnd->pdu.datasize;
840                                 if (psz != 0) {
841                                         TRACE_DBG("padding %d bytes", psz);
842                                         iscsi_conn_init_read(conn,
843                                                 (void __force __user *)&conn->rpadding, psz);
844                                         conn->read_state = RX_PADDING;
845                                 } else if ((conn->ddigest_type & DIGEST_NONE) != 0)
846                                         conn->read_state = RX_END;
847                                 else
848                                         conn->read_state = RX_INIT_DDIGEST;
849                         }
850                         break;
851
852                 case RX_END:
853                         if (unlikely(conn->read_size != 0)) {
854                                 PRINT_CRIT_ERROR("%d %x %d", res,
855                                         cmnd_opcode(cmnd), conn->read_size);
856                                 sBUG();
857                         }
858                         conn->read_cmnd = NULL;
859                         conn->read_state = RX_INIT_BHS;
860
861                         cmnd_rx_end(cmnd);
862
863                         EXTRACHECKS_BUG_ON(conn->read_size != 0);
864                         break;
865
866                 case RX_INIT_HDIGEST:
867                         iscsi_conn_init_read(conn,
868                                 (void __force __user *)&cmnd->hdigest, sizeof(u32));
869                         conn->read_state = RX_CHECK_HDIGEST;
870                         /* go through */
871
872                 case RX_CHECK_HDIGEST:
873                         res = do_recv(conn);
874                         if (res == 0) {
875                                 res = digest_rx_header(cmnd);
876                                 if (unlikely(res != 0)) {
877                                         PRINT_ERROR("rx header digest for "
878                                                 "initiator %s failed (%d)",
879                                                 conn->session->initiator_name,
880                                                 res);
881                                         mark_conn_closed(conn);
882                                 } else
883                                         conn->read_state = RX_CMD_START;
884                         }
885                         break;
886
887                 case RX_INIT_DDIGEST:
888                         iscsi_conn_init_read(conn,
889                                 (void __force __user *)&cmnd->ddigest,
890                                 sizeof(u32));
891                         conn->read_state = RX_CHECK_DDIGEST;
892                         /* go through */
893
894                 case RX_CHECK_DDIGEST:
895                         res = iscsi_rx_check_ddigest(conn);
896                         break;
897
898                 case RX_AHS:
899                         res = do_recv(conn);
900                         if (res == 0) {
901                                 if ((conn->hdigest_type & DIGEST_NONE) == 0)
902                                         conn->read_state = RX_INIT_HDIGEST;
903                                 else
904                                         conn->read_state = RX_CMD_START;
905                         }
906                         break;
907
908                 case RX_PADDING:
909                         res = do_recv(conn);
910                         if (res == 0) {
911                                 if ((conn->ddigest_type & DIGEST_NONE) == 0)
912                                         conn->read_state = RX_INIT_DDIGEST;
913                                 else
914                                         conn->read_state = RX_END;
915                         }
916                         break;
917
918                 default:
919                         PRINT_CRIT_ERROR("%d %x", conn->read_state, cmnd_opcode(cmnd));
920                         sBUG();
921                 }
922         } while (res == 0);
923
924         if (unlikely(conn->closing)) {
925                 start_close_conn(conn);
926                 *closed = 1;
927         }
928
929         TRACE_EXIT();
930         return;
931 }
932
933 /*
934  * Called under iscsi_rd_lock and BHs disabled, but will drop it inside,
935  * then reaquire.
936  */
937 static void scst_do_job_rd(void)
938         __acquires(&iscsi_rd_lock)
939         __releases(&iscsi_rd_lock)
940 {
941         TRACE_ENTRY();
942
943         /*
944          * We delete/add to tail connections to maintain fairness between them.
945          */
946
947         while (!list_empty(&iscsi_rd_list)) {
948                 int closed = 0;
949                 struct iscsi_conn *conn = list_entry(iscsi_rd_list.next,
950                         typeof(*conn), rd_list_entry);
951
952                 list_del(&conn->rd_list_entry);
953
954                 sBUG_ON(conn->rd_state == ISCSI_CONN_RD_STATE_PROCESSING);
955                 conn->rd_data_ready = 0;
956                 conn->rd_state = ISCSI_CONN_RD_STATE_PROCESSING;
957 #ifdef CONFIG_SCST_EXTRACHECKS
958                 conn->rd_task = current;
959 #endif
960                 spin_unlock_bh(&iscsi_rd_lock);
961
962                 process_read_io(conn, &closed);
963
964                 spin_lock_bh(&iscsi_rd_lock);
965
966                 if (closed)
967                         continue;
968
969 #ifdef CONFIG_SCST_EXTRACHECKS
970                 conn->rd_task = NULL;
971 #endif
972                 if (conn->rd_data_ready) {
973                         list_add_tail(&conn->rd_list_entry, &iscsi_rd_list);
974                         conn->rd_state = ISCSI_CONN_RD_STATE_IN_LIST;
975                 } else
976                         conn->rd_state = ISCSI_CONN_RD_STATE_IDLE;
977         }
978
979         TRACE_EXIT();
980         return;
981 }
982
983 static inline int test_rd_list(void)
984 {
985         int res = !list_empty(&iscsi_rd_list) ||
986                   unlikely(kthread_should_stop());
987         return res;
988 }
989
990 int istrd(void *arg)
991 {
992         TRACE_ENTRY();
993
994         PRINT_INFO("Read thread started, PID %d", current->pid);
995
996         current->flags |= PF_NOFREEZE;
997
998         spin_lock_bh(&iscsi_rd_lock);
999         while (!kthread_should_stop()) {
1000                 wait_queue_t wait;
1001                 init_waitqueue_entry(&wait, current);
1002
1003                 if (!test_rd_list()) {
1004                         add_wait_queue_exclusive_head(&iscsi_rd_waitQ, &wait);
1005                         for (;;) {
1006                                 set_current_state(TASK_INTERRUPTIBLE);
1007                                 if (test_rd_list())
1008                                         break;
1009                                 spin_unlock_bh(&iscsi_rd_lock);
1010                                 schedule();
1011                                 spin_lock_bh(&iscsi_rd_lock);
1012                         }
1013                         set_current_state(TASK_RUNNING);
1014                         remove_wait_queue(&iscsi_rd_waitQ, &wait);
1015                 }
1016                 scst_do_job_rd();
1017         }
1018         spin_unlock_bh(&iscsi_rd_lock);
1019
1020         /*
1021          * If kthread_should_stop() is true, we are guaranteed to be
1022          * on the module unload, so iscsi_rd_list must be empty.
1023          */
1024         sBUG_ON(!list_empty(&iscsi_rd_list));
1025
1026         PRINT_INFO("Read thread PID %d finished", current->pid);
1027
1028         TRACE_EXIT();
1029         return 0;
1030 }
1031
1032 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1033 static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd)
1034 {
1035         int v;
1036
1037         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d",
1038                 cmd, atomic_read(&cmd->net_ref_cnt)+1);
1039
1040         v = atomic_inc_return(&cmd->net_ref_cnt);
1041         if (v == 1) {
1042                 TRACE_NET_PAGE("getting cmd %p", cmd);
1043                 cmnd_get(cmd);
1044         }
1045         return;
1046 }
1047
1048 void iscsi_get_page_callback(struct page *page)
1049 {
1050         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1051
1052         TRACE_NET_PAGE("page %p, _count %d", page,
1053                 atomic_read(&page->_count));
1054
1055         __iscsi_get_page_callback(cmd);
1056         return;
1057 }
1058
1059 static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd)
1060 {
1061         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d", cmd,
1062                 atomic_read(&cmd->net_ref_cnt)-1);
1063
1064         if (atomic_dec_and_test(&cmd->net_ref_cnt)) {
1065                 int i, sg_cnt = cmd->sg_cnt;
1066                 for (i = 0; i < sg_cnt; i++) {
1067                         struct page *page = sg_page(&cmd->sg[i]);
1068                         TRACE_NET_PAGE("Clearing page %p", page);
1069                         if (page->net_priv == cmd)
1070                                 page->net_priv = NULL;
1071                 }
1072                 cmnd_put(cmd);
1073         }
1074         return;
1075 }
1076
1077 void iscsi_put_page_callback(struct page *page)
1078 {
1079         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1080
1081         TRACE_NET_PAGE("page %p, _count %d", page,
1082                 atomic_read(&page->_count));
1083
1084         __iscsi_put_page_callback(cmd);
1085         return;
1086 }
1087
1088 static void check_net_priv(struct iscsi_cmnd *cmd, struct page *page)
1089 {
1090         if ((atomic_read(&cmd->net_ref_cnt) == 1) && (page->net_priv == cmd)) {
1091                 TRACE_DBG("sendpage() not called get_page(), zeroing net_priv "
1092                         "%p (page %p)", page->net_priv, page);
1093                 page->net_priv = NULL;
1094         }
1095         return;
1096 }
1097 #else
/* No-op variants of the net_priv and net reference helpers. */
static inline void check_net_priv(struct iscsi_cmnd *cmd, struct page *page)
{
}

static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd)
{
}

static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd)
{
}
1101 #endif
1102
1103 /* This is partially taken from the Ardis code. */
1104 static int write_data(struct iscsi_conn *conn)
1105 {
1106         mm_segment_t oldfs;
1107         struct file *file;
1108         struct iovec *iop;
1109         struct socket *sock;
1110         ssize_t (*sock_sendpage)(struct socket *, struct page *, int, size_t,
1111                                  int);
1112         ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
1113         struct iscsi_cmnd *write_cmnd = conn->write_cmnd;
1114         struct iscsi_cmnd *ref_cmd;
1115         struct page *page;
1116         struct scatterlist *sg;
1117         int saved_size, size, sendsize;
1118         int length, offset, idx;
1119         int flags, res, count, sg_size;
1120         bool do_put = false, ref_cmd_to_parent;
1121
1122         TRACE_ENTRY();
1123
1124         iscsi_extracheck_is_wr_thread(conn);
1125
1126         if (write_cmnd->own_sg == 0) {
1127                 ref_cmd = write_cmnd->parent_req;
1128                 ref_cmd_to_parent = true;
1129         } else {
1130                 ref_cmd = write_cmnd;
1131                 ref_cmd_to_parent = false;
1132         }
1133
1134         if (!ref_cmd->on_written_list) {
1135                 TRACE_DBG("Adding cmd %p to conn %p written_list", ref_cmd,
1136                         conn);
1137                 spin_lock_bh(&conn->write_list_lock);
1138                 ref_cmd->on_written_list = 1;
1139                 ref_cmd->write_timeout = jiffies + ISCSI_RSP_TIMEOUT;
1140                 list_add_tail(&ref_cmd->written_list_entry,
1141                         &conn->written_list);
1142                 spin_unlock_bh(&conn->write_list_lock);
1143         }
1144
1145         if (!timer_pending(&conn->rsp_timer)) {
1146                 sBUG_ON(!ref_cmd->on_written_list);
1147                 spin_lock_bh(&conn->write_list_lock);
1148                 if (likely(!timer_pending(&conn->rsp_timer))) {
1149                         TRACE_DBG("Starting timer on %ld (conn %p)",
1150                                 ref_cmd->write_timeout, conn);
1151                         conn->rsp_timer.expires = ref_cmd->write_timeout;
1152                         add_timer(&conn->rsp_timer);
1153                 }
1154                 spin_unlock_bh(&conn->write_list_lock);
1155         }
1156
1157         file = conn->file;
1158         size = conn->write_size;
1159         saved_size = size;
1160         iop = conn->write_iop;
1161         count = conn->write_iop_used;
1162
1163         if (iop) {
1164                 while (1) {
1165                         loff_t off = 0;
1166                         int rest;
1167
1168                         sBUG_ON(count > (signed)(sizeof(conn->write_iov) /
1169                                                 sizeof(conn->write_iov[0])));
1170 retry:
1171                         oldfs = get_fs();
1172                         set_fs(KERNEL_DS);
1173                         res = vfs_writev(file,
1174                                          (struct iovec __force __user *)iop,
1175                                          count, &off);
1176                         set_fs(oldfs);
1177                         TRACE_WRITE("sid %#Lx, cid %u, res %d, iov_len %ld",
1178                                     (long long unsigned int)conn->session->sid,
1179                                     conn->cid, res, (long)iop->iov_len);
1180                         if (unlikely(res <= 0)) {
1181                                 if (res == -EAGAIN) {
1182                                         conn->write_iop = iop;
1183                                         conn->write_iop_used = count;
1184                                         goto out_iov;
1185                                 } else if (res == -EINTR)
1186                                         goto retry;
1187                                 goto out_err;
1188                         }
1189
1190                         rest = res;
1191                         size -= res;
1192                         while ((typeof(rest))iop->iov_len <= rest && rest) {
1193                                 rest -= iop->iov_len;
1194                                 iop++;
1195                                 count--;
1196                         }
1197                         if (count == 0) {
1198                                 conn->write_iop = NULL;
1199                                 conn->write_iop_used = 0;
1200                                 if (size)
1201                                         break;
1202                                 goto out_iov;
1203                         }
1204                         sBUG_ON(iop > conn->write_iov + sizeof(conn->write_iov)
1205                                                   /sizeof(conn->write_iov[0]));
1206                         iop->iov_base += rest;
1207                         iop->iov_len -= rest;
1208                 }
1209         }
1210
1211         sg = write_cmnd->sg;
1212         if (unlikely(sg == NULL)) {
1213                 PRINT_INFO("WARNING: Data missed (cmd %p)!", write_cmnd);
1214                 res = 0;
1215                 goto out;
1216         }
1217
1218         /* To protect from too early transfer completion race */
1219         __iscsi_get_page_callback(ref_cmd);
1220         do_put = true;
1221
1222         sock = conn->sock;
1223
1224 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1225         sock_sendpage = sock->ops->sendpage;
1226 #else
1227         if ((write_cmnd->parent_req->scst_cmd != NULL) &&
1228             scst_cmd_get_dh_data_buff_alloced(write_cmnd->parent_req->scst_cmd))
1229                 sock_sendpage = sock_no_sendpage;
1230         else
1231                 sock_sendpage = sock->ops->sendpage;
1232 #endif
1233
1234         flags = MSG_DONTWAIT;
1235         sg_size = size;
1236
1237         if (sg != write_cmnd->rsp_sg) {
1238                 offset = conn->write_offset + sg[0].offset;
1239                 idx = offset >> PAGE_SHIFT;
1240                 offset &= ~PAGE_MASK;
1241                 length = min(size, (int)PAGE_SIZE - offset);
1242                 TRACE_WRITE("write_offset %d, sg_size %d, idx %d, offset %d, "
1243                         "length %d", conn->write_offset, sg_size, idx, offset,
1244                         length);
1245         } else {
1246                 idx = 0;
1247                 offset = conn->write_offset;
1248                 while (offset >= sg[idx].length) {
1249                         offset -= sg[idx].length;
1250                         idx++;
1251                 }
1252                 length = sg[idx].length - offset;
1253                 offset += sg[idx].offset;
1254                 sock_sendpage = sock_no_sendpage;
1255                 TRACE_WRITE("rsp_sg: write_offset %d, sg_size %d, idx %d, "
1256                         "offset %d, length %d", conn->write_offset, sg_size,
1257                         idx, offset, length);
1258         }
1259         page = sg_page(&sg[idx]);
1260
1261         while (1) {
1262                 sendpage = sock_sendpage;
1263
1264 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1265                 {
1266                         static DEFINE_SPINLOCK(net_priv_lock);
1267                         spin_lock(&net_priv_lock);
1268                         if (unlikely(page->net_priv != NULL)) {
1269                                 if (page->net_priv != ref_cmd) {
1270                                         /*
1271                                          * This might happen if user space
1272                                          * supplies to scst_user the same
1273                                          * pages in different commands or in
1274                                          * case of zero-copy FILEIO, when
1275                                          * several initiators request the same
1276                                          * data simultaneously.
1277                                          */
1278                                         TRACE_DBG("net_priv isn't NULL and != "
1279                                             "ref_cmd (write_cmnd %p, ref_cmd "
1280                                             "%p, sg %p, idx %d, page %p, "
1281                                             "net_priv %p)",
1282                                             write_cmnd, ref_cmd, sg, idx,
1283                                             page, page->net_priv);
1284                                         sendpage = sock_no_sendpage;
1285                                 }
1286                         } else
1287                                 page->net_priv = ref_cmd;
1288                         spin_unlock(&net_priv_lock);
1289                 }
1290 #endif
1291                 sendsize = min(size, length);
1292                 if (size <= sendsize) {
1293 retry2:
1294                         res = sendpage(sock, page, offset, size, flags);
1295                         TRACE_WRITE("Final %s sid %#Lx, cid %u, res %d (page "
1296                                 "index %lu, offset %u, size %u, cmd %p, "
1297                                 "page %p)", (sendpage != sock_no_sendpage) ?
1298                                                 "sendpage" : "sock_no_sendpage",
1299                                 (long long unsigned int)conn->session->sid,
1300                                 conn->cid, res, page->index,
1301                                 offset, size, write_cmnd, page);
1302                         if (unlikely(res <= 0)) {
1303                                 if (res == -EINTR)
1304                                         goto retry2;
1305                                 else
1306                                         goto out_res;
1307                         }
1308
1309                         check_net_priv(ref_cmd, page);
1310                         if (res == size) {
1311                                 conn->write_size = 0;
1312                                 res = saved_size;
1313                                 goto out_put;
1314                         }
1315
1316                         offset += res;
1317                         size -= res;
1318                         goto retry2;
1319                 }
1320
1321 retry1:
1322                 res = sendpage(sock, page, offset, sendsize, flags | MSG_MORE);
1323                 TRACE_WRITE("%s sid %#Lx, cid %u, res %d (page index %lu, "
1324                         "offset %u, sendsize %u, size %u, cmd %p, page %p)",
1325                         (sendpage != sock_no_sendpage) ? "sendpage" :
1326                                                          "sock_no_sendpage",
1327                         (unsigned long long)conn->session->sid, conn->cid,
1328                         res, page->index, offset, sendsize, size,
1329                         write_cmnd, page);
1330                 if (unlikely(res <= 0)) {
1331                         if (res == -EINTR)
1332                                 goto retry1;
1333                         else
1334                                 goto out_res;
1335                 }
1336
1337                 check_net_priv(ref_cmd, page);
1338
1339                 size -= res;
1340
1341                 if (res == sendsize) {
1342                         idx++;
1343                         EXTRACHECKS_BUG_ON(idx >= ref_cmd->sg_cnt);
1344                         page = sg_page(&sg[idx]);
1345                         length = sg[idx].length;
1346                         offset = sg[idx].offset;
1347                 } else {
1348                         offset += res;
1349                         sendsize -= res;
1350                         goto retry1;
1351                 }
1352         }
1353
1354 out_off:
1355         conn->write_offset += sg_size - size;
1356
1357 out_iov:
1358         conn->write_size = size;
1359         if ((saved_size == size) && res == -EAGAIN)
1360                 goto out_put;
1361
1362         res = saved_size - size;
1363
1364 out_put:
1365         if (do_put)
1366                 __iscsi_put_page_callback(ref_cmd);
1367
1368 out:
1369         TRACE_EXIT_RES(res);
1370         return res;
1371
1372 out_res:
1373         check_net_priv(ref_cmd, page);
1374         if (res == -EAGAIN)
1375                 goto out_off;
1376         /* else go through */
1377
1378 out_err:
1379 #ifndef CONFIG_SCST_DEBUG
1380         if (!conn->closing)
1381 #endif
1382         {
1383                 PRINT_ERROR("error %d at sid:cid %#Lx:%u, cmnd %p", res,
1384                             (long long unsigned int)conn->session->sid,
1385                             conn->cid, conn->write_cmnd);
1386         }
1387         if (ref_cmd_to_parent &&
1388             ((ref_cmd->scst_cmd != NULL) || (ref_cmd->scst_aen != NULL))) {
1389                 if (ref_cmd->scst_state == ISCSI_CMD_STATE_AEN)
1390                         scst_set_aen_delivery_status(ref_cmd->scst_aen,
1391                                 SCST_AEN_RES_FAILED);
1392                 else
1393                         scst_set_delivery_status(ref_cmd->scst_cmd,
1394                                 SCST_CMD_DELIVERY_FAILED);
1395         }
1396         goto out_put;
1397 }
1398
1399 static int exit_tx(struct iscsi_conn *conn, int res)
1400 {
1401         iscsi_extracheck_is_wr_thread(conn);
1402
1403         switch (res) {
1404         case -EAGAIN:
1405         case -ERESTARTSYS:
1406                 res = 0;
1407                 break;
1408         default:
1409 #ifndef CONFIG_SCST_DEBUG
1410                 if (!conn->closing)
1411 #endif
1412                 {
1413                         PRINT_ERROR("Sending data failed: initiator %s, "
1414                                 "write_size %d, write_state %d, res %d",
1415                                 conn->session->initiator_name,
1416                                 conn->write_size,
1417                                 conn->write_state, res);
1418                 }
1419                 conn->write_state = TX_END;
1420                 conn->write_size = 0;
1421                 mark_conn_closed(conn);
1422                 break;
1423         }
1424         return res;
1425 }
1426
1427 static int tx_ddigest(struct iscsi_cmnd *cmnd, int state)
1428 {
1429         int res, rest = cmnd->conn->write_size;
1430         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1431         struct kvec iov;
1432
1433         iscsi_extracheck_is_wr_thread(cmnd->conn);
1434
1435         TRACE_DBG("Sending data digest %x (cmd %p)", cmnd->ddigest, cmnd);
1436
1437         iov.iov_base = (char *)(&cmnd->ddigest) + (sizeof(u32) - rest);
1438         iov.iov_len = rest;
1439
1440         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1441         if (res > 0) {
1442                 cmnd->conn->write_size -= res;
1443                 if (!cmnd->conn->write_size)
1444                         cmnd->conn->write_state = state;
1445         } else
1446                 res = exit_tx(cmnd->conn, res);
1447
1448         return res;
1449 }
1450
1451 static void init_tx_hdigest(struct iscsi_cmnd *cmnd)
1452 {
1453         struct iscsi_conn *conn = cmnd->conn;
1454         struct iovec *iop;
1455
1456         iscsi_extracheck_is_wr_thread(conn);
1457
1458         digest_tx_header(cmnd);
1459
1460         sBUG_ON(conn->write_iop_used >=
1461                 (signed)(sizeof(conn->write_iov)/sizeof(conn->write_iov[0])));
1462
1463         iop = &conn->write_iop[conn->write_iop_used];
1464         conn->write_iop_used++;
1465         iop->iov_base = (void __force __user *)&(cmnd->hdigest);
1466         iop->iov_len = sizeof(u32);
1467         conn->write_size += sizeof(u32);
1468
1469         return;
1470 }
1471
1472 static int tx_padding(struct iscsi_cmnd *cmnd, int state)
1473 {
1474         int res, rest = cmnd->conn->write_size;
1475         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1476         struct kvec iov;
1477         static const uint32_t padding;
1478
1479         iscsi_extracheck_is_wr_thread(cmnd->conn);
1480
1481         TRACE_DBG("Sending %d padding bytes (cmd %p)", rest, cmnd);
1482
1483         iov.iov_base = (char *)(&padding) + (sizeof(uint32_t) - rest);
1484         iov.iov_len = rest;
1485
1486         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1487         if (res > 0) {
1488                 cmnd->conn->write_size -= res;
1489                 if (!cmnd->conn->write_size)
1490                         cmnd->conn->write_state = state;
1491         } else
1492                 res = exit_tx(cmnd->conn, res);
1493
1494         return res;
1495 }
1496
1497 static int iscsi_do_send(struct iscsi_conn *conn, int state)
1498 {
1499         int res;
1500
1501         iscsi_extracheck_is_wr_thread(conn);
1502
1503         res = write_data(conn);
1504         if (res > 0) {
1505                 if (!conn->write_size)
1506                         conn->write_state = state;
1507         } else
1508                 res = exit_tx(conn, res);
1509
1510         return res;
1511 }
1512
1513 /*
1514  * No locks, conn is wr processing.
1515  *
1516  * IMPORTANT! Connection conn must be protected by additional conn_get()
1517  * upon entrance in this function, because otherwise it could be destroyed
1518  * inside as a result of cmnd release.
1519  */
1520 int iscsi_send(struct iscsi_conn *conn)
1521 {
1522         struct iscsi_cmnd *cmnd = conn->write_cmnd;
1523         int ddigest, res = 0;
1524
1525         TRACE_ENTRY();
1526
1527         TRACE_DBG("conn %p, write_cmnd %p", conn, cmnd);
1528
1529         iscsi_extracheck_is_wr_thread(conn);
1530
1531         ddigest = conn->ddigest_type != DIGEST_NONE ? 1 : 0;
1532
1533         switch (conn->write_state) {
1534         case TX_INIT:
1535                 sBUG_ON(cmnd != NULL);
1536                 cmnd = conn->write_cmnd = iscsi_get_send_cmnd(conn);
1537                 if (!cmnd)
1538                         goto out;
1539                 cmnd_tx_start(cmnd);
1540                 if (!(conn->hdigest_type & DIGEST_NONE))
1541                         init_tx_hdigest(cmnd);
1542                 conn->write_state = TX_BHS_DATA;
1543         case TX_BHS_DATA:
1544                 res = iscsi_do_send(conn, cmnd->pdu.datasize ?
1545                                         TX_INIT_PADDING : TX_END);
1546                 if (res <= 0 || conn->write_state != TX_INIT_PADDING)
1547                         break;
1548         case TX_INIT_PADDING:
1549                 cmnd->conn->write_size = ((cmnd->pdu.datasize + 3) & -4) -
1550                                                 cmnd->pdu.datasize;
1551                 if (cmnd->conn->write_size != 0)
1552                         conn->write_state = TX_PADDING;
1553                 else if (ddigest)
1554                         conn->write_state = TX_INIT_DDIGEST;
1555                  else
1556                         conn->write_state = TX_END;
1557                 break;
1558         case TX_PADDING:
1559                 res = tx_padding(cmnd, ddigest ? TX_INIT_DDIGEST : TX_END);
1560                 if (res <= 0 || conn->write_state != TX_INIT_DDIGEST)
1561                         break;
1562         case TX_INIT_DDIGEST:
1563                 cmnd->conn->write_size = sizeof(u32);
1564                 conn->write_state = TX_DDIGEST;
1565         case TX_DDIGEST:
1566                 res = tx_ddigest(cmnd, TX_END);
1567                 break;
1568         default:
1569                 PRINT_CRIT_ERROR("%d %d %x", res, conn->write_state,
1570                         cmnd_opcode(cmnd));
1571                 sBUG();
1572         }
1573
1574         if (res == 0)
1575                 goto out;
1576
1577         if (conn->write_state != TX_END)
1578                 goto out;
1579
1580         if (unlikely(conn->write_size)) {
1581                 PRINT_CRIT_ERROR("%d %x %u", res, cmnd_opcode(cmnd),
1582                         conn->write_size);
1583                 sBUG();
1584         }
1585         cmnd_tx_end(cmnd);
1586
1587         rsp_cmnd_release(cmnd);
1588
1589         conn->write_cmnd = NULL;
1590         conn->write_state = TX_INIT;
1591
1592 out:
1593         TRACE_EXIT_RES(res);
1594         return res;
1595 }
1596
/*
 * Calls iscsi_send() for the connection if test_write_ready() says
 * there is something to send. Returns the iscsi_send() result, or 0
 * if nothing was attempted.
 *
 * No locks, conn is wr processing.
 *
 * IMPORTANT! The caller must hold an additional conn_get() reference on
 * conn when entering this function, because otherwise the connection
 * could be destroyed inside as a result of iscsi_send(), which releases
 * sent commands.
 */
static int process_write_queue(struct iscsi_conn *conn)
{
        int res;

        TRACE_ENTRY();

        if (unlikely(!test_write_ready(conn)))
                res = 0;
        else
                res = iscsi_send(conn);

        TRACE_EXIT_RES(res);
        return res;
}
1615
1616 /*
1617  * Called under iscsi_wr_lock and BHs disabled, but will drop it inside,
1618  * then reaquire.
1619  */
1620 static void scst_do_job_wr(void)
1621         __acquires(&iscsi_wr_lock)
1622         __releases(&iscsi_wr_lock)
1623 {
1624         TRACE_ENTRY();
1625
1626         /*
1627          * We delete/add to tail connections to maintain fairness between them.
1628          */
1629
1630         while (!list_empty(&iscsi_wr_list)) {
1631                 int rc;
1632                 struct iscsi_conn *conn = list_entry(iscsi_wr_list.next,
1633                         typeof(*conn), wr_list_entry);
1634
1635                 TRACE_DBG("conn %p, wr_state %x, wr_space_ready %d, "
1636                         "write ready %d", conn, conn->wr_state,
1637                         conn->wr_space_ready, test_write_ready(conn));
1638
1639                 list_del(&conn->wr_list_entry);
1640
1641                 sBUG_ON(conn->wr_state == ISCSI_CONN_WR_STATE_PROCESSING);
1642
1643                 conn->wr_state = ISCSI_CONN_WR_STATE_PROCESSING;
1644                 conn->wr_space_ready = 0;
1645 #ifdef CONFIG_SCST_EXTRACHECKS
1646                 conn->wr_task = current;
1647 #endif
1648                 spin_unlock_bh(&iscsi_wr_lock);
1649
1650                 conn_get(conn);
1651
1652                 rc = process_write_queue(conn);
1653
1654                 spin_lock_bh(&iscsi_wr_lock);
1655 #ifdef CONFIG_SCST_EXTRACHECKS
1656                 conn->wr_task = NULL;
1657 #endif
1658                 if ((rc == -EAGAIN) && !conn->wr_space_ready) {
1659                         conn->wr_state = ISCSI_CONN_WR_STATE_SPACE_WAIT;
1660                         goto cont;
1661                 }
1662
1663                 if (test_write_ready(conn)) {
1664                         list_add_tail(&conn->wr_list_entry, &iscsi_wr_list);
1665                         conn->wr_state = ISCSI_CONN_WR_STATE_IN_LIST;
1666                 } else
1667                         conn->wr_state = ISCSI_CONN_WR_STATE_IDLE;
1668
1669 cont:
1670                 conn_put(conn);
1671         }
1672
1673         TRACE_EXIT();
1674         return;
1675 }
1676
1677 static inline int test_wr_list(void)
1678 {
1679         int res = !list_empty(&iscsi_wr_list) ||
1680                   unlikely(kthread_should_stop());
1681         return res;
1682 }
1683
1684 int istwr(void *arg)
1685 {
1686         TRACE_ENTRY();
1687
1688         PRINT_INFO("Write thread started, PID %d", current->pid);
1689
1690         current->flags |= PF_NOFREEZE;
1691
1692         spin_lock_bh(&iscsi_wr_lock);
1693         while (!kthread_should_stop()) {
1694                 wait_queue_t wait;
1695                 init_waitqueue_entry(&wait, current);
1696
1697                 if (!test_wr_list()) {
1698                         add_wait_queue_exclusive_head(&iscsi_wr_waitQ, &wait);
1699                         for (;;) {
1700                                 set_current_state(TASK_INTERRUPTIBLE);
1701                                 if (test_wr_list())
1702                                         break;
1703                                 spin_unlock_bh(&iscsi_wr_lock);
1704                                 schedule();
1705                                 spin_lock_bh(&iscsi_wr_lock);
1706                         }
1707                         set_current_state(TASK_RUNNING);
1708                         remove_wait_queue(&iscsi_wr_waitQ, &wait);
1709                 }
1710                 scst_do_job_wr();
1711         }
1712         spin_unlock_bh(&iscsi_wr_lock);
1713
1714         /*
1715          * If kthread_should_stop() is true, we are guaranteed to be
1716          * on the module unload, so iscsi_wr_list must be empty.
1717          */
1718         sBUG_ON(!list_empty(&iscsi_wr_list));
1719
1720         PRINT_INFO("Write thread PID %d finished", current->pid);
1721
1722         TRACE_EXIT();
1723         return 0;
1724 }