A large set of interrelated changes that had to be tested together.
[mirror/scst/.git] / iscsi-scst / kernel / nthread.c
1 /*
2  *  Network threads.
3  *
4  *  Copyright (C) 2004 - 2005 FUJITA Tomonori <tomof@acm.org>
5  *  Copyright (C) 2007 - 2009 Vladislav Bolkhovitin
6  *  Copyright (C) 2007 - 2009 ID7 Ltd.
7  *
8  *  This program is free software; you can redistribute it and/or
9  *  modify it under the terms of the GNU General Public License
10  *  as published by the Free Software Foundation.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  */
17
18 #include <linux/sched.h>
19 #include <linux/file.h>
20 #include <linux/kthread.h>
21 #include <asm/ioctls.h>
22 #include <linux/delay.h>
23 #include <net/tcp.h>
24
25 #include "iscsi.h"
26 #include "digest.h"
27
/*
 * Receive-side (RX) state machine for a connection. The read thread
 * advances through these states while receiving one PDU: BHS first,
 * then optional AHS, header digest, data, padding and data digest.
 * The enumerator order of the first group is relied upon (see below).
 */
enum rx_state {
	RX_INIT_BHS, /* Must be zero for better "switch" optimization. */
	RX_BHS,
	RX_CMD_START,
	RX_DATA,
	RX_END,

	RX_CMD_CONTINUE,
	RX_INIT_HDIGEST,
	RX_CHECK_HDIGEST,
	RX_INIT_DDIGEST,
	RX_CHECK_DDIGEST,
	RX_AHS,
	RX_PADDING,
};
43
/*
 * Transmit-side (TX) state machine for a connection. The write thread
 * walks these states while sending one response PDU: BHS plus data,
 * then optional padding and data digest.
 */
enum tx_state {
	TX_INIT = 0, /* Must be zero for better "switch" optimization. */
	TX_BHS_DATA,
	TX_INIT_PADDING,
	TX_PADDING,
	TX_INIT_DDIGEST,
	TX_DDIGEST,
	TX_END,
};
53
#if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
/*
 * Drop zero-copy network references still held on this connection's data
 * pages once the TCP socket reached CLOSE state.
 *
 * With zero-copy transmission the network stack pins pages via
 * page->net_priv until transmission completes; TCP often releases them
 * only in close(), which user space may never call. Since no more data
 * will be sent on a closed socket, releasing the references here allows
 * the still-queued commands to be freed.
 *
 * Called with no locks held; takes conn->cmd_list_lock internally and
 * may temporarily drop it (see the "restart" handling below).
 */
static void iscsi_check_closewait(struct iscsi_conn *conn)
{
	struct iscsi_cmnd *cmnd;

	TRACE_ENTRY();

	TRACE_CONN_CLOSE_DBG("conn %p, sk_state %d", conn,
		conn->sock->sk->sk_state);

	/* Nothing to do until TCP has fully closed on its side */
	if (conn->sock->sk->sk_state != TCP_CLOSE) {
		TRACE_CONN_CLOSE_DBG("conn %p, skipping", conn);
		goto out;
	}

	/*
	 * No data are going to be sent, so all queued buffers can be freed
	 * now. In many cases TCP does that only in close(), but we can't rely
	 * on user space on calling it.
	 */

again:
	spin_lock_bh(&conn->cmd_list_lock);
	list_for_each_entry(cmnd, &conn->cmd_list, cmd_list_entry) {
		struct iscsi_cmnd *rsp;
		int restart = 0;

		TRACE_CONN_CLOSE_DBG("cmd %p, scst_state %x, "
			"r2t_len_to_receive %d, ref_cnt %d, parent_req %p, "
			"net_ref_cnt %d, sg %p", cmnd, cmnd->scst_state,
			cmnd->r2t_len_to_receive, atomic_read(&cmnd->ref_cnt),
			cmnd->parent_req, atomic_read(&cmnd->net_ref_cnt),
			cmnd->sg);

		/* Only top-level requests live on conn->cmd_list */
		sBUG_ON(cmnd->parent_req != NULL);

		if (cmnd->sg != NULL) {
			int i;

			/* Pin cmnd so it survives while we drop the lock */
			if (cmnd_get_check(cmnd))
				continue;

			for (i = 0; i < cmnd->sg_cnt; i++) {
				struct page *page = sg_page(&cmnd->sg[i]);
				TRACE_CONN_CLOSE_DBG("page %p, net_priv %p, "
					"_count %d", page, page->net_priv,
					atomic_read(&page->_count));

				if (page->net_priv != NULL) {
					/*
					 * Drop the lock before releasing net
					 * references; the list may change, so
					 * a full rescan ("restart") is needed.
					 */
					if (restart == 0) {
						spin_unlock_bh(&conn->cmd_list_lock);
						restart = 1;
					}
					/* Release every net reference held */
					while (page->net_priv != NULL)
						iscsi_put_page_callback(page);
				}
			}
			cmnd_put(cmnd);

			if (restart)
				goto again;
		}

		/* Same treatment for the pages of each queued response */
		list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
				rsp_cmd_list_entry) {
			TRACE_CONN_CLOSE_DBG("  rsp %p, ref_cnt %d, "
				"net_ref_cnt %d, sg %p",
				rsp, atomic_read(&rsp->ref_cnt),
				atomic_read(&rsp->net_ref_cnt), rsp->sg);

			/* Skip sg shared with the request: handled above */
			if ((rsp->sg != cmnd->sg) && (rsp->sg != NULL)) {
				int i;

				if (cmnd_get_check(rsp))
					continue;

				for (i = 0; i < rsp->sg_cnt; i++) {
					struct page *page =
						sg_page(&rsp->sg[i]);
					TRACE_CONN_CLOSE_DBG(
						"    page %p, net_priv %p, "
						"_count %d",
						page, page->net_priv,
						atomic_read(&page->_count));

					if (page->net_priv != NULL) {
						if (restart == 0) {
							spin_unlock_bh(&conn->cmd_list_lock);
							restart = 1;
						}
						while (page->net_priv != NULL)
							iscsi_put_page_callback(page);
					}
				}
				cmnd_put(rsp);

				if (restart)
					goto again;
			}
		}
	}
	spin_unlock_bh(&conn->cmd_list_lock);

out:
	TRACE_EXIT();
	return;
}
#else
static inline void iscsi_check_closewait(struct iscsi_conn *conn) {};
#endif
164
165 static void free_pending_commands(struct iscsi_conn *conn)
166 {
167         struct iscsi_session *session = conn->session;
168         struct list_head *pending_list = &session->pending_list;
169         int req_freed;
170         struct iscsi_cmnd *cmnd;
171
172         spin_lock(&session->sn_lock);
173         do {
174                 req_freed = 0;
175                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
176                         TRACE_CONN_CLOSE_DBG("Pending cmd %p"
177                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
178                                 cmnd, conn, cmnd->pdu.bhs.sn,
179                                 session->exp_cmd_sn);
180                         if ((cmnd->conn == conn) &&
181                             (session->exp_cmd_sn == cmnd->pdu.bhs.sn)) {
182                                 TRACE_CONN_CLOSE_DBG("Freeing pending cmd %p",
183                                         cmnd);
184
185                                 list_del(&cmnd->pending_list_entry);
186                                 cmnd->pending = 0;
187
188                                 session->exp_cmd_sn++;
189
190                                 spin_unlock(&session->sn_lock);
191
192                                 req_cmnd_release_force(cmnd);
193
194                                 req_freed = 1;
195                                 spin_lock(&session->sn_lock);
196                                 break;
197                         }
198                 }
199         } while (req_freed);
200         spin_unlock(&session->sn_lock);
201
202         return;
203 }
204
/*
 * Forcibly release ALL of this connection's pending commands, regardless
 * of cmd_sn ordering ("orphaned" because the expected sequence can no
 * longer be completed). exp_cmd_sn is advanced only when the freed
 * command happens to be the next expected one.
 *
 * sn_lock must be dropped around req_cmnd_release_force(), so the list
 * is rescanned from the top after every release (the req_freed loop).
 */
static void free_orphaned_pending_commands(struct iscsi_conn *conn)
{
	struct iscsi_session *session = conn->session;
	struct list_head *pending_list = &session->pending_list;
	int req_freed;
	struct iscsi_cmnd *cmnd;

	spin_lock(&session->sn_lock);
	do {
		req_freed = 0;
		list_for_each_entry(cmnd, pending_list, pending_list_entry) {
			TRACE_CONN_CLOSE_DBG("Pending cmd %p"
				"(conn %p, cmd_sn %u, exp_cmd_sn %u)",
				cmnd, conn, cmnd->pdu.bhs.sn,
				session->exp_cmd_sn);
			if (cmnd->conn == conn) {
				PRINT_ERROR("Freeing orphaned pending cmd %p",
					    cmnd);

				list_del(&cmnd->pending_list_entry);
				cmnd->pending = 0;

				/* Keep exp_cmd_sn consistent if possible */
				if (session->exp_cmd_sn == cmnd->pdu.bhs.sn)
					session->exp_cmd_sn++;

				/* Can't release with sn_lock held */
				spin_unlock(&session->sn_lock);

				req_cmnd_release_force(cmnd);

				req_freed = 1;
				spin_lock(&session->sn_lock);
				break;
			}
		}
	} while (req_freed);
	spin_unlock(&session->sn_lock);

	return;
}
244
#ifdef CONFIG_SCST_DEBUG
/*
 * Debug-only helper: dump the state of every command (and its queued
 * responses) still attached to a closing connection. Pure tracing, no
 * side effects other than log output. Compiled out without
 * CONFIG_SCST_DEBUG.
 */
static void trace_conn_close(struct iscsi_conn *conn)
{
	struct iscsi_cmnd *cmnd;
#if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
	struct iscsi_cmnd *rsp;
#endif

/*
 * Disabled: would escalate tracing after 10s of waiting. NOTE(review):
 * references start_waiting, which is a close_conn() local — this would
 * not compile if enabled as-is.
 */
#if 0
	if (time_after(jiffies, start_waiting + 10*HZ))
		trace_flag |= TRACE_CONN_OC_DBG;
#endif

	spin_lock_bh(&conn->cmd_list_lock);
	list_for_each_entry(cmnd, &conn->cmd_list,
			cmd_list_entry) {
		TRACE_CONN_CLOSE_DBG(
			"cmd %p, scst_cmd %p, scst_state %x, scst_cmd state "
			"%d, r2t_len_to_receive %d, ref_cnt %d, sn %u, "
			"parent_req %p, pending %d",
			cmnd, cmnd->scst_cmd, cmnd->scst_state,
			((cmnd->parent_req == NULL) && cmnd->scst_cmd) ?
				cmnd->scst_cmd->state : -1,
			cmnd->r2t_len_to_receive, atomic_read(&cmnd->ref_cnt),
			cmnd->pdu.bhs.sn, cmnd->parent_req, cmnd->pending);
#if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
		TRACE_CONN_CLOSE_DBG("net_ref_cnt %d, sg %p",
			atomic_read(&cmnd->net_ref_cnt),
			cmnd->sg);
		if (cmnd->sg != NULL) {
			int i;
			for (i = 0; i < cmnd->sg_cnt; i++) {
				struct page *page = sg_page(&cmnd->sg[i]);
				TRACE_CONN_CLOSE_DBG("page %p, "
					"net_priv %p, _count %d",
					page, page->net_priv,
					atomic_read(&page->_count));
			}
		}

		/* Only top-level requests live on conn->cmd_list */
		sBUG_ON(cmnd->parent_req != NULL);

		list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
				rsp_cmd_list_entry) {
			TRACE_CONN_CLOSE_DBG("  rsp %p, "
			    "ref_cnt %d, net_ref_cnt %d, sg %p",
			    rsp, atomic_read(&rsp->ref_cnt),
			    atomic_read(&rsp->net_ref_cnt), rsp->sg);
			if (rsp->sg != cmnd->sg && rsp->sg) {
				int i;
				for (i = 0; i < rsp->sg_cnt; i++) {
					TRACE_CONN_CLOSE_DBG("    page %p, "
					  "net_priv %p, _count %d",
					  sg_page(&rsp->sg[i]),
					  sg_page(&rsp->sg[i])->net_priv,
					  atomic_read(&sg_page(&rsp->sg[i])->
						_count));
				}
			}
		}
#endif /* CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION */
	}
	spin_unlock_bh(&conn->cmd_list_lock);
	return;
}
#else /* CONFIG_SCST_DEBUG */
static void trace_conn_close(struct iscsi_conn *conn) {}
#endif /* CONFIG_SCST_DEBUG */
313
/*
 * SCST callback: all commands affected by task-management function
 * scst_mcmd have been completed.
 *
 * For the session-wide functions issued from close_conn()
 * (SCST_NEXUS_LOSS_SESS / SCST_ABORT_ALL_TASKS_SESS), priv is the
 * closing iscsi_conn. Here we mark the session as shutting down (only
 * once every connection is shutting down), finish any pending
 * connection/session reinstatement, and signal conn->ready_to_free so
 * close_conn() may proceed. Other functions need no processing.
 */
void iscsi_task_mgmt_affected_cmds_done(struct scst_mgmt_cmd *scst_mcmd)
{
	int fn = scst_mgmt_cmd_get_fn(scst_mcmd);
	void *priv = scst_mgmt_cmd_get_tgt_priv(scst_mcmd);

	TRACE_MGMT_DBG("scst_mcmd %p, fn %d, priv %p", scst_mcmd, fn, priv);

	switch (fn) {
	case SCST_NEXUS_LOSS_SESS:
	case SCST_ABORT_ALL_TASKS_SESS:
	{
		struct iscsi_conn *conn = (struct iscsi_conn *)priv;
		struct iscsi_session *sess = conn->session;
		struct iscsi_conn *c;

		mutex_lock(&sess->target->target_mutex);

		/*
		 * We can't mark sess as shutting down earlier, because until
		 * now it might have pending commands. Otherwise, in case of
		 * reinstatement it might lead to data corruption, because
		 * commands in being reinstated session can be executed
		 * after commands in the new session.
		 */
		sess->sess_shutting_down = 1;
		/* Undo the mark if any connection is not yet shutting down */
		list_for_each_entry(c, &sess->conn_list, conn_list_entry) {
			if (!test_bit(ISCSI_CONN_SHUTTINGDOWN, &c->conn_aflags)) {
				sess->sess_shutting_down = 0;
				break;
			}
		}

		if (conn->conn_reinst_successor != NULL) {
			sBUG_ON(!test_bit(ISCSI_CONN_REINSTATING,
				  &conn->conn_reinst_successor->conn_aflags));
			conn_reinst_finished(conn->conn_reinst_successor);
			conn->conn_reinst_successor = NULL;
		} else if (sess->sess_reinst_successor != NULL) {
			sess_reinst_finished(sess->sess_reinst_successor);
			sess->sess_reinst_successor = NULL;
		}
		mutex_unlock(&sess->target->target_mutex);

		/* Unblock close_conn()'s wait_for_completion() */
		complete_all(&conn->ready_to_free);
		break;
	}
	default:
		/* Nothing to do */
		break;
	}

	return;
}
367
368 /* No locks */
369 static void close_conn(struct iscsi_conn *conn)
370 {
371         struct iscsi_session *session = conn->session;
372         struct iscsi_target *target = conn->target;
373         typeof(jiffies) start_waiting = jiffies;
374         typeof(jiffies) shut_start_waiting = start_waiting;
375         bool pending_reported = 0, wait_expired = 0, shut_expired = 0;
376         bool reinst;
377
378 #define CONN_PENDING_TIMEOUT    ((typeof(jiffies))10*HZ)
379 #define CONN_WAIT_TIMEOUT       ((typeof(jiffies))10*HZ)
380 #define CONN_REG_SHUT_TIMEOUT   ((typeof(jiffies))125*HZ)
381 #define CONN_DEL_SHUT_TIMEOUT   ((typeof(jiffies))10*HZ)
382
383         TRACE_ENTRY();
384
385         TRACE_CONN_CLOSE("Closing connection %p (conn_ref_cnt=%d)", conn,
386                 atomic_read(&conn->conn_ref_cnt));
387
388         iscsi_extracheck_is_rd_thread(conn);
389
390         sBUG_ON(!conn->closing);
391
392         if (conn->active_close) {
393                 /* We want all our already send operations to complete */
394                 conn->sock->ops->shutdown(conn->sock, RCV_SHUTDOWN);
395         } else {
396                 conn->sock->ops->shutdown(conn->sock,
397                         RCV_SHUTDOWN|SEND_SHUTDOWN);
398         }
399
400         mutex_lock(&session->target->target_mutex);
401
402         set_bit(ISCSI_CONN_SHUTTINGDOWN, &conn->conn_aflags);
403         reinst = (conn->conn_reinst_successor != NULL);
404
405         mutex_unlock(&session->target->target_mutex);
406
407         if (reinst) {
408                 int rc;
409                 int lun = 0;
410
411                 /* Abort all outstanding commands */
412                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
413                         SCST_ABORT_ALL_TASKS_SESS, (uint8_t *)&lun, sizeof(lun),
414                         SCST_NON_ATOMIC, conn);
415                 if (rc != 0)
416                         PRINT_ERROR("SCST_ABORT_ALL_TASKS_SESS failed %d", rc);
417         } else {
418                 int rc;
419                 int lun = 0;
420
421                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
422                         SCST_NEXUS_LOSS_SESS, (uint8_t *)&lun, sizeof(lun),
423                         SCST_NON_ATOMIC, conn);
424                 if (rc != 0)
425                         PRINT_ERROR("SCST_NEXUS_LOSS_SESS failed %d", rc);
426         }
427
428         if (conn->read_state != RX_INIT_BHS) {
429                 struct iscsi_cmnd *cmnd = conn->read_cmnd;
430
431                 if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
432                         TRACE_CONN_CLOSE_DBG("Going to wait for cmnd %p to "
433                                 "change state from RX_CMD", cmnd);
434                 }
435                 wait_event(conn->read_state_waitQ,
436                         cmnd->scst_state != ISCSI_CMD_STATE_RX_CMD);
437
438                 TRACE_CONN_CLOSE_DBG("Releasing conn->read_cmnd %p (conn %p)",
439                         conn->read_cmnd, conn);
440
441                 conn->read_cmnd = NULL;
442                 conn->read_state = RX_INIT_BHS;
443                 req_cmnd_release_force(cmnd);
444         }
445
446         conn_abort(conn);
447
448         /* ToDo: not the best way to wait */
449         while (atomic_read(&conn->conn_ref_cnt) != 0) {
450                 if (conn->conn_tm_active)
451                         iscsi_check_tm_data_wait_timeouts(conn, true);
452
453                 mutex_lock(&target->target_mutex);
454                 spin_lock(&session->sn_lock);
455                 if (session->tm_rsp && session->tm_rsp->conn == conn) {
456                         struct iscsi_cmnd *tm_rsp = session->tm_rsp;
457                         TRACE(TRACE_MGMT_MINOR, "Dropping delayed TM rsp %p",
458                                 tm_rsp);
459                         session->tm_rsp = NULL;
460                         session->tm_active--;
461                         WARN_ON(session->tm_active < 0);
462                         spin_unlock(&session->sn_lock);
463                         mutex_unlock(&target->target_mutex);
464
465                         rsp_cmnd_release(tm_rsp);
466                 } else {
467                         spin_unlock(&session->sn_lock);
468                         mutex_unlock(&target->target_mutex);
469                 }
470
471                 /* It's safe to check it without sn_lock */
472                 if (!list_empty(&session->pending_list)) {
473                         TRACE_CONN_CLOSE_DBG("Disposing pending commands on "
474                                 "connection %p (conn_ref_cnt=%d)", conn,
475                                 atomic_read(&conn->conn_ref_cnt));
476
477                         free_pending_commands(conn);
478
479                         if (time_after(jiffies,
480                                 start_waiting + CONN_PENDING_TIMEOUT)) {
481                                 if (!pending_reported) {
482                                         TRACE_CONN_CLOSE("%s",
483                                                 "Pending wait time expired");
484                                         pending_reported = 1;
485                                 }
486                                 free_orphaned_pending_commands(conn);
487                         }
488                 }
489
490                 iscsi_make_conn_wr_active(conn);
491
492                 /* That's for active close only, actually */
493                 if (time_after(jiffies, start_waiting + CONN_WAIT_TIMEOUT) &&
494                     !wait_expired) {
495                         TRACE_CONN_CLOSE("Wait time expired (conn %p, "
496                                 "sk_state %d)",
497                                 conn, conn->sock->sk->sk_state);
498                         conn->sock->ops->shutdown(conn->sock, SEND_SHUTDOWN);
499                         wait_expired = 1;
500                         shut_start_waiting = jiffies;
501                 }
502
503                 if (wait_expired && !shut_expired &&
504                     time_after(jiffies, shut_start_waiting +
505                                 conn->deleting ? CONN_DEL_SHUT_TIMEOUT :
506                                                  CONN_REG_SHUT_TIMEOUT)) {
507                         TRACE_CONN_CLOSE("Wait time after shutdown expired "
508                                 "(conn %p, sk_state %d)", conn,
509                                 conn->sock->sk->sk_state);
510                         conn->sock->sk->sk_prot->disconnect(conn->sock->sk, 0);
511                         shut_expired = 1;
512                 }
513
514                 if (conn->deleting)
515                         msleep(200);
516                 else
517                         msleep(1000);
518
519                 TRACE_CONN_CLOSE_DBG("conn %p, conn_ref_cnt %d left, "
520                         "wr_state %d, exp_cmd_sn %u",
521                         conn, atomic_read(&conn->conn_ref_cnt),
522                         conn->wr_state, session->exp_cmd_sn);
523
524                 trace_conn_close(conn);
525
526                 iscsi_check_closewait(conn);
527         }
528
529         write_lock_bh(&conn->sock->sk->sk_callback_lock);
530         conn->sock->sk->sk_state_change = conn->old_state_change;
531         conn->sock->sk->sk_data_ready = conn->old_data_ready;
532         conn->sock->sk->sk_write_space = conn->old_write_space;
533         write_unlock_bh(&conn->sock->sk->sk_callback_lock);
534
535         while (1) {
536                 bool t;
537
538                 spin_lock_bh(&iscsi_wr_lock);
539                 t = (conn->wr_state == ISCSI_CONN_WR_STATE_IDLE);
540                 spin_unlock_bh(&iscsi_wr_lock);
541
542                 if (t && (atomic_read(&conn->conn_ref_cnt) == 0))
543                         break;
544
545                 TRACE_CONN_CLOSE_DBG("Waiting for wr thread (conn %p), "
546                         "wr_state %x", conn, conn->wr_state);
547                 msleep(50);
548         }
549
550         wait_for_completion(&conn->ready_to_free);
551
552         TRACE_CONN_CLOSE("Notifying user space about closing connection %p",
553                          conn);
554         event_send(target->tid, session->sid, conn->cid, E_CONN_CLOSE);
555
556 #ifdef CONFIG_SCST_PROC
557         mutex_lock(&target->target_mutex);
558         conn_free(conn);
559         mutex_unlock(&target->target_mutex);
560 #else
561         kobject_put(&conn->iscsi_conn_kobj);
562 #endif
563
564         TRACE_EXIT();
565         return;
566 }
567
/*
 * Kthread entry point wrapping close_conn(); arg is the iscsi_conn to
 * tear down. Always returns 0.
 */
static int close_conn_thr(void *arg)
{
	struct iscsi_conn *conn = arg;

	TRACE_ENTRY();

#ifdef CONFIG_SCST_EXTRACHECKS
	/*
	 * To satisfy iscsi_extracheck_is_rd_thread() in functions called
	 * on the connection close. It is safe, because at this point conn
	 * can't be used by any other thread.
	 */
	conn->rd_task = current;
#endif

	close_conn(conn);

	TRACE_EXIT();
	return 0;
}
587
588 /* No locks */
589 static void start_close_conn(struct iscsi_conn *conn)
590 {
591         struct task_struct *t;
592
593         TRACE_ENTRY();
594
595         t = kthread_run(close_conn_thr, conn, "iscsi_conn_cleanup");
596         if (IS_ERR(t)) {
597                 PRINT_ERROR("kthread_run() failed (%ld), closing conn %p "
598                         "directly", PTR_ERR(t), conn);
599                 close_conn(conn);
600         }
601
602         TRACE_EXIT();
603         return;
604 }
605
606 static inline void iscsi_conn_init_read(struct iscsi_conn *conn,
607         void __user *data, size_t len)
608 {
609         conn->read_iov[0].iov_base = data;
610         conn->read_iov[0].iov_len = len;
611         conn->read_msg.msg_iov = conn->read_iov;
612         conn->read_msg.msg_iovlen = 1;
613         conn->read_size = len;
614         return;
615 }
616
617 static void iscsi_conn_prepare_read_ahs(struct iscsi_conn *conn,
618         struct iscsi_cmnd *cmnd)
619 {
620         int asize = (cmnd->pdu.ahssize + 3) & -4;
621
622         /* ToDo: __GFP_NOFAIL ?? */
623         cmnd->pdu.ahs = kmalloc(asize, __GFP_NOFAIL|GFP_KERNEL);
624         sBUG_ON(cmnd->pdu.ahs == NULL);
625         iscsi_conn_init_read(conn, (void __force __user *)cmnd->pdu.ahs, asize);
626         return;
627 }
628
629 static struct iscsi_cmnd *iscsi_get_send_cmnd(struct iscsi_conn *conn)
630 {
631         struct iscsi_cmnd *cmnd = NULL;
632
633         spin_lock_bh(&conn->write_list_lock);
634         if (!list_empty(&conn->write_list)) {
635                 cmnd = list_entry(conn->write_list.next, struct iscsi_cmnd,
636                                 write_list_entry);
637                 cmd_del_from_write_list(cmnd);
638                 cmnd->write_processing_started = 1;
639         }
640         spin_unlock_bh(&conn->write_list_lock);
641
642         if (unlikely(test_bit(ISCSI_CMD_ABORTED,
643                         &cmnd->parent_req->prelim_compl_flags))) {
644                 TRACE_MGMT_DBG("Going to send acmd %p (scst cmd %p, "
645                         "state %d, parent_req %p)", cmnd, cmnd->scst_cmd,
646                         cmnd->scst_state, cmnd->parent_req);
647         }
648
649         if (unlikely(cmnd_opcode(cmnd) == ISCSI_OP_SCSI_TASK_MGT_RSP)) {
650                 struct iscsi_task_mgt_hdr *req_hdr =
651                         (struct iscsi_task_mgt_hdr *)&cmnd->parent_req->pdu.bhs;
652                 struct iscsi_task_rsp_hdr *rsp_hdr =
653                         (struct iscsi_task_rsp_hdr *)&cmnd->pdu.bhs;
654                 TRACE_MGMT_DBG("Going to send TM response %p (status %d, "
655                         "fn %d, parent_req %p)", cmnd, rsp_hdr->response,
656                         req_hdr->function & ISCSI_FUNCTION_MASK,
657                         cmnd->parent_req);
658         }
659
660         return cmnd;
661 }
662
/* Returns number of bytes left to receive or <0 for error */
/*
 * Receive into the iov previously set up by iscsi_conn_init_read() (or
 * by the data-receive setup code). Non-blocking: issues one
 * sock_recvmsg() with MSG_DONTWAIT and returns the remaining byte count
 * (0 means the current read is complete), retrying only on
 * -ERESTARTSYS. Any other failure marks the connection closed.
 */
static int do_recv(struct iscsi_conn *conn)
{
	int res;
	mm_segment_t oldfs;
	struct msghdr msg;
	int first_len;

	EXTRACHECKS_BUG_ON(conn->read_cmnd == NULL);

	if (unlikely(conn->closing)) {
		res = -EIO;
		goto out;
	}

	/*
	 * We suppose that if sock_recvmsg() returned less data than requested,
	 * then next time it will return -EAGAIN, so there's no point to call
	 * it again.
	 */

restart:
	/*
	 * Use a scratch msghdr: sock_recvmsg() modifies it, while
	 * conn->read_msg must keep tracking overall progress.
	 */
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = conn->read_msg.msg_iov;
	msg.msg_iovlen = conn->read_msg.msg_iovlen;
	first_len = msg.msg_iov->iov_len;

	/* The iov points at kernel memory, so lift the user-space check */
	oldfs = get_fs();
	set_fs(get_ds());
	res = sock_recvmsg(conn->sock, &msg, conn->read_size,
			   MSG_DONTWAIT | MSG_NOSIGNAL);
	set_fs(oldfs);

	if (res > 0) {
		/*
		 * To save some considerable effort and CPU power we
		 * suppose that TCP functions adjust
		 * conn->read_msg.msg_iov and conn->read_msg.msg_iovlen
		 * on amount of copied data. This BUG_ON is intended
		 * to catch if it is changed in the future.
		 */
		sBUG_ON((res >= first_len) &&
			(conn->read_msg.msg_iov->iov_len != 0));
		conn->read_size -= res;
		if (conn->read_size != 0) {
			if (res >= first_len) {
				/*
				 * Advance past fully-consumed iov entries.
				 * NOTE(review): this assumes every entry
				 * after the first covers exactly one page
				 * (PAGE_SHIFT) — confirm against the data
				 * iov setup.
				 */
				int done = 1 + ((res - first_len) >> PAGE_SHIFT);
				conn->read_msg.msg_iov += done;
				conn->read_msg.msg_iovlen -= done;
			}
		}
		res = conn->read_size;
	} else {
		switch (res) {
		case -EAGAIN:
			/* No data available right now; not an error */
			TRACE_DBG("EAGAIN received for conn %p", conn);
			res = conn->read_size;
			break;
		case -ERESTARTSYS:
			TRACE_DBG("ERESTARTSYS received for conn %p", conn);
			goto restart;
		default:
			/* Includes res == 0, i.e. peer closed the socket */
			if (!conn->closing) {
				PRINT_ERROR("sock_recvmsg() failed: %d", res);
				mark_conn_closed(conn);
			}
			if (res == 0)
				res = -EIO;
			break;
		}
	}

out:
	TRACE_EXIT_RES(res);
	return res;
}
739
/*
 * Receive and verify the data digest of the PDU being read.
 *
 * Once all digest bytes are in (do_recv() returned 0):
 *  - payloads <= 16 KB: verify inline while the data is still cache-hot;
 *  - larger SCSI commands: defer verification via the per-cmd RX ddigest
 *    list (an extra reference is taken for that);
 *  - anything else that is not Data-Out (i.e. NOP-Out): verify inline.
 * A digest mismatch closes the connection.
 *
 * Returns 0 on success, bytes still to receive, or <0 on error.
 */
static int iscsi_rx_check_ddigest(struct iscsi_conn *conn)
{
	struct iscsi_cmnd *cmnd = conn->read_cmnd;
	int res;

	res = do_recv(conn);
	if (res == 0) {
		conn->read_state = RX_END;

		if (cmnd->pdu.datasize <= 16*1024) {
			/*
			 * It's cache hot, so let's compute it inline. The
			 * choice here about what will expose more latency:
			 * possible cache misses or the digest calculation.
			 */
			TRACE_DBG("cmnd %p, opcode %x: checking RX "
				"ddigest inline", cmnd, cmnd_opcode(cmnd));
			cmnd->ddigest_checked = 1;
			res = digest_rx_data(cmnd);
			if (unlikely(res != 0)) {
				mark_conn_closed(conn);
				goto out;
			}
		} else if (cmnd_opcode(cmnd) == ISCSI_OP_SCSI_CMD) {
			/* Defer: verified later off the RX ddigest list */
			cmd_add_on_rx_ddigest_list(cmnd, cmnd);
			cmnd_get(cmnd);
		} else if (cmnd_opcode(cmnd) != ISCSI_OP_SCSI_DATA_OUT) {
			/*
			 * We could get here only for NOP-Out. ISCSI RFC
			 * doesn't specify how to deal with digest errors in
			 * this case. Is closing connection correct?
			 */
			TRACE_DBG("cmnd %p, opcode %x: checking NOP RX "
				"ddigest", cmnd, cmnd_opcode(cmnd));
			res = digest_rx_data(cmnd);
			if (unlikely(res != 0)) {
				mark_conn_closed(conn);
				goto out;
			}
		}
	}

out:
	return res;
}
785
/*
 * RX state machine for one connection.  No locks needed: conn is owned
 * by the read thread while it is in the RD_STATE_PROCESSING state.
 *
 * Returns the last handler result: 0 after one complete PDU was
 * processed (RX_END reached), >0 when the socket ran dry mid-PDU, <0 on
 * error.  Sets *closed if connection close was started here.
 */
static int process_read_io(struct iscsi_conn *conn, int *closed)
{
	struct iscsi_cmnd *cmnd = conn->read_cmnd;
	int res;

	TRACE_ENTRY();

	/* In case of error cmnd will be freed in close_conn() */

	do {
		switch (conn->read_state) {
		case RX_INIT_BHS:
			/* New PDU: allocate a command and read its BHS. */
			EXTRACHECKS_BUG_ON(conn->read_cmnd != NULL);
			cmnd = cmnd_alloc(conn, NULL);
			conn->read_cmnd = cmnd;
			iscsi_conn_init_read(cmnd->conn,
				(void __force __user *)&cmnd->pdu.bhs,
				sizeof(cmnd->pdu.bhs));
			conn->read_state = RX_BHS;
			/* go through */

		case RX_BHS:
			res = do_recv(conn);
			if (res == 0) {
				iscsi_cmnd_get_length(&cmnd->pdu);
				if (cmnd->pdu.ahssize == 0) {
					if ((conn->hdigest_type & DIGEST_NONE) == 0)
						conn->read_state = RX_INIT_HDIGEST;
					else
						conn->read_state = RX_CMD_START;
				} else {
					iscsi_conn_prepare_read_ahs(conn, cmnd);
					conn->read_state = RX_AHS;
				}
			}
			break;

		case RX_CMD_START:
			res = cmnd_rx_start(cmnd);
			if (res == 0) {
				if (cmnd->pdu.datasize == 0)
					conn->read_state = RX_END;
				else
					conn->read_state = RX_DATA;
			} else if (res > 0)
				/* Command needs more processing before data RX. */
				conn->read_state = RX_CMD_CONTINUE;
			else
				sBUG_ON(!conn->closing);
			break;

		case RX_CMD_CONTINUE:
			if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
				TRACE_DBG("cmnd %p is still in RX_CMD state",
					cmnd);
				res = 1;
				break;
			}
			res = cmnd_rx_continue(cmnd);
			if (unlikely(res != 0))
				sBUG_ON(!conn->closing);
			else {
				if (cmnd->pdu.datasize == 0)
					conn->read_state = RX_END;
				else
					conn->read_state = RX_DATA;
			}
			break;

		case RX_DATA:
			res = do_recv(conn);
			if (res == 0) {
				/* PDU data is padded to a 4-byte boundary. */
				int psz = ((cmnd->pdu.datasize + 3) & -4) - cmnd->pdu.datasize;
				if (psz != 0) {
					TRACE_DBG("padding %d bytes", psz);
					iscsi_conn_init_read(conn,
						(void __force __user *)&conn->rpadding, psz);
					conn->read_state = RX_PADDING;
				} else if ((conn->ddigest_type & DIGEST_NONE) != 0)
					conn->read_state = RX_END;
				else
					conn->read_state = RX_INIT_DDIGEST;
			}
			break;

		case RX_END:
			if (unlikely(conn->read_size != 0)) {
				PRINT_CRIT_ERROR("conn read_size !=0 on RX_END "
					"(conn %p, op %x, read_size %d)", conn,
					cmnd_opcode(cmnd), conn->read_size);
				sBUG();
			}
			/* Detach the command before handing it off. */
			conn->read_cmnd = NULL;
			conn->read_state = RX_INIT_BHS;

			cmnd_rx_end(cmnd);

			EXTRACHECKS_BUG_ON(conn->read_size != 0);

			/*
			 * To maintain fairness. Res must be 0 here anyway, the
			 * assignment is only to remove compiler warning about
			 * uninitialized variable.
			 */
			res = 0;
			goto out;

		case RX_INIT_HDIGEST:
			iscsi_conn_init_read(conn,
				(void __force __user *)&cmnd->hdigest, sizeof(u32));
			conn->read_state = RX_CHECK_HDIGEST;
			/* go through */

		case RX_CHECK_HDIGEST:
			res = do_recv(conn);
			if (res == 0) {
				res = digest_rx_header(cmnd);
				if (unlikely(res != 0)) {
					PRINT_ERROR("rx header digest for "
						"initiator %s failed (%d)",
						conn->session->initiator_name,
						res);
					mark_conn_closed(conn);
				} else
					conn->read_state = RX_CMD_START;
			}
			break;

		case RX_INIT_DDIGEST:
			iscsi_conn_init_read(conn,
				(void __force __user *)&cmnd->ddigest,
				sizeof(u32));
			conn->read_state = RX_CHECK_DDIGEST;
			/* go through */

		case RX_CHECK_DDIGEST:
			res = iscsi_rx_check_ddigest(conn);
			break;

		case RX_AHS:
			res = do_recv(conn);
			if (res == 0) {
				if ((conn->hdigest_type & DIGEST_NONE) == 0)
					conn->read_state = RX_INIT_HDIGEST;
				else
					conn->read_state = RX_CMD_START;
			}
			break;

		case RX_PADDING:
			res = do_recv(conn);
			if (res == 0) {
				if ((conn->ddigest_type & DIGEST_NONE) == 0)
					conn->read_state = RX_INIT_DDIGEST;
				else
					conn->read_state = RX_END;
			}
			break;

		default:
			PRINT_CRIT_ERROR("%d %x", conn->read_state, cmnd_opcode(cmnd));
			res = -1; /* to keep compiler happy */
			sBUG();
		}
	} while (res == 0);

	if (unlikely(conn->closing)) {
		start_close_conn(conn);
		*closed = 1;
	}

out:
	TRACE_EXIT_RES(res);
	return res;
}
961
/*
 * Drain iscsi_rd_list, running the RX state machine for each queued
 * connection with iscsi_rd_lock dropped around the actual socket I/O.
 *
 * Called under iscsi_rd_lock and BHs disabled, but will drop it inside,
 * then reaquire.
 */
static void scst_do_job_rd(void)
	__acquires(&iscsi_rd_lock)
	__releases(&iscsi_rd_lock)
{
	TRACE_ENTRY();

	/*
	 * We delete/add to tail connections to maintain fairness between them.
	 */

	while (!list_empty(&iscsi_rd_list)) {
		int closed = 0, rc;
		struct iscsi_conn *conn = list_entry(iscsi_rd_list.next,
			typeof(*conn), rd_list_entry);

		list_del(&conn->rd_list_entry);

		sBUG_ON(conn->rd_state == ISCSI_CONN_RD_STATE_PROCESSING);
		conn->rd_data_ready = 0;
		/* Mark the connection as owned by this thread. */
		conn->rd_state = ISCSI_CONN_RD_STATE_PROCESSING;
#ifdef CONFIG_SCST_EXTRACHECKS
		conn->rd_task = current;
#endif
		spin_unlock_bh(&iscsi_rd_lock);

		rc = process_read_io(conn, &closed);

		spin_lock_bh(&iscsi_rd_lock);

		if (unlikely(closed))
			continue;

		if (unlikely(conn->conn_tm_active)) {
			/* TM in progress: check data-wait timeouts unlocked. */
			spin_unlock_bh(&iscsi_rd_lock);
			iscsi_check_tm_data_wait_timeouts(conn, false);
			spin_lock_bh(&iscsi_rd_lock);
		}

#ifdef CONFIG_SCST_EXTRACHECKS
		conn->rd_task = NULL;
#endif
		/* Requeue if a full PDU was done (rc == 0) or more data arrived. */
		if ((rc == 0) || conn->rd_data_ready) {
			list_add_tail(&conn->rd_list_entry, &iscsi_rd_list);
			conn->rd_state = ISCSI_CONN_RD_STATE_IN_LIST;
		} else
			conn->rd_state = ISCSI_CONN_RD_STATE_IDLE;
	}

	TRACE_EXIT();
	return;
}
1017
1018 static inline int test_rd_list(void)
1019 {
1020         int res = !list_empty(&iscsi_rd_list) ||
1021                   unlikely(kthread_should_stop());
1022         return res;
1023 }
1024
/*
 * Main loop of a read (RX) kernel thread.  Sleeps on iscsi_rd_waitQ
 * until a connection appears on iscsi_rd_list, then processes the list
 * via scst_do_job_rd().  Runs until kthread_should_stop().
 */
int istrd(void *arg)
{
	TRACE_ENTRY();

	PRINT_INFO("Read thread started, PID %d", current->pid);

	current->flags |= PF_NOFREEZE;

	spin_lock_bh(&iscsi_rd_lock);
	while (!kthread_should_stop()) {
		wait_queue_t wait;
		init_waitqueue_entry(&wait, current);

		if (!test_rd_list()) {
			/*
			 * Standard wait loop: re-check the condition after
			 * setting TASK_INTERRUPTIBLE to avoid lost wakeups.
			 */
			add_wait_queue_exclusive_head(&iscsi_rd_waitQ, &wait);
			for (;;) {
				set_current_state(TASK_INTERRUPTIBLE);
				if (test_rd_list())
					break;
				spin_unlock_bh(&iscsi_rd_lock);
				schedule();
				spin_lock_bh(&iscsi_rd_lock);
			}
			set_current_state(TASK_RUNNING);
			remove_wait_queue(&iscsi_rd_waitQ, &wait);
		}
		scst_do_job_rd();
	}
	spin_unlock_bh(&iscsi_rd_lock);

	/*
	 * If kthread_should_stop() is true, we are guaranteed to be
	 * on the module unload, so iscsi_rd_list must be empty.
	 */
	sBUG_ON(!list_empty(&iscsi_rd_list));

	PRINT_INFO("Read thread PID %d finished", current->pid);

	TRACE_EXIT();
	return 0;
}
1066
1067 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1068 static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd)
1069 {
1070         int v;
1071
1072         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d",
1073                 cmd, atomic_read(&cmd->net_ref_cnt)+1);
1074
1075         v = atomic_inc_return(&cmd->net_ref_cnt);
1076         if (v == 1) {
1077                 TRACE_NET_PAGE("getting cmd %p", cmd);
1078                 cmnd_get(cmd);
1079         }
1080         return;
1081 }
1082
1083 void iscsi_get_page_callback(struct page *page)
1084 {
1085         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1086
1087         TRACE_NET_PAGE("page %p, _count %d", page,
1088                 atomic_read(&page->_count));
1089
1090         __iscsi_get_page_callback(cmd);
1091         return;
1092 }
1093
1094 static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd)
1095 {
1096         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d", cmd,
1097                 atomic_read(&cmd->net_ref_cnt)-1);
1098
1099         if (atomic_dec_and_test(&cmd->net_ref_cnt)) {
1100                 int i, sg_cnt = cmd->sg_cnt;
1101                 for (i = 0; i < sg_cnt; i++) {
1102                         struct page *page = sg_page(&cmd->sg[i]);
1103                         TRACE_NET_PAGE("Clearing page %p", page);
1104                         if (page->net_priv == cmd)
1105                                 page->net_priv = NULL;
1106                 }
1107                 cmnd_put(cmd);
1108         }
1109         return;
1110 }
1111
1112 void iscsi_put_page_callback(struct page *page)
1113 {
1114         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1115
1116         TRACE_NET_PAGE("page %p, _count %d", page,
1117                 atomic_read(&page->_count));
1118
1119         __iscsi_put_page_callback(cmd);
1120         return;
1121 }
1122
1123 static void check_net_priv(struct iscsi_cmnd *cmd, struct page *page)
1124 {
1125         if ((atomic_read(&cmd->net_ref_cnt) == 1) && (page->net_priv == cmd)) {
1126                 TRACE_DBG("sendpage() not called get_page(), zeroing net_priv "
1127                         "%p (page %p)", page->net_priv, page);
1128                 page->net_priv = NULL;
1129         }
1130         return;
1131 }
#else
/* Zero-copy completion notification disabled: these hooks are no-ops. */
static inline void check_net_priv(struct iscsi_cmnd *cmd, struct page *page) {}
static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd) {}
static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd) {}
#endif
1137
/*
 * Add @req to its connection's write timeout list and (re)arm the
 * response timer.  Aborted commands and connections with active task
 * management use the shorter ISCSI_TM_DATA_WAIT_SCHED_TIMEOUT; normal
 * responses use ISCSI_RSP_SCHED_TIMEOUT.  No-op if @req is already on
 * the list.
 */
void req_add_to_write_timeout_list(struct iscsi_cmnd *req)
{
	struct iscsi_conn *conn;
	unsigned long timeout_time;
	bool set_conn_tm_active = false;

	TRACE_ENTRY();

	if (req->on_write_timeout_list)
		goto out;

	conn = req->conn;

	TRACE_DBG("Adding req %p to conn %p write_timeout_list",
		req, conn);

	spin_lock_bh(&conn->write_list_lock);

	req->on_write_timeout_list = 1;
	req->write_start = jiffies;
	list_add_tail(&req->write_timeout_list_entry,
		&conn->write_timeout_list);

	if (!timer_pending(&conn->rsp_timer)) {
		/* Timer idle: start it for this request. */
		if (unlikely(conn->conn_tm_active ||
			     test_bit(ISCSI_CMD_ABORTED,
					&req->prelim_compl_flags))) {
			set_conn_tm_active = true;
			timeout_time = req->write_start +
				ISCSI_TM_DATA_WAIT_SCHED_TIMEOUT;
		} else
			timeout_time = req->write_start +
				ISCSI_RSP_SCHED_TIMEOUT;

		TRACE_DBG("Starting timer on %ld (con %p, write_start %ld)",
			timeout_time, conn, req->write_start);

		conn->rsp_timer.expires = timeout_time;
		add_timer(&conn->rsp_timer);
	} else if (unlikely(test_bit(ISCSI_CMD_ABORTED,
				&req->prelim_compl_flags))) {
		/* Timer already armed: pull it in for the aborted command. */
		unsigned long timeout_time = jiffies +
					ISCSI_TM_DATA_WAIT_SCHED_TIMEOUT;
		set_conn_tm_active = true;
		if (time_after(conn->rsp_timer.expires, timeout_time)) {
			TRACE_MGMT_DBG("Mod timer on %ld (conn %p)",
				timeout_time, conn);
			mod_timer(&conn->rsp_timer, timeout_time);
		}
	}

	spin_unlock_bh(&conn->write_list_lock);

	/*
	 * conn_tm_active can be already cleared by
	 * iscsi_check_tm_data_wait_timeouts(). write_list_lock is an inner
	 * lock for iscsi_rd_lock.
	 */
	if (unlikely(set_conn_tm_active)) {
		spin_lock_bh(&iscsi_rd_lock);
		TRACE_MGMT_DBG("Setting conn_tm_active for conn %p", conn);
		conn->conn_tm_active = 1;
		spin_unlock_bh(&iscsi_rd_lock);
	}

out:
	TRACE_EXIT();
	return;
}
1207
/*
 * Send the next chunk of conn->write_cmnd to the socket.
 *
 * Phase 1 flushes any pending header iovecs (conn->write_iop) via
 * vfs_writev(); phase 2 transfers the data pages with sendpage()
 * (zero-copy where the kernel supports completion notification,
 * sock_no_sendpage otherwise).  Partial sends update conn->write_iop/
 * write_offset/write_size so the transfer can be resumed later.
 *
 * Returns the number of bytes written, 0 if nothing could be written
 * (e.g. -EAGAIN with no progress), or a negative error code; on fatal
 * errors the delivery status of the referenced command/AEN is set to
 * failed.
 */
static int write_data(struct iscsi_conn *conn)
{
	mm_segment_t oldfs;
	struct file *file;
	struct iovec *iop;
	struct socket *sock;
	ssize_t (*sock_sendpage)(struct socket *, struct page *, int, size_t,
				 int);
	ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
	struct iscsi_cmnd *write_cmnd = conn->write_cmnd;
	struct iscsi_cmnd *ref_cmd;
	struct page *page;
	struct scatterlist *sg;
	int saved_size, size, sendsize;
	int length, offset, idx;
	int flags, res, count, sg_size;
	bool do_put = false, ref_cmd_to_parent;

	TRACE_ENTRY();

	iscsi_extracheck_is_wr_thread(conn);

	/* ref_cmd is the command that actually owns the sg being sent. */
	if (!write_cmnd->own_sg) {
		ref_cmd = write_cmnd->parent_req;
		ref_cmd_to_parent = true;
	} else {
		ref_cmd = write_cmnd;
		ref_cmd_to_parent = false;
	}

	req_add_to_write_timeout_list(write_cmnd->parent_req);

	file = conn->file;
	size = conn->write_size;
	saved_size = size;
	iop = conn->write_iop;
	count = conn->write_iop_used;

	/* Phase 1: flush pending header iovecs, resuming partial writes. */
	if (iop) {
		while (1) {
			loff_t off = 0;
			int rest;

			sBUG_ON(count > (signed)(sizeof(conn->write_iov) /
						sizeof(conn->write_iov[0])));
retry:
			oldfs = get_fs();
			set_fs(KERNEL_DS);
			res = vfs_writev(file,
					 (struct iovec __force __user *)iop,
					 count, &off);
			set_fs(oldfs);
			TRACE_WRITE("sid %#Lx, cid %u, res %d, iov_len %ld",
				    (long long unsigned int)conn->session->sid,
				    conn->cid, res, (long)iop->iov_len);
			if (unlikely(res <= 0)) {
				if (res == -EAGAIN) {
					/* Save position for the next call. */
					conn->write_iop = iop;
					conn->write_iop_used = count;
					goto out_iov;
				} else if (res == -EINTR)
					goto retry;
				goto out_err;
			}

			/* Advance past fully-written iovec entries. */
			rest = res;
			size -= res;
			while ((typeof(rest))iop->iov_len <= rest && rest) {
				rest -= iop->iov_len;
				iop++;
				count--;
			}
			if (count == 0) {
				conn->write_iop = NULL;
				conn->write_iop_used = 0;
				if (size)
					break;
				goto out_iov;
			}
			sBUG_ON(iop > conn->write_iov + sizeof(conn->write_iov)
						  /sizeof(conn->write_iov[0]));
			iop->iov_base += rest;
			iop->iov_len -= rest;
		}
	}

	/* Phase 2: send the data pages. */
	sg = write_cmnd->sg;
	if (unlikely(sg == NULL)) {
		PRINT_INFO("WARNING: Data missed (cmd %p)!", write_cmnd);
		res = 0;
		goto out;
	}

	/* To protect from too early transfer completion race */
	__iscsi_get_page_callback(ref_cmd);
	do_put = true;

	sock = conn->sock;

#if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
	sock_sendpage = sock->ops->sendpage;
#else
	if ((write_cmnd->parent_req->scst_cmd != NULL) &&
	    scst_cmd_get_dh_data_buff_alloced(write_cmnd->parent_req->scst_cmd))
		sock_sendpage = sock_no_sendpage;
	else
		sock_sendpage = sock->ops->sendpage;
#endif

	flags = MSG_DONTWAIT;
	sg_size = size;

	/* Locate the page/offset to resume from within the sg list. */
	if (sg != write_cmnd->rsp_sg) {
		offset = conn->write_offset + sg[0].offset;
		idx = offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
		length = min(size, (int)PAGE_SIZE - offset);
		TRACE_WRITE("write_offset %d, sg_size %d, idx %d, offset %d, "
			"length %d", conn->write_offset, sg_size, idx, offset,
			length);
	} else {
		/* rsp_sg entries can have arbitrary lengths: walk them. */
		idx = 0;
		offset = conn->write_offset;
		while (offset >= sg[idx].length) {
			offset -= sg[idx].length;
			idx++;
		}
		length = sg[idx].length - offset;
		offset += sg[idx].offset;
		sock_sendpage = sock_no_sendpage;
		TRACE_WRITE("rsp_sg: write_offset %d, sg_size %d, idx %d, "
			"offset %d, length %d", conn->write_offset, sg_size,
			idx, offset, length);
	}
	page = sg_page(&sg[idx]);

	while (1) {
		sendpage = sock_sendpage;

#if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
		{
			static DEFINE_SPINLOCK(net_priv_lock);
			spin_lock(&net_priv_lock);
			if (unlikely(page->net_priv != NULL)) {
				if (page->net_priv != ref_cmd) {
					/*
					 * This might happen if user space
					 * supplies to scst_user the same
					 * pages in different commands or in
					 * case of zero-copy FILEIO, when
					 * several initiators request the same
					 * data simultaneously.
					 */
					TRACE_DBG("net_priv isn't NULL and != "
					    "ref_cmd (write_cmnd %p, ref_cmd "
					    "%p, sg %p, idx %d, page %p, "
					    "net_priv %p)",
					    write_cmnd, ref_cmd, sg, idx,
					    page, page->net_priv);
					sendpage = sock_no_sendpage;
				}
			} else
				page->net_priv = ref_cmd;
			spin_unlock(&net_priv_lock);
		}
#endif
		sendsize = min(size, length);
		if (size <= sendsize) {
			/* Last chunk: send without MSG_MORE. */
retry2:
			res = sendpage(sock, page, offset, size, flags);
			TRACE_WRITE("Final %s sid %#Lx, cid %u, res %d (page "
				"index %lu, offset %u, size %u, cmd %p, "
				"page %p)", (sendpage != sock_no_sendpage) ?
						"sendpage" : "sock_no_sendpage",
				(long long unsigned int)conn->session->sid,
				conn->cid, res, page->index,
				offset, size, write_cmnd, page);
			if (unlikely(res <= 0)) {
				if (res == -EINTR)
					goto retry2;
				else
					goto out_res;
			}

			check_net_priv(ref_cmd, page);
			if (res == size) {
				conn->write_size = 0;
				res = saved_size;
				goto out_put;
			}

			offset += res;
			size -= res;
			goto retry2;
		}

retry1:
		res = sendpage(sock, page, offset, sendsize, flags | MSG_MORE);
		TRACE_WRITE("%s sid %#Lx, cid %u, res %d (page index %lu, "
			"offset %u, sendsize %u, size %u, cmd %p, page %p)",
			(sendpage != sock_no_sendpage) ? "sendpage" :
							 "sock_no_sendpage",
			(unsigned long long)conn->session->sid, conn->cid,
			res, page->index, offset, sendsize, size,
			write_cmnd, page);
		if (unlikely(res <= 0)) {
			if (res == -EINTR)
				goto retry1;
			else
				goto out_res;
		}

		check_net_priv(ref_cmd, page);

		size -= res;

		if (res == sendsize) {
			/* Current sg entry fully sent; move to the next one. */
			idx++;
			EXTRACHECKS_BUG_ON(idx >= ref_cmd->sg_cnt);
			page = sg_page(&sg[idx]);
			length = sg[idx].length;
			offset = sg[idx].offset;
		} else {
			offset += res;
			sendsize -= res;
			goto retry1;
		}
	}

out_off:
	conn->write_offset += sg_size - size;

out_iov:
	conn->write_size = size;
	if ((saved_size == size) && res == -EAGAIN)
		goto out_put;

	res = saved_size - size;

out_put:
	if (do_put)
		__iscsi_put_page_callback(ref_cmd);

out:
	TRACE_EXIT_RES(res);
	return res;

out_res:
	check_net_priv(ref_cmd, page);
	if (res == -EAGAIN)
		goto out_off;
	/* else go through */

out_err:
#ifndef CONFIG_SCST_DEBUG
	if (!conn->closing)
#endif
	{
		PRINT_ERROR("error %d at sid:cid %#Lx:%u, cmnd %p", res,
			    (long long unsigned int)conn->session->sid,
			    conn->cid, conn->write_cmnd);
	}
	if (ref_cmd_to_parent &&
	    ((ref_cmd->scst_cmd != NULL) || (ref_cmd->scst_aen != NULL))) {
		if (ref_cmd->scst_state == ISCSI_CMD_STATE_AEN)
			scst_set_aen_delivery_status(ref_cmd->scst_aen,
				SCST_AEN_RES_FAILED);
		else
			scst_set_delivery_status(ref_cmd->scst_cmd,
				SCST_CMD_DELIVERY_FAILED);
	}
	goto out_put;
}
1481
1482 static int exit_tx(struct iscsi_conn *conn, int res)
1483 {
1484         iscsi_extracheck_is_wr_thread(conn);
1485
1486         switch (res) {
1487         case -EAGAIN:
1488         case -ERESTARTSYS:
1489                 res = 0;
1490                 break;
1491         default:
1492 #ifndef CONFIG_SCST_DEBUG
1493                 if (!conn->closing)
1494 #endif
1495                 {
1496                         PRINT_ERROR("Sending data failed: initiator %s, "
1497                                 "write_size %d, write_state %d, res %d",
1498                                 conn->session->initiator_name,
1499                                 conn->write_size,
1500                                 conn->write_state, res);
1501                 }
1502                 conn->write_state = TX_END;
1503                 conn->write_size = 0;
1504                 mark_conn_closed(conn);
1505                 break;
1506         }
1507         return res;
1508 }
1509
1510 static int tx_ddigest(struct iscsi_cmnd *cmnd, int state)
1511 {
1512         int res, rest = cmnd->conn->write_size;
1513         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1514         struct kvec iov;
1515
1516         iscsi_extracheck_is_wr_thread(cmnd->conn);
1517
1518         TRACE_DBG("Sending data digest %x (cmd %p)", cmnd->ddigest, cmnd);
1519
1520         iov.iov_base = (char *)(&cmnd->ddigest) + (sizeof(u32) - rest);
1521         iov.iov_len = rest;
1522
1523         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1524         if (res > 0) {
1525                 cmnd->conn->write_size -= res;
1526                 if (!cmnd->conn->write_size)
1527                         cmnd->conn->write_state = state;
1528         } else
1529                 res = exit_tx(cmnd->conn, res);
1530
1531         return res;
1532 }
1533
1534 static void init_tx_hdigest(struct iscsi_cmnd *cmnd)
1535 {
1536         struct iscsi_conn *conn = cmnd->conn;
1537         struct iovec *iop;
1538
1539         iscsi_extracheck_is_wr_thread(conn);
1540
1541         digest_tx_header(cmnd);
1542
1543         sBUG_ON(conn->write_iop_used >=
1544                 (signed)(sizeof(conn->write_iov)/sizeof(conn->write_iov[0])));
1545
1546         iop = &conn->write_iop[conn->write_iop_used];
1547         conn->write_iop_used++;
1548         iop->iov_base = (void __force __user *)&(cmnd->hdigest);
1549         iop->iov_len = sizeof(u32);
1550         conn->write_size += sizeof(u32);
1551
1552         return;
1553 }
1554
1555 static int tx_padding(struct iscsi_cmnd *cmnd, int state)
1556 {
1557         int res, rest = cmnd->conn->write_size;
1558         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1559         struct kvec iov;
1560         static const uint32_t padding;
1561
1562         iscsi_extracheck_is_wr_thread(cmnd->conn);
1563
1564         TRACE_DBG("Sending %d padding bytes (cmd %p)", rest, cmnd);
1565
1566         iov.iov_base = (char *)(&padding) + (sizeof(uint32_t) - rest);
1567         iov.iov_len = rest;
1568
1569         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1570         if (res > 0) {
1571                 cmnd->conn->write_size -= res;
1572                 if (!cmnd->conn->write_size)
1573                         cmnd->conn->write_state = state;
1574         } else
1575                 res = exit_tx(cmnd->conn, res);
1576
1577         return res;
1578 }
1579
1580 static int iscsi_do_send(struct iscsi_conn *conn, int state)
1581 {
1582         int res;
1583
1584         iscsi_extracheck_is_wr_thread(conn);
1585
1586         res = write_data(conn);
1587         if (res > 0) {
1588                 if (!conn->write_size)
1589                         conn->write_state = state;
1590         } else
1591                 res = exit_tx(conn, res);
1592
1593         return res;
1594 }
1595
1596 /*
1597  * No locks, conn is wr processing.
1598  *
1599  * IMPORTANT! Connection conn must be protected by additional conn_get()
1600  * upon entrance in this function, because otherwise it could be destroyed
1601  * inside as a result of cmnd release.
1602  */
1603 int iscsi_send(struct iscsi_conn *conn)
1604 {
1605         struct iscsi_cmnd *cmnd = conn->write_cmnd;
1606         int ddigest, res = 0;
1607
1608         TRACE_ENTRY();
1609
1610         TRACE_DBG("conn %p, write_cmnd %p", conn, cmnd);
1611
1612         iscsi_extracheck_is_wr_thread(conn);
1613
1614         ddigest = conn->ddigest_type != DIGEST_NONE ? 1 : 0;
1615
1616         switch (conn->write_state) {
1617         case TX_INIT:
1618                 sBUG_ON(cmnd != NULL);
1619                 cmnd = conn->write_cmnd = iscsi_get_send_cmnd(conn);
1620                 if (!cmnd)
1621                         goto out;
1622                 cmnd_tx_start(cmnd);
1623                 if (!(conn->hdigest_type & DIGEST_NONE))
1624                         init_tx_hdigest(cmnd);
1625                 conn->write_state = TX_BHS_DATA;
1626         case TX_BHS_DATA:
1627                 res = iscsi_do_send(conn, cmnd->pdu.datasize ?
1628                                         TX_INIT_PADDING : TX_END);
1629                 if (res <= 0 || conn->write_state != TX_INIT_PADDING)
1630                         break;
1631         case TX_INIT_PADDING:
1632                 cmnd->conn->write_size = ((cmnd->pdu.datasize + 3) & -4) -
1633                                                 cmnd->pdu.datasize;
1634                 if (cmnd->conn->write_size != 0)
1635                         conn->write_state = TX_PADDING;
1636                 else if (ddigest)
1637                         conn->write_state = TX_INIT_DDIGEST;
1638                  else
1639                         conn->write_state = TX_END;
1640                 break;
1641         case TX_PADDING:
1642                 res = tx_padding(cmnd, ddigest ? TX_INIT_DDIGEST : TX_END);
1643                 if (res <= 0 || conn->write_state != TX_INIT_DDIGEST)
1644                         break;
1645         case TX_INIT_DDIGEST:
1646                 cmnd->conn->write_size = sizeof(u32);
1647                 conn->write_state = TX_DDIGEST;
1648         case TX_DDIGEST:
1649                 res = tx_ddigest(cmnd, TX_END);
1650                 break;
1651         default:
1652                 PRINT_CRIT_ERROR("%d %d %x", res, conn->write_state,
1653                         cmnd_opcode(cmnd));
1654                 sBUG();
1655         }
1656
1657         if (res == 0)
1658                 goto out;
1659
1660         if (conn->write_state != TX_END)
1661                 goto out;
1662
1663         if (unlikely(conn->write_size)) {
1664                 PRINT_CRIT_ERROR("%d %x %u", res, cmnd_opcode(cmnd),
1665                         conn->write_size);
1666                 sBUG();
1667         }
1668         cmnd_tx_end(cmnd);
1669
1670         rsp_cmnd_release(cmnd);
1671
1672         conn->write_cmnd = NULL;
1673         conn->write_state = TX_INIT;
1674
1675 out:
1676         TRACE_EXIT_RES(res);
1677         return res;
1678 }
1679
/* No locks, conn is wr processing.
 *
 * IMPORTANT! Connection conn must be protected by additional conn_get()
 * upon entrance in this function, because otherwise it could be destroyed
 * inside as a result of iscsi_send(), which releases sent commands.
 */
static int process_write_queue(struct iscsi_conn *conn)
{
	int res;

	TRACE_ENTRY();

	if (likely(test_write_ready(conn)))
		res = iscsi_send(conn);
	else
		res = 0;

	TRACE_EXIT_RES(res);
	return res;
}
1698
1699 /*
1700  * Called under iscsi_wr_lock and BHs disabled, but will drop it inside,
1701  * then reaquire.
1702  */
1703 static void scst_do_job_wr(void)
1704         __acquires(&iscsi_wr_lock)
1705         __releases(&iscsi_wr_lock)
1706 {
1707         TRACE_ENTRY();
1708
1709         /*
1710          * We delete/add to tail connections to maintain fairness between them.
1711          */
1712
1713         while (!list_empty(&iscsi_wr_list)) {
1714                 int rc;
1715                 struct iscsi_conn *conn = list_entry(iscsi_wr_list.next,
1716                         typeof(*conn), wr_list_entry);
1717
1718                 TRACE_DBG("conn %p, wr_state %x, wr_space_ready %d, "
1719                         "write ready %d", conn, conn->wr_state,
1720                         conn->wr_space_ready, test_write_ready(conn));
1721
1722                 list_del(&conn->wr_list_entry);
1723
1724                 sBUG_ON(conn->wr_state == ISCSI_CONN_WR_STATE_PROCESSING);
1725
1726                 conn->wr_state = ISCSI_CONN_WR_STATE_PROCESSING;
1727                 conn->wr_space_ready = 0;
1728 #ifdef CONFIG_SCST_EXTRACHECKS
1729                 conn->wr_task = current;
1730 #endif
1731                 spin_unlock_bh(&iscsi_wr_lock);
1732
1733                 conn_get(conn);
1734
1735                 rc = process_write_queue(conn);
1736
1737                 spin_lock_bh(&iscsi_wr_lock);
1738 #ifdef CONFIG_SCST_EXTRACHECKS
1739                 conn->wr_task = NULL;
1740 #endif
1741                 if ((rc == -EAGAIN) && !conn->wr_space_ready) {
1742                         conn->wr_state = ISCSI_CONN_WR_STATE_SPACE_WAIT;
1743                         goto cont;
1744                 }
1745
1746                 if (test_write_ready(conn)) {
1747                         list_add_tail(&conn->wr_list_entry, &iscsi_wr_list);
1748                         conn->wr_state = ISCSI_CONN_WR_STATE_IN_LIST;
1749                 } else
1750                         conn->wr_state = ISCSI_CONN_WR_STATE_IDLE;
1751
1752 cont:
1753                 conn_put(conn);
1754         }
1755
1756         TRACE_EXIT();
1757         return;
1758 }
1759
1760 static inline int test_wr_list(void)
1761 {
1762         int res = !list_empty(&iscsi_wr_list) ||
1763                   unlikely(kthread_should_stop());
1764         return res;
1765 }
1766
/*
 * Main loop of an iSCSI write (network TX) kernel thread: sleeps until
 * connections appear on iscsi_wr_list, then drains them via
 * scst_do_job_wr(). Runs until kthread_stop().
 */
int istwr(void *arg)
{
	TRACE_ENTRY();

	PRINT_INFO("Write thread started, PID %d", current->pid);

	/* Keep sending during suspend/hibernate to avoid stalling I/O */
	current->flags |= PF_NOFREEZE;

	spin_lock_bh(&iscsi_wr_lock);
	while (!kthread_should_stop()) {
		wait_queue_t wait;
		init_waitqueue_entry(&wait, current);

		if (!test_wr_list()) {
			add_wait_queue_exclusive_head(&iscsi_wr_waitQ, &wait);
			/*
			 * Manual sleep loop: the state must be set before
			 * re-testing the condition so a wake-up between the
			 * test and schedule() is not lost.
			 */
			for (;;) {
				set_current_state(TASK_INTERRUPTIBLE);
				if (test_wr_list())
					break;
				spin_unlock_bh(&iscsi_wr_lock);
				schedule();
				spin_lock_bh(&iscsi_wr_lock);
			}
			set_current_state(TASK_RUNNING);
			remove_wait_queue(&iscsi_wr_waitQ, &wait);
		}
		scst_do_job_wr();
	}
	spin_unlock_bh(&iscsi_wr_lock);

	/*
	 * If kthread_should_stop() is true, we are guaranteed to be
	 * on the module unload, so iscsi_wr_list must be empty.
	 */
	sBUG_ON(!list_empty(&iscsi_wr_list));

	PRINT_INFO("Write thread PID %d finished", current->pid);

	TRACE_EXIT();
	return 0;
}