Fixed a (false positive) compiler warning.
iscsi-scst/kernel/nthread.c
1 /*
2  *  Network threads.
3  *
4  *  Copyright (C) 2004 - 2005 FUJITA Tomonori <tomof@acm.org>
5  *  Copyright (C) 2007 - 2009 Vladislav Bolkhovitin
6  *  Copyright (C) 2007 - 2009 ID7 Ltd.
7  *
8  *  This program is free software; you can redistribute it and/or
9  *  modify it under the terms of the GNU General Public License
10  *  as published by the Free Software Foundation.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  */
17
18 #include <linux/sched.h>
19 #include <linux/file.h>
20 #include <linux/kthread.h>
21 #include <asm/ioctls.h>
22 #include <linux/delay.h>
23 #include <net/tcp.h>
24
25 #include "iscsi.h"
26 #include "digest.h"
27
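/*
 * Read-side (RX) state machine, driven by process_read_io(): the BHS is
 * received first, then any AHS and header digest, then cmnd_rx_start() is
 * called, followed by the data segment, padding and data digest.  The
 * connection's current position is kept in conn->read_state.
 */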
28 enum rx_state {
29         RX_INIT_BHS, /* Must be zero for better "switch" optimization. */
30         RX_BHS,
31         RX_CMD_START,
32         RX_DATA,
33         RX_END,
34
35         RX_CMD_CONTINUE,
36         RX_INIT_HDIGEST,
37         RX_CHECK_HDIGEST,
38         RX_INIT_DDIGEST,
39         RX_CHECK_DDIGEST,
40         RX_AHS,
41         RX_PADDING,
42 };
43
44 enum tx_state {
45         TX_INIT = 0, /* Must be zero for better "switch" optimization. */
46         TX_BHS_DATA,
47         TX_INIT_PADDING,
48         TX_PADDING,
49         TX_INIT_DDIGEST,
50         TX_DDIGEST,
51         TX_END,
52 };
53
54 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
55 static void iscsi_check_closewait(struct iscsi_conn *conn)
56 {
57         struct iscsi_cmnd *cmnd;
58
59         TRACE_ENTRY();
60
61         TRACE_CONN_CLOSE_DBG("conn %p, sk_state %d", conn,
62                 conn->sock->sk->sk_state);
63
64         if (conn->sock->sk->sk_state != TCP_CLOSE) {
65                 TRACE_CONN_CLOSE_DBG("conn %p, skipping", conn);
66                 goto out;
67         }
68
69         /*
70          * No data are going to be sent, so all queued buffers can be freed
71          * now. In many cases TCP does that only in close(), but we can't rely
72          * on user space calling it.
73          */
74
75 again:
76         spin_lock_bh(&conn->cmd_list_lock);
77         list_for_each_entry(cmnd, &conn->cmd_list, cmd_list_entry) {
78                 struct iscsi_cmnd *rsp;
79                 int restart = 0;
80
81                 TRACE_CONN_CLOSE_DBG("cmd %p, scst_state %x, data_waiting %d, "
82                         "ref_cnt %d, parent_req %p, net_ref_cnt %d, sg %p",
83                         cmnd, cmnd->scst_state, cmnd->data_waiting,
84                         atomic_read(&cmnd->ref_cnt), cmnd->parent_req,
85                         atomic_read(&cmnd->net_ref_cnt), cmnd->sg);
86
87                 sBUG_ON(cmnd->parent_req != NULL);
88
89                 if (cmnd->sg != NULL) {
90                         int i;
91
92                         if (cmnd_get_check(cmnd))
93                                 continue;
94
95                         for (i = 0; i < cmnd->sg_cnt; i++) {
96                                 struct page *page = sg_page(&cmnd->sg[i]);
97                                 TRACE_CONN_CLOSE_DBG("page %p, net_priv %p, "
98                                         "_count %d", page, page->net_priv,
99                                         atomic_read(&page->_count));
100
101                                 if (page->net_priv != NULL) {
102                                         if (restart == 0) {
103                                                 spin_unlock_bh(&conn->cmd_list_lock);
104                                                 restart = 1;
105                                         }
106                                         while (page->net_priv != NULL)
107                                                 iscsi_put_page_callback(page);
108                                 }
109                         }
110                         cmnd_put(cmnd);
111
112                         if (restart)
113                                 goto again;
114                 }
115
116                 spin_lock_bh(&cmnd->rsp_cmd_lock);
117                 list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
118                                 rsp_cmd_list_entry) {
119                         TRACE_CONN_CLOSE_DBG("  rsp %p, ref_cnt %d, "
120                                 "net_ref_cnt %d, sg %p",
121                                 rsp, atomic_read(&rsp->ref_cnt),
122                                 atomic_read(&rsp->net_ref_cnt), rsp->sg);
123
124                         if ((rsp->sg != cmnd->sg) && (rsp->sg != NULL)) {
125                                 int i;
126
127                                 if (cmnd_get_check(rsp))
128                                         continue;
129
130                                 for (i = 0; i < rsp->sg_cnt; i++) {
131                                         struct page *page =
132                                                 sg_page(&rsp->sg[i]);
133                                         TRACE_CONN_CLOSE_DBG(
134                                                 "    page %p, net_priv %p, "
135                                                 "_count %d",
136                                                 page, page->net_priv,
137                                                 atomic_read(&page->_count));
138
139                                         if (page->net_priv != NULL) {
140                                                 if (restart == 0) {
141                                                         spin_unlock_bh(&cmnd->rsp_cmd_lock);
142                                                         spin_unlock_bh(&conn->cmd_list_lock);
143                                                         restart = 1;
144                                                 }
145                                                 while (page->net_priv != NULL)
146                                                         iscsi_put_page_callback(page);
147                                         }
148                                 }
149                                 cmnd_put(rsp);
150
151                                 if (restart)
152                                         goto again;
153                         }
154                 }
155                 spin_unlock_bh(&cmnd->rsp_cmd_lock);
156         }
157         spin_unlock_bh(&conn->cmd_list_lock);
158
159 out:
160         TRACE_EXIT();
161         return;
162 }
163 #else
164 static inline void iscsi_check_closewait(struct iscsi_conn *conn) {}
165 #endif
166
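/*
 * Free pending commands of this connection that match the session's
 * expected CmdSN, advancing exp_cmd_sn for each freed command.  sn_lock
 * is dropped around req_cmnd_release_force(), so the list scan restarts
 * from the beginning after every freed command.
 */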
167 static void free_pending_commands(struct iscsi_conn *conn)
168 {
169         struct iscsi_session *session = conn->session;
170         struct list_head *pending_list = &session->pending_list;
171         int req_freed;
172         struct iscsi_cmnd *cmnd;
173
174         spin_lock(&session->sn_lock);
175         do {
176                 req_freed = 0;
177                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
178                         TRACE_CONN_CLOSE_DBG("Pending cmd %p"
179                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
180                                 cmnd, conn, cmnd->pdu.bhs.sn,
181                                 session->exp_cmd_sn);
182                         if ((cmnd->conn == conn) &&
183                             (session->exp_cmd_sn == cmnd->pdu.bhs.sn)) {
184                                 TRACE_CONN_CLOSE_DBG("Freeing pending cmd %p",
185                                         cmnd);
186
187                                 list_del(&cmnd->pending_list_entry);
188                                 cmnd->pending = 0;
189
190                                 session->exp_cmd_sn++;
191
192                                 spin_unlock(&session->sn_lock);
193
194                                 req_cmnd_release_force(cmnd, 0);
195
196                                 req_freed = 1;
197                                 spin_lock(&session->sn_lock);
198                                 break;
199                         }
200                 }
201         } while (req_freed);
202         spin_unlock(&session->sn_lock);
203
204         return;
205 }
206
207 static void free_orphaned_pending_commands(struct iscsi_conn *conn)
208 {
209         struct iscsi_session *session = conn->session;
210         struct list_head *pending_list = &session->pending_list;
211         int req_freed;
212         struct iscsi_cmnd *cmnd;
213
214         spin_lock(&session->sn_lock);
215         do {
216                 req_freed = 0;
217                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
218                         TRACE_CONN_CLOSE_DBG("Pending cmd %p"
219                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
220                                 cmnd, conn, cmnd->pdu.bhs.sn,
221                                 session->exp_cmd_sn);
222                         if (cmnd->conn == conn) {
223                                 PRINT_ERROR("Freeing orphaned pending cmd %p",
224                                             cmnd);
225
226                                 list_del(&cmnd->pending_list_entry);
227                                 cmnd->pending = 0;
228
229                                 if (session->exp_cmd_sn == cmnd->pdu.bhs.sn)
230                                         session->exp_cmd_sn++;
231
232                                 spin_unlock(&session->sn_lock);
233
234                                 req_cmnd_release_force(cmnd, 0);
235
236                                 req_freed = 1;
237                                 spin_lock(&session->sn_lock);
238                                 break;
239                         }
240                 }
241         } while (req_freed);
242         spin_unlock(&session->sn_lock);
243
244         return;
245 }
246
247 #ifdef CONFIG_SCST_DEBUG
248 static void trace_conn_close(struct iscsi_conn *conn)
249 {
250         struct iscsi_cmnd *cmnd;
251 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
252         struct iscsi_cmnd *rsp;
253 #endif
254
255 #if 0
256         if (time_after(jiffies, start_waiting + 10*HZ))
257                 trace_flag |= TRACE_CONN_OC_DBG;
258 #endif
259
260         spin_lock_bh(&conn->cmd_list_lock);
261         list_for_each_entry(cmnd, &conn->cmd_list,
262                         cmd_list_entry) {
263                 TRACE_CONN_CLOSE_DBG(
264                         "cmd %p, scst_state %x, scst_cmd state %d, "
265                         "data_waiting %d, ref_cnt %d, sn %u, "
266                         "parent_req %p, pending %d",
267                         cmnd, cmnd->scst_state,
268                         (cmnd->parent_req && cmnd->scst_cmd) ?
269                                 cmnd->scst_cmd->state : -1,
270                         cmnd->data_waiting, atomic_read(&cmnd->ref_cnt),
271                         cmnd->pdu.bhs.sn, cmnd->parent_req, cmnd->pending);
272 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
273                 TRACE_CONN_CLOSE_DBG("net_ref_cnt %d, sg %p",
274                         atomic_read(&cmnd->net_ref_cnt),
275                         cmnd->sg);
276                 if (cmnd->sg != NULL) {
277                         int i;
278                         for (i = 0; i < cmnd->sg_cnt; i++) {
279                                 struct page *page = sg_page(&cmnd->sg[i]);
280                                 TRACE_CONN_CLOSE_DBG("page %p, "
281                                         "net_priv %p, _count %d",
282                                         page, page->net_priv,
283                                         atomic_read(&page->_count));
284                         }
285                 }
286
287                 sBUG_ON(cmnd->parent_req != NULL);
288
289                 spin_lock_bh(&cmnd->rsp_cmd_lock);
290                 list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
291                                 rsp_cmd_list_entry) {
292                         TRACE_CONN_CLOSE_DBG("  rsp %p, "
293                             "ref_cnt %d, net_ref_cnt %d, sg %p",
294                             rsp, atomic_read(&rsp->ref_cnt),
295                             atomic_read(&rsp->net_ref_cnt), rsp->sg);
296                         if (rsp->sg != cmnd->sg && rsp->sg) {
297                                 int i;
298                                 for (i = 0; i < rsp->sg_cnt; i++) {
299                                         TRACE_CONN_CLOSE_DBG("    page %p, "
300                                           "net_priv %p, _count %d",
301                                           sg_page(&rsp->sg[i]),
302                                           sg_page(&rsp->sg[i])->net_priv,
303                                           atomic_read(&sg_page(&rsp->sg[i])->
304                                                 _count));
305                                 }
306                         }
307                 }
308                 spin_unlock_bh(&cmnd->rsp_cmd_lock);
309 #endif /* CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION */
310         }
311         spin_unlock_bh(&conn->cmd_list_lock);
312         return;
313 }
314 #else /* CONFIG_SCST_DEBUG */
315 static void trace_conn_close(struct iscsi_conn *conn) {}
316 #endif /* CONFIG_SCST_DEBUG */
317
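/*
 * Called by SCST when all commands affected by a task management function
 * have been completed.  For the session-wide functions issued from
 * close_conn() it finishes connection/session reinstatement and signals
 * ready_to_free, allowing close_conn() to proceed.
 */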
318 void iscsi_task_mgmt_affected_cmds_done(struct scst_mgmt_cmd *scst_mcmd)
319 {
320         int fn = scst_mgmt_cmd_get_fn(scst_mcmd);
321         void *priv = scst_mgmt_cmd_get_tgt_priv(scst_mcmd);
322
323         TRACE_MGMT_DBG("scst_mcmd %p, fn %d, priv %p", scst_mcmd, fn, priv);
324
325         switch (fn) {
326         case SCST_NEXUS_LOSS_SESS:
327         case SCST_ABORT_ALL_TASKS_SESS:
328         {
329                 struct iscsi_conn *conn = (struct iscsi_conn *)priv;
330                 struct iscsi_session *sess = conn->session;
331                 struct iscsi_conn *c;
332
333                 mutex_lock(&sess->target->target_mutex);
334
335                 /*
336                  * We can't mark sess as shutting down earlier, because until
337                  * now it might have pending commands. Otherwise, in case of
338                  * reinstatement it might lead to data corruption, because
339          * commands in the session being reinstated can be executed
340                  * after commands in the new session.
341                  */
342                 sess->sess_shutting_down = 1;
343                 list_for_each_entry(c, &sess->conn_list, conn_list_entry) {
344                         if (!test_bit(ISCSI_CONN_SHUTTINGDOWN, &c->conn_aflags)) {
345                                 sess->sess_shutting_down = 0;
346                                 break;
347                         }
348                 }
349
350                 if (conn->conn_reinst_successor != NULL) {
351                         sBUG_ON(!test_bit(ISCSI_CONN_REINSTATING,
352                                   &conn->conn_reinst_successor->conn_aflags));
353                         conn_reinst_finished(conn->conn_reinst_successor);
354                         conn->conn_reinst_successor = NULL;
355                 } else if (sess->sess_reinst_successor != NULL) {
356                         sess_reinst_finished(sess->sess_reinst_successor);
357                         sess->sess_reinst_successor = NULL;
358                 }
359                 mutex_unlock(&sess->target->target_mutex);
360
361                 complete_all(&conn->ready_to_free);
362                 break;
363         }
364         default:
365                 /* Nothing to do */
366                 break;
367         }
368
369         return;
370 }
371
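/*
 * Final teardown of a connection, run either in a dedicated cleanup kthread
 * or directly in the RX thread (see start_close_conn()).  It shuts down the
 * socket, aborts outstanding commands through SCST task management, waits
 * for conn_ref_cnt to drop to zero (escalating to SEND_SHUTDOWN and a forced
 * disconnect on timeouts), restores the socket callbacks, waits for the
 * write thread to go idle, notifies user space and frees the connection.
 */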
372 /* No locks */
373 static void close_conn(struct iscsi_conn *conn)
374 {
375         struct iscsi_session *session = conn->session;
376         struct iscsi_target *target = conn->target;
377         typeof(jiffies) start_waiting = jiffies;
378         typeof(jiffies) shut_start_waiting = start_waiting;
379         bool pending_reported = 0, wait_expired = 0, shut_expired = 0;
380         bool reinst;
381
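/*
 * Timeouts (in jiffies): how long to wait before dropping orphaned pending
 * commands, before forcing SEND_SHUTDOWN on an active close, and before
 * forcibly disconnecting the socket after shutdown (shorter when the
 * connection is being deleted).
 */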
382 #define CONN_PENDING_TIMEOUT    ((typeof(jiffies))10*HZ)
383 #define CONN_WAIT_TIMEOUT       ((typeof(jiffies))10*HZ)
384 #define CONN_REG_SHUT_TIMEOUT   ((typeof(jiffies))125*HZ)
385 #define CONN_DEL_SHUT_TIMEOUT   ((typeof(jiffies))10*HZ)
386
387         TRACE_ENTRY();
388
389         TRACE_CONN_CLOSE("Closing connection %p (conn_ref_cnt=%d)", conn,
390                 atomic_read(&conn->conn_ref_cnt));
391
392         iscsi_extracheck_is_rd_thread(conn);
393
394         sBUG_ON(!conn->closing);
395
396         if (conn->active_close) {
397                 /* We want all our already sent operations to complete */
398                 conn->sock->ops->shutdown(conn->sock, RCV_SHUTDOWN);
399         } else {
400                 conn->sock->ops->shutdown(conn->sock,
401                         RCV_SHUTDOWN|SEND_SHUTDOWN);
402         }
403
404         mutex_lock(&session->target->target_mutex);
405
406         set_bit(ISCSI_CONN_SHUTTINGDOWN, &conn->conn_aflags);
407         reinst = (conn->conn_reinst_successor != NULL);
408
409         mutex_unlock(&session->target->target_mutex);
410
411         if (reinst) {
412                 int rc;
413                 int lun = 0;
414
415                 /* Abort all outstanding commands */
416                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
417                         SCST_ABORT_ALL_TASKS_SESS, (uint8_t *)&lun, sizeof(lun),
418                         SCST_NON_ATOMIC, conn);
419                 if (rc != 0)
420                         PRINT_ERROR("SCST_ABORT_ALL_TASKS_SESS failed %d", rc);
421         } else {
422                 int rc;
423                 int lun = 0;
424
425                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
426                         SCST_NEXUS_LOSS_SESS, (uint8_t *)&lun, sizeof(lun),
427                         SCST_NON_ATOMIC, conn);
428                 if (rc != 0)
429                         PRINT_ERROR("SCST_NEXUS_LOSS_SESS failed %d", rc);
430         }
431
432         if (conn->read_state != RX_INIT_BHS) {
433                 struct iscsi_cmnd *cmnd = conn->read_cmnd;
434
435                 if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
436                         TRACE_DBG("Going to wait for cmnd %p to change state "
437                                 "from RX_CMD", cmnd);
438                 }
439                 wait_event(conn->read_state_waitQ,
440                         cmnd->scst_state != ISCSI_CMD_STATE_RX_CMD);
441
442                 conn->read_cmnd = NULL;
443                 conn->read_state = RX_INIT_BHS;
444                 req_cmnd_release_force(cmnd, 0);
445         }
446
447         conn_abort(conn);
448
449         /* ToDo: not the best way to wait */
450         while (atomic_read(&conn->conn_ref_cnt) != 0) {
451                 mutex_lock(&target->target_mutex);
452                 spin_lock(&session->sn_lock);
453                 if (session->tm_rsp && session->tm_rsp->conn == conn) {
454                         struct iscsi_cmnd *tm_rsp = session->tm_rsp;
455                         TRACE(TRACE_MGMT_MINOR, "Dropping delayed TM rsp %p",
456                                 tm_rsp);
457                         session->tm_rsp = NULL;
458                         session->tm_active--;
459                         WARN_ON(session->tm_active < 0);
460                         spin_unlock(&session->sn_lock);
461                         mutex_unlock(&target->target_mutex);
462
463                         rsp_cmnd_release(tm_rsp);
464                 } else {
465                         spin_unlock(&session->sn_lock);
466                         mutex_unlock(&target->target_mutex);
467                 }
468
469                 /* It's safe to check it without sn_lock */
470                 if (!list_empty(&session->pending_list)) {
471                         TRACE_CONN_CLOSE_DBG("Disposing pending commands on "
472                                 "connection %p (conn_ref_cnt=%d)", conn,
473                                 atomic_read(&conn->conn_ref_cnt));
474
475                         free_pending_commands(conn);
476
477                         if (time_after(jiffies,
478                                 start_waiting + CONN_PENDING_TIMEOUT)) {
479                                 if (!pending_reported) {
480                                         TRACE_CONN_CLOSE("%s",
481                                                 "Pending wait time expired");
482                                         pending_reported = 1;
483                                 }
484                                 free_orphaned_pending_commands(conn);
485                         }
486                 }
487
488                 iscsi_make_conn_wr_active(conn);
489
490                 /* That's for active close only, actually */
491                 if (time_after(jiffies, start_waiting + CONN_WAIT_TIMEOUT) &&
492                     !wait_expired) {
493                         TRACE_CONN_CLOSE("Wait time expired (conn %p, "
494                                 "sk_state %d)",
495                                 conn, conn->sock->sk->sk_state);
496                         conn->sock->ops->shutdown(conn->sock, SEND_SHUTDOWN);
497                         wait_expired = 1;
498                         shut_start_waiting = jiffies;
499                 }
500
501                 if (wait_expired && !shut_expired &&
502                     time_after(jiffies, shut_start_waiting +
503                                 (conn->deleting ? CONN_DEL_SHUT_TIMEOUT :
504                                                   CONN_REG_SHUT_TIMEOUT))) {
505                         TRACE_CONN_CLOSE("Wait time after shutdown expired "
506                                 "(conn %p, sk_state %d)", conn,
507                                 conn->sock->sk->sk_state);
508                         conn->sock->sk->sk_prot->disconnect(conn->sock->sk, 0);
509                         shut_expired = 1;
510                 }
511
512                 if (conn->deleting)
513                         msleep(200);
514                 else
515                         msleep(1000);
516
517                 TRACE_CONN_CLOSE_DBG("conn %p, conn_ref_cnt %d left, "
518                         "wr_state %d, exp_cmd_sn %u",
519                         conn, atomic_read(&conn->conn_ref_cnt),
520                         conn->wr_state, session->exp_cmd_sn);
521
522                 trace_conn_close(conn);
523
524                 iscsi_check_closewait(conn);
525         }
526
527         write_lock_bh(&conn->sock->sk->sk_callback_lock);
528         conn->sock->sk->sk_state_change = conn->old_state_change;
529         conn->sock->sk->sk_data_ready = conn->old_data_ready;
530         conn->sock->sk->sk_write_space = conn->old_write_space;
531         write_unlock_bh(&conn->sock->sk->sk_callback_lock);
532
533         while (1) {
534                 bool t;
535
536                 spin_lock_bh(&iscsi_wr_lock);
537                 t = (conn->wr_state == ISCSI_CONN_WR_STATE_IDLE);
538                 spin_unlock_bh(&iscsi_wr_lock);
539
540                 if (t && (atomic_read(&conn->conn_ref_cnt) == 0))
541                         break;
542
543                 TRACE_CONN_CLOSE_DBG("Waiting for wr thread (conn %p), "
544                         "wr_state %x", conn, conn->wr_state);
545                 msleep(50);
546         }
547
548         wait_for_completion(&conn->ready_to_free);
549
550         TRACE_CONN_CLOSE("Notifying user space about closing connection %p",
551                          conn);
552         event_send(target->tid, session->sid, conn->cid, E_CONN_CLOSE);
553
554 #ifdef CONFIG_SCST_PROC
555         mutex_lock(&target->target_mutex);
556         conn_free(conn);
557         mutex_unlock(&target->target_mutex);
558 #else
559         kobject_put(&conn->iscsi_conn_kobj);
560 #endif
561
562         TRACE_EXIT();
563         return;
564 }
565
566 static int close_conn_thr(void *arg)
567 {
568         struct iscsi_conn *conn = (struct iscsi_conn *)arg;
569
570         TRACE_ENTRY();
571
572 #ifdef CONFIG_SCST_EXTRACHECKS
573         /*
574          * during connection close. It is safe, because at this point conn
575          * on the connection close. It is safe, because at this point conn
576          * can't be used by any other thread.
577          */
578         conn->rd_task = current;
579 #endif
580         close_conn(conn);
581
582         TRACE_EXIT();
583         return 0;
584 }
585
586 /* No locks */
587 static void start_close_conn(struct iscsi_conn *conn)
588 {
589         struct task_struct *t;
590
591         TRACE_ENTRY();
592
593         t = kthread_run(close_conn_thr, conn, "iscsi_conn_cleanup");
594         if (IS_ERR(t)) {
595                 PRINT_ERROR("kthread_run() failed (%ld), closing conn %p "
596                         "directly", PTR_ERR(t), conn);
597                 close_conn(conn);
598         }
599
600         TRACE_EXIT();
601         return;
602 }
603
604 static inline void iscsi_conn_init_read(struct iscsi_conn *conn,
605         void __user *data, size_t len)
606 {
607         conn->read_iov[0].iov_base = data;
608         conn->read_iov[0].iov_len = len;
609         conn->read_msg.msg_iov = conn->read_iov;
610         conn->read_msg.msg_iovlen = 1;
611         conn->read_size = len;
612         return;
613 }
614
615 static void iscsi_conn_prepare_read_ahs(struct iscsi_conn *conn,
616         struct iscsi_cmnd *cmnd)
617 {
618         int asize = (cmnd->pdu.ahssize + 3) & -4;
619
620         /* ToDo: __GFP_NOFAIL ?? */
621         cmnd->pdu.ahs = kmalloc(asize, __GFP_NOFAIL|GFP_KERNEL);
622         sBUG_ON(cmnd->pdu.ahs == NULL);
623         iscsi_conn_init_read(conn, (void __force __user *)cmnd->pdu.ahs, asize);
624         return;
625 }
626
627 static struct iscsi_cmnd *iscsi_get_send_cmnd(struct iscsi_conn *conn)
628 {
629         struct iscsi_cmnd *cmnd = NULL;
630
631         spin_lock_bh(&conn->write_list_lock);
632         if (!list_empty(&conn->write_list)) {
633                 cmnd = list_entry(conn->write_list.next, struct iscsi_cmnd,
634                                 write_list_entry);
635                 cmd_del_from_write_list(cmnd);
636                 cmnd->write_processing_started = 1;
637         }
638         spin_unlock_bh(&conn->write_list_lock);
639
640         return cmnd;
641 }
642
643 /* Returns number of bytes left to receive or <0 for error */
644 static int do_recv(struct iscsi_conn *conn)
645 {
646         int res;
647         mm_segment_t oldfs;
648         struct msghdr msg;
649         int first_len;
650
651         EXTRACHECKS_BUG_ON(conn->read_cmnd == NULL);
652
653         if (unlikely(conn->closing)) {
654                 res = -EIO;
655                 goto out;
656         }
657
658         /*
659          * We suppose that if sock_recvmsg() returned less data than requested,
660          * then next time it will return -EAGAIN, so there's no point in
661          * calling it again.
662          */
663
664 restart:
665         memset(&msg, 0, sizeof(msg));
666         msg.msg_iov = conn->read_msg.msg_iov;
667         msg.msg_iovlen = conn->read_msg.msg_iovlen;
668         first_len = msg.msg_iov->iov_len;
669
670         oldfs = get_fs();
671         set_fs(get_ds());
672         res = sock_recvmsg(conn->sock, &msg, conn->read_size,
673                            MSG_DONTWAIT | MSG_NOSIGNAL);
674         set_fs(oldfs);
675
676         if (res > 0) {
677                 /*
678                  * To save some considerable effort and CPU power we
679                  * suppose that TCP functions adjust
680                  * conn->read_msg.msg_iov and conn->read_msg.msg_iovlen
681                  * according to the amount of copied data. This BUG_ON is
682                  * intended to catch it if that behavior changes in the future.
683                  */
684                 sBUG_ON((res >= first_len) &&
685                         (conn->read_msg.msg_iov->iov_len != 0));
686                 conn->read_size -= res;
687                 if (conn->read_size != 0) {
688                         if (res >= first_len) {
689                                 int done = 1 + ((res - first_len) >> PAGE_SHIFT);
690                                 conn->read_msg.msg_iov += done;
691                                 conn->read_msg.msg_iovlen -= done;
692                         }
693                 }
694                 res = conn->read_size;
695         } else {
696                 switch (res) {
697                 case -EAGAIN:
698                         TRACE_DBG("EAGAIN received for conn %p", conn);
699                         res = conn->read_size;
700                         break;
701                 case -ERESTARTSYS:
702                         TRACE_DBG("ERESTARTSYS received for conn %p", conn);
703                         goto restart;
704                 default:
705                         PRINT_ERROR("sock_recvmsg() failed: %d", res);
706                         mark_conn_closed(conn);
707                         if (res == 0)
708                                 res = -EIO;
709                         break;
710                 }
711         }
712
713 out:
714         TRACE_EXIT_RES(res);
715         return res;
716 }
717
718 static int iscsi_rx_check_ddigest(struct iscsi_conn *conn)
719 {
720         struct iscsi_cmnd *cmnd = conn->read_cmnd;
721         int res;
722
723         res = do_recv(conn);
724         if (res == 0) {
725                 conn->read_state = RX_END;
726
727                 if (cmnd->pdu.datasize <= 16*1024) {
728                         /*
729                          * It's cache hot, so let's compute it inline. The
730                          * trade-off is which will expose more latency:
731                          * possible cache misses or the digest calculation.
732                          */
733                         TRACE_DBG("cmnd %p, opcode %x: checking RX "
734                                 "ddigest inline", cmnd, cmnd_opcode(cmnd));
735                         cmnd->ddigest_checked = 1;
736                         res = digest_rx_data(cmnd);
737                         if (unlikely(res != 0)) {
738                                 mark_conn_closed(conn);
739                                 goto out;
740                         }
741                 } else if (cmnd_opcode(cmnd) == ISCSI_OP_SCSI_CMD) {
742                         cmd_add_on_rx_ddigest_list(cmnd, cmnd);
743                         cmnd_get(cmnd);
744                 } else if (cmnd_opcode(cmnd) != ISCSI_OP_SCSI_DATA_OUT) {
745                         /*
746                          * We can get here only for NOP-Out. The iSCSI RFC
747                          * doesn't specify how to deal with digest errors in
748                          * this case. Is closing the connection correct?
749                          */
750                         TRACE_DBG("cmnd %p, opcode %x: checking NOP RX "
751                                 "ddigest", cmnd, cmnd_opcode(cmnd));
752                         res = digest_rx_data(cmnd);
753                         if (unlikely(res != 0)) {
754                                 mark_conn_closed(conn);
755                                 goto out;
756                         }
757                 }
758         }
759
760 out:
761         return res;
762 }
763
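/*
 * Drives the RX state machine for one connection.  Each state handler
 * returns 0 to advance to the next state, a positive value when more data
 * has to arrive on the socket (or the command is still being processed
 * elsewhere), or a negative value on error; the loop runs until a non-zero
 * result.  If the connection got marked closed, the cleanup is handed off
 * to start_close_conn().
 */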
764 /* No locks, conn is rd processing */
765 static void process_read_io(struct iscsi_conn *conn, int *closed)
766 {
767         struct iscsi_cmnd *cmnd = conn->read_cmnd;
768         int res = 0;
769
770         TRACE_ENTRY();
771
772         /* In case of error cmnd will be freed in close_conn() */
773
774         do {
775                 switch (conn->read_state) {
776                 case RX_INIT_BHS:
777                         EXTRACHECKS_BUG_ON(conn->read_cmnd != NULL);
778                         cmnd = cmnd_alloc(conn, NULL);
779                         conn->read_cmnd = cmnd;
780                         iscsi_conn_init_read(cmnd->conn,
781                                 (void __force __user *)&cmnd->pdu.bhs,
782                                 sizeof(cmnd->pdu.bhs));
783                         conn->read_state = RX_BHS;
784                         /* fall through */
785
786                 case RX_BHS:
787                         res = do_recv(conn);
788                         if (res == 0) {
789                                 iscsi_cmnd_get_length(&cmnd->pdu);
790                                 if (cmnd->pdu.ahssize == 0) {
791                                         if ((conn->hdigest_type & DIGEST_NONE) == 0)
792                                                 conn->read_state = RX_INIT_HDIGEST;
793                                         else
794                                                 conn->read_state = RX_CMD_START;
795                                 } else {
796                                         iscsi_conn_prepare_read_ahs(conn, cmnd);
797                                         conn->read_state = RX_AHS;
798                                 }
799                         }
800                         break;
801
802                 case RX_CMD_START:
803                         res = cmnd_rx_start(cmnd);
804                         if (res == 0) {
805                                 if (cmnd->pdu.datasize == 0)
806                                         conn->read_state = RX_END;
807                                 else
808                                         conn->read_state = RX_DATA;
809                         } else if (res > 0)
810                                 conn->read_state = RX_CMD_CONTINUE;
811                         else
812                                 sBUG_ON(!conn->closing);
813                         break;
814
815                 case RX_CMD_CONTINUE:
816                         if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
817                                 TRACE_DBG("cmnd %p is still in RX_CMD state",
818                                         cmnd);
819                                 res = 1;
820                                 break;
821                         }
822                         res = cmnd_rx_continue(cmnd);
823                         if (unlikely(res != 0))
824                                 sBUG_ON(!conn->closing);
825                         else {
826                                 if (cmnd->pdu.datasize == 0)
827                                         conn->read_state = RX_END;
828                                 else
829                                         conn->read_state = RX_DATA;
830                         }
831                         break;
832
833                 case RX_DATA:
834                         res = do_recv(conn);
835                         if (res == 0) {
836                                 int psz = ((cmnd->pdu.datasize + 3) & -4) - cmnd->pdu.datasize;
837                                 if (psz != 0) {
838                                         TRACE_DBG("padding %d bytes", psz);
839                                         iscsi_conn_init_read(conn,
840                                                 (void __force __user *)&conn->rpadding, psz);
841                                         conn->read_state = RX_PADDING;
842                                 } else if ((conn->ddigest_type & DIGEST_NONE) != 0)
843                                         conn->read_state = RX_END;
844                                 else
845                                         conn->read_state = RX_INIT_DDIGEST;
846                         }
847                         break;
848
849                 case RX_END:
850                         if (unlikely(conn->read_size != 0)) {
851                                 PRINT_CRIT_ERROR("%d %x %d", res,
852                                         cmnd_opcode(cmnd), conn->read_size);
853                                 sBUG();
854                         }
855                         conn->read_cmnd = NULL;
856                         conn->read_state = RX_INIT_BHS;
857
858                         cmnd_rx_end(cmnd);
859
860                         EXTRACHECKS_BUG_ON(conn->read_size != 0);
861                         break;
862
863                 case RX_INIT_HDIGEST:
864                         iscsi_conn_init_read(conn,
865                                 (void __force __user *)&cmnd->hdigest, sizeof(u32));
866                         conn->read_state = RX_CHECK_HDIGEST;
867                         /* fall through */
868
869                 case RX_CHECK_HDIGEST:
870                         res = do_recv(conn);
871                         if (res == 0) {
872                                 res = digest_rx_header(cmnd);
873                                 if (unlikely(res != 0)) {
874                                         PRINT_ERROR("rx header digest for "
875                                                 "initiator %s failed (%d)",
876                                                 conn->session->initiator_name,
877                                                 res);
878                                         mark_conn_closed(conn);
879                                 } else
880                                         conn->read_state = RX_CMD_START;
881                         }
882                         break;
883
884                 case RX_INIT_DDIGEST:
885                         iscsi_conn_init_read(conn,
886                                 (void __force __user *)&cmnd->ddigest,
887                                 sizeof(u32));
888                         conn->read_state = RX_CHECK_DDIGEST;
889                         /* fall through */
890
891                 case RX_CHECK_DDIGEST:
892                         res = iscsi_rx_check_ddigest(conn);
893                         break;
894
895                 case RX_AHS:
896                         res = do_recv(conn);
897                         if (res == 0) {
898                                 if ((conn->hdigest_type & DIGEST_NONE) == 0)
899                                         conn->read_state = RX_INIT_HDIGEST;
900                                 else
901                                         conn->read_state = RX_CMD_START;
902                         }
903                         break;
904
905                 case RX_PADDING:
906                         res = do_recv(conn);
907                         if (res == 0) {
908                                 if ((conn->ddigest_type & DIGEST_NONE) == 0)
909                                         conn->read_state = RX_INIT_DDIGEST;
910                                 else
911                                         conn->read_state = RX_END;
912                         }
913                         break;
914
915                 default:
916                         PRINT_CRIT_ERROR("%d %x", conn->read_state, cmnd_opcode(cmnd));
917                         sBUG();
918                 }
919         } while (res == 0);
920
921         if (unlikely(conn->closing)) {
922                 start_close_conn(conn);
923                 *closed = 1;
924         }
925
926         TRACE_EXIT();
927         return;
928 }
929
930 /*
931  * Called under iscsi_rd_lock and BHs disabled, but will drop it inside,
932  * then reacquire it.
933  */
934 static void scst_do_job_rd(void)
935         __acquires(&iscsi_rd_lock)
936         __releases(&iscsi_rd_lock)
937 {
938         TRACE_ENTRY();
939
940         /*
941  * We remove connections and re-add them to the list tail to maintain fairness between them.
942          */
943
944         while (!list_empty(&iscsi_rd_list)) {
945                 int closed = 0;
946                 struct iscsi_conn *conn = list_entry(iscsi_rd_list.next,
947                         typeof(*conn), rd_list_entry);
948
949                 list_del(&conn->rd_list_entry);
950
951                 sBUG_ON(conn->rd_state == ISCSI_CONN_RD_STATE_PROCESSING);
952                 conn->rd_data_ready = 0;
953                 conn->rd_state = ISCSI_CONN_RD_STATE_PROCESSING;
954 #ifdef CONFIG_SCST_EXTRACHECKS
955                 conn->rd_task = current;
956 #endif
957                 spin_unlock_bh(&iscsi_rd_lock);
958
959                 process_read_io(conn, &closed);
960
961                 spin_lock_bh(&iscsi_rd_lock);
962
963                 if (closed)
964                         continue;
965
966 #ifdef CONFIG_SCST_EXTRACHECKS
967                 conn->rd_task = NULL;
968 #endif
969                 if (conn->rd_data_ready) {
970                         list_add_tail(&conn->rd_list_entry, &iscsi_rd_list);
971                         conn->rd_state = ISCSI_CONN_RD_STATE_IN_LIST;
972                 } else
973                         conn->rd_state = ISCSI_CONN_RD_STATE_IDLE;
974         }
975
976         TRACE_EXIT();
977         return;
978 }
979
980 static inline int test_rd_list(void)
981 {
982         int res = !list_empty(&iscsi_rd_list) ||
983                   unlikely(kthread_should_stop());
984         return res;
985 }
986
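/*
 * Main loop of an iSCSI read (RX) thread: sleeps on iscsi_rd_waitQ until
 * connections are queued on iscsi_rd_list, then services them through
 * scst_do_job_rd().
 */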
987 int istrd(void *arg)
988 {
989         TRACE_ENTRY();
990
991         PRINT_INFO("Read thread started, PID %d", current->pid);
992
993         current->flags |= PF_NOFREEZE;
994
995         spin_lock_bh(&iscsi_rd_lock);
996         while (!kthread_should_stop()) {
997                 wait_queue_t wait;
998                 init_waitqueue_entry(&wait, current);
999
1000                 if (!test_rd_list()) {
1001                         add_wait_queue_exclusive_head(&iscsi_rd_waitQ, &wait);
1002                         for (;;) {
1003                                 set_current_state(TASK_INTERRUPTIBLE);
1004                                 if (test_rd_list())
1005                                         break;
1006                                 spin_unlock_bh(&iscsi_rd_lock);
1007                                 schedule();
1008                                 spin_lock_bh(&iscsi_rd_lock);
1009                         }
1010                         set_current_state(TASK_RUNNING);
1011                         remove_wait_queue(&iscsi_rd_waitQ, &wait);
1012                 }
1013                 scst_do_job_rd();
1014         }
1015         spin_unlock_bh(&iscsi_rd_lock);
1016
1017         /*
1018          * If kthread_should_stop() is true, we are guaranteed to be
1019          * in the module unload path, so iscsi_rd_list must be empty.
1020          */
1021         sBUG_ON(!list_empty(&iscsi_rd_list));
1022
1023         PRINT_INFO("Read thread PID %d finished", current->pid);
1024
1025         TRACE_EXIT();
1026         return 0;
1027 }
1028
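/*
 * Zero-copy transfer completion tracking: while pages of a command are
 * referenced by the network stack, page->net_priv points back to the owning
 * command and net_ref_cnt keeps that command alive.  The get/put callbacks
 * below are invoked from the network code (this requires the corresponding
 * put_page_callback kernel patch).
 */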
1029 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1030 static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd)
1031 {
1032         int v;
1033
1034         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d",
1035                 cmd, atomic_read(&cmd->net_ref_cnt)+1);
1036
1037         v = atomic_inc_return(&cmd->net_ref_cnt);
1038         if (v == 1) {
1039                 TRACE_NET_PAGE("getting cmd %p", cmd);
1040                 cmnd_get(cmd);
1041         }
1042         return;
1043 }
1044
1045 void iscsi_get_page_callback(struct page *page)
1046 {
1047         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1048
1049         TRACE_NET_PAGE("page %p, _count %d", page,
1050                 atomic_read(&page->_count));
1051
1052         __iscsi_get_page_callback(cmd);
1053         return;
1054 }
1055
1056 static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd)
1057 {
1058         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d", cmd,
1059                 atomic_read(&cmd->net_ref_cnt)-1);
1060
1061         if (atomic_dec_and_test(&cmd->net_ref_cnt)) {
1062                 int i, sg_cnt = cmd->sg_cnt;
1063                 for (i = 0; i < sg_cnt; i++) {
1064                         struct page *page = sg_page(&cmd->sg[i]);
1065                         TRACE_NET_PAGE("Clearing page %p", page);
1066                         if (page->net_priv == cmd)
1067                                 page->net_priv = NULL;
1068                 }
1069                 cmnd_put(cmd);
1070         }
1071         return;
1072 }
1073
1074 void iscsi_put_page_callback(struct page *page)
1075 {
1076         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1077
1078         TRACE_NET_PAGE("page %p, _count %d", page,
1079                 atomic_read(&page->_count));
1080
1081         __iscsi_put_page_callback(cmd);
1082         return;
1083 }
1084
1085 static void check_net_priv(struct iscsi_cmnd *cmd, struct page *page)
1086 {
1087         if ((atomic_read(&cmd->net_ref_cnt) == 1) && (page->net_priv == cmd)) {
1088                 TRACE_DBG("sendpage() did not call get_page(), zeroing net_priv "
1089                         "%p (page %p)", page->net_priv, page);
1090                 page->net_priv = NULL;
1091         }
1092         return;
1093 }
1094 #else
1095 static inline void check_net_priv(struct iscsi_cmnd *cmd, struct page *page) {}
1096 static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd) {}
1097 static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd) {}
1098 #endif
1099
1100 /* This is partially taken from the Ardis code. */
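/*
 * Send as much of conn->write_cmnd as the socket currently accepts: first
 * the header iovecs in conn->write_iop via vfs_writev(), then the data
 * segment from the sg list via sendpage()/sock_no_sendpage(), updating
 * conn->write_offset and conn->write_size.  Also arms the response timer
 * for the command.  Returns the number of bytes sent, or a negative error
 * (including -EAGAIN when nothing could be sent).
 */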
1101 static int write_data(struct iscsi_conn *conn)
1102 {
1103         mm_segment_t oldfs;
1104         struct file *file;
1105         struct iovec *iop;
1106         struct socket *sock;
1107         ssize_t (*sock_sendpage)(struct socket *, struct page *, int, size_t,
1108                                  int);
1109         ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
1110         struct iscsi_cmnd *write_cmnd = conn->write_cmnd;
1111         struct iscsi_cmnd *ref_cmd;
1112         struct page *page;
1113         struct scatterlist *sg;
1114         int saved_size, size, sendsize;
1115         int length, offset, idx;
1116         int flags, res, count, sg_size;
1117         bool do_put = false, ref_cmd_to_parent;
1118
1119         TRACE_ENTRY();
1120
1121         iscsi_extracheck_is_wr_thread(conn);
1122
1123         if (write_cmnd->own_sg == 0) {
1124                 ref_cmd = write_cmnd->parent_req;
1125                 ref_cmd_to_parent = true;
1126         } else {
1127                 ref_cmd = write_cmnd;
1128                 ref_cmd_to_parent = false;
1129         }
1130
1131         if (!ref_cmd->on_written_list) {
1132                 TRACE_DBG("Adding cmd %p to conn %p written_list", ref_cmd,
1133                         conn);
1134                 spin_lock_bh(&conn->write_list_lock);
1135                 ref_cmd->on_written_list = 1;
1136                 ref_cmd->write_timeout = jiffies + ISCSI_RSP_TIMEOUT;
1137                 list_add_tail(&ref_cmd->written_list_entry,
1138                         &conn->written_list);
1139                 spin_unlock_bh(&conn->write_list_lock);
1140         }
1141
1142         if (!timer_pending(&conn->rsp_timer)) {
1143                 sBUG_ON(!ref_cmd->on_written_list);
1144                 spin_lock_bh(&conn->write_list_lock);
1145                 if (likely(!timer_pending(&conn->rsp_timer))) {
1146                         TRACE_DBG("Starting timer on %ld (conn %p)",
1147                                 ref_cmd->write_timeout, conn);
1148                         conn->rsp_timer.expires = ref_cmd->write_timeout;
1149                         add_timer(&conn->rsp_timer);
1150                 }
1151                 spin_unlock_bh(&conn->write_list_lock);
1152         }
1153
1154         file = conn->file;
1155         size = conn->write_size;
1156         saved_size = size;
1157         iop = conn->write_iop;
1158         count = conn->write_iop_used;
1159
1160         if (iop) {
1161                 while (1) {
1162                         loff_t off = 0;
1163                         int rest;
1164
1165                         sBUG_ON(count > (signed)(sizeof(conn->write_iov) /
1166                                                 sizeof(conn->write_iov[0])));
1167 retry:
1168                         oldfs = get_fs();
1169                         set_fs(KERNEL_DS);
1170                         res = vfs_writev(file,
1171                                          (struct iovec __force __user *)iop,
1172                                          count, &off);
1173                         set_fs(oldfs);
1174                         TRACE_WRITE("sid %#Lx, cid %u, res %d, iov_len %ld",
1175                                     (long long unsigned int)conn->session->sid,
1176                                     conn->cid, res, (long)iop->iov_len);
1177                         if (unlikely(res <= 0)) {
1178                                 if (res == -EAGAIN) {
1179                                         conn->write_iop = iop;
1180                                         conn->write_iop_used = count;
1181                                         goto out_iov;
1182                                 } else if (res == -EINTR)
1183                                         goto retry;
1184                                 goto out_err;
1185                         }
1186
1187                         rest = res;
1188                         size -= res;
1189                         while ((typeof(rest))iop->iov_len <= rest && rest) {
1190                                 rest -= iop->iov_len;
1191                                 iop++;
1192                                 count--;
1193                         }
1194                         if (count == 0) {
1195                                 conn->write_iop = NULL;
1196                                 conn->write_iop_used = 0;
1197                                 if (size)
1198                                         break;
1199                                 goto out_iov;
1200                         }
1201                         sBUG_ON(iop > conn->write_iov + sizeof(conn->write_iov)
1202                                                   /sizeof(conn->write_iov[0]));
1203                         iop->iov_base += rest;
1204                         iop->iov_len -= rest;
1205                 }
1206         }
1207
1208         sg = write_cmnd->sg;
1209         if (unlikely(sg == NULL)) {
1210                 PRINT_INFO("WARNING: Data missed (cmd %p)!", write_cmnd);
1211                 res = 0;
1212                 goto out;
1213         }
1214
1215         /* To protect against a too-early transfer completion race */
1216         __iscsi_get_page_callback(ref_cmd);
1217         do_put = true;
1218
1219         sock = conn->sock;
1220
1221 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1222         sock_sendpage = sock->ops->sendpage;
1223 #else
1224         if ((write_cmnd->parent_req->scst_cmd != NULL) &&
1225             scst_cmd_get_dh_data_buff_alloced(write_cmnd->parent_req->scst_cmd))
1226                 sock_sendpage = sock_no_sendpage;
1227         else
1228                 sock_sendpage = sock->ops->sendpage;
1229 #endif
1230
1231         flags = MSG_DONTWAIT;
1232         sg_size = size;
1233
1234         if (sg != write_cmnd->rsp_sg) {
1235                 offset = conn->write_offset + sg[0].offset;
1236                 idx = offset >> PAGE_SHIFT;
1237                 offset &= ~PAGE_MASK;
1238                 length = min(size, (int)PAGE_SIZE - offset);
1239                 TRACE_WRITE("write_offset %d, sg_size %d, idx %d, offset %d, "
1240                         "length %d", conn->write_offset, sg_size, idx, offset,
1241                         length);
1242         } else {
1243                 idx = 0;
1244                 offset = conn->write_offset;
1245                 while (offset >= sg[idx].length) {
1246                         offset -= sg[idx].length;
1247                         idx++;
1248                 }
1249                 length = sg[idx].length - offset;
1250                 offset += sg[idx].offset;
1251                 sock_sendpage = sock_no_sendpage;
1252                 TRACE_WRITE("rsp_sg: write_offset %d, sg_size %d, idx %d, "
1253                         "offset %d, length %d", conn->write_offset, sg_size,
1254                         idx, offset, length);
1255         }
1256         page = sg_page(&sg[idx]);
1257
1258         while (1) {
1259                 sendpage = sock_sendpage;
1260
1261 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1262                 {
1263                         static DEFINE_SPINLOCK(net_priv_lock);
1264                         spin_lock(&net_priv_lock);
1265                         if (unlikely(page->net_priv != NULL)) {
1266                                 if (page->net_priv != ref_cmd) {
1267                                         /*
1268                                          * This might happen if user space
1269                                          * supplies the same pages to
1270                                          * scst_user in different commands,
1271                                          * or in the case of zero-copy
1272                                          * FILEIO, when several initiators
1273                                          * request the same data simultaneously.
1274                                          */
1275                                         TRACE_DBG("net_priv isn't NULL and != "
1276                                             "ref_cmd (write_cmnd %p, ref_cmd "
1277                                             "%p, sg %p, idx %d, page %p, "
1278                                             "net_priv %p)",
1279                                             write_cmnd, ref_cmd, sg, idx,
1280                                             page, page->net_priv);
1281                                         sendpage = sock_no_sendpage;
1282                                 }
1283                         } else
1284                                 page->net_priv = ref_cmd;
1285                         spin_unlock(&net_priv_lock);
1286                 }
1287 #endif
1288                 sendsize = min(size, length);
1289                 if (size <= sendsize) {
1290 retry2:
1291                         res = sendpage(sock, page, offset, size, flags);
1292                         TRACE_WRITE("Final %s sid %#Lx, cid %u, res %d (page "
1293                                 "index %lu, offset %u, size %u, cmd %p, "
1294                                 "page %p)", (sendpage != sock_no_sendpage) ?
1295                                                 "sendpage" : "sock_no_sendpage",
1296                                 (long long unsigned int)conn->session->sid,
1297                                 conn->cid, res, page->index,
1298                                 offset, size, write_cmnd, page);
1299                         if (unlikely(res <= 0)) {
1300                                 if (res == -EINTR)
1301                                         goto retry2;
1302                                 else
1303                                         goto out_res;
1304                         }
1305
1306                         check_net_priv(ref_cmd, page);
1307                         if (res == size) {
1308                                 conn->write_size = 0;
1309                                 res = saved_size;
1310                                 goto out_put;
1311                         }
1312
1313                         offset += res;
1314                         size -= res;
1315                         goto retry2;
1316                 }
1317
1318 retry1:
1319                 res = sendpage(sock, page, offset, sendsize, flags | MSG_MORE);
1320                 TRACE_WRITE("%s sid %#Lx, cid %u, res %d (page index %lu, "
1321                         "offset %u, sendsize %u, size %u, cmd %p, page %p)",
1322                         (sendpage != sock_no_sendpage) ? "sendpage" :
1323                                                          "sock_no_sendpage",
1324                         (unsigned long long)conn->session->sid, conn->cid,
1325                         res, page->index, offset, sendsize, size,
1326                         write_cmnd, page);
1327                 if (unlikely(res <= 0)) {
1328                         if (res == -EINTR)
1329                                 goto retry1;
1330                         else
1331                                 goto out_res;
1332                 }
1333
1334                 check_net_priv(ref_cmd, page);
1335
1336                 size -= res;
1337
1338                 if (res == sendsize) {
1339                         idx++;
1340                         EXTRACHECKS_BUG_ON(idx >= ref_cmd->sg_cnt);
1341                         page = sg_page(&sg[idx]);
1342                         length = sg[idx].length;
1343                         offset = sg[idx].offset;
1344                 } else {
1345                         offset += res;
1346                         sendsize -= res;
1347                         goto retry1;
1348                 }
1349         }
1350
1351 out_off:
1352         conn->write_offset += sg_size - size;
1353
1354 out_iov:
1355         conn->write_size = size;
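             /*
              * If nothing was sent at all and the socket returned -EAGAIN,
              * propagate -EAGAIN; otherwise report how many bytes were sent.
              */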
1356         if ((saved_size == size) && (res == -EAGAIN))
1357                 goto out_put;
1358
1359         res = saved_size - size;
1360
1361 out_put:
1362         if (do_put)
1363                 __iscsi_put_page_callback(ref_cmd);
1364
1365 out:
1366         TRACE_EXIT_RES(res);
1367         return res;
1368
1369 out_res:
1370         check_net_priv(ref_cmd, page);
1371         if (res == -EAGAIN)
1372                 goto out_off;
1373         /* else fall through */
1374
1375 out_err:
1376 #ifndef CONFIG_SCST_DEBUG
1377         if (!conn->closing)
1378 #endif
1379         {
1380                 PRINT_ERROR("error %d at sid:cid %#Lx:%u, cmnd %p", res,
1381                             (long long unsigned int)conn->session->sid,
1382                             conn->cid, conn->write_cmnd);
1383         }
1384         if (ref_cmd_to_parent &&
1385             ((ref_cmd->scst_cmd != NULL) || (ref_cmd->scst_aen != NULL))) {
1386                 if (ref_cmd->scst_state == ISCSI_CMD_STATE_AEN)
1387                         scst_set_aen_delivery_status(ref_cmd->scst_aen,
1388                                 SCST_AEN_RES_FAILED);
1389                 else
1390                         scst_set_delivery_status(ref_cmd->scst_cmd,
1391                                 SCST_CMD_DELIVERY_FAILED);
1392         }
1393         goto out_put;
1394 }
1395
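     /*
      * Translate a send result into the connection's fate: -EAGAIN and
      * -ERESTARTSYS are treated as transient (report 0, retry later), any
      * other error closes the connection.
      */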
1396 static int exit_tx(struct iscsi_conn *conn, int res)
1397 {
1398         iscsi_extracheck_is_wr_thread(conn);
1399
1400         switch (res) {
1401         case -EAGAIN:
1402         case -ERESTARTSYS:
1403                 res = 0;
1404                 break;
1405         default:
1406 #ifndef CONFIG_SCST_DEBUG
1407                 if (!conn->closing)
1408 #endif
1409                 {
1410                         PRINT_ERROR("Sending data failed: initiator %s, "
1411                                 "write_size %d, write_state %d, res %d",
1412                                 conn->session->initiator_name,
1413                                 conn->write_size,
1414                                 conn->write_state, res);
1415                 }
1416                 conn->write_state = TX_END;
1417                 conn->write_size = 0;
1418                 mark_conn_closed(conn);
1419                 break;
1420         }
1421         return res;
1422 }
1423
1424 static int tx_ddigest(struct iscsi_cmnd *cmnd, int state)
1425 {
1426         int res, rest = cmnd->conn->write_size;
1427         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1428         struct kvec iov;
1429
1430         iscsi_extracheck_is_wr_thread(cmnd->conn);
1431
1432         TRACE_DBG("Sending data digest %x (cmd %p)", cmnd->ddigest, cmnd);
1433
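             /*
              * write_size counts the digest bytes still to send, so start
              * inside the 4-byte digest if an earlier send was partial.
              */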
1434         iov.iov_base = (char *)(&cmnd->ddigest) + (sizeof(u32) - rest);
1435         iov.iov_len = rest;
1436
1437         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1438         if (res > 0) {
1439                 cmnd->conn->write_size -= res;
1440                 if (!cmnd->conn->write_size)
1441                         cmnd->conn->write_state = state;
1442         } else
1443                 res = exit_tx(cmnd->conn, res);
1444
1445         return res;
1446 }
1447
1448 static void init_tx_hdigest(struct iscsi_cmnd *cmnd)
1449 {
1450         struct iscsi_conn *conn = cmnd->conn;
1451         struct iovec *iop;
1452
1453         iscsi_extracheck_is_wr_thread(conn);
1454
1455         digest_tx_header(cmnd);
1456
1457         sBUG_ON(conn->write_iop_used >=
1458                 (signed)(sizeof(conn->write_iov)/sizeof(conn->write_iov[0])));
1459
1460         iop = &conn->write_iop[conn->write_iop_used];
1461         conn->write_iop_used++;
1462         iop->iov_base = (void __force __user *)&(cmnd->hdigest);
1463         iop->iov_len = sizeof(u32);
1464         conn->write_size += sizeof(u32);
1465
1466         return;
1467 }
1468
1469 static int tx_padding(struct iscsi_cmnd *cmnd, int state)
1470 {
1471         int res, rest = cmnd->conn->write_size;
1472         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1473         struct kvec iov;
1474         static const uint32_t padding; /* zero-initialized: pad bytes are zeros */
1475
1476         iscsi_extracheck_is_wr_thread(cmnd->conn);
1477
1478         TRACE_DBG("Sending %d padding bytes (cmd %p)", rest, cmnd);
1479
1480         iov.iov_base = (char *)(&padding) + (sizeof(uint32_t) - rest);
1481         iov.iov_len = rest;
1482
1483         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1484         if (res > 0) {
1485                 cmnd->conn->write_size -= res;
1486                 if (!cmnd->conn->write_size)
1487                         cmnd->conn->write_state = state;
1488         } else
1489                 res = exit_tx(cmnd->conn, res);
1490
1491         return res;
1492 }
1493
1494 static int iscsi_do_send(struct iscsi_conn *conn, int state)
1495 {
1496         int res;
1497
1498         iscsi_extracheck_is_wr_thread(conn);
1499
1500         res = write_data(conn);
1501         if (res > 0) {
1502                 if (!conn->write_size)
1503                         conn->write_state = state;
1504         } else
1505                 res = exit_tx(conn, res);
1506
1507         return res;
1508 }
1509
1510 /*
1511  * No locks, conn is wr processing.
1512  *
1513  * IMPORTANT! Connection conn must be protected by an additional conn_get()
1514  * upon entry to this function, because otherwise it could be destroyed
1515  * from inside it as a result of cmnd release.
1516  */
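     /*
      * Typical caller pattern (cf. scst_do_job_wr() below, which reaches this
      * function via process_write_queue()):
      *
      *      conn_get(conn);
      *      res = iscsi_send(conn);
      *      conn_put(conn);
      */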
1517 int iscsi_send(struct iscsi_conn *conn)
1518 {
1519         struct iscsi_cmnd *cmnd = conn->write_cmnd;
1520         int ddigest, res = 0;
1521
1522         TRACE_ENTRY();
1523
1524         TRACE_DBG("conn %p, write_cmnd %p", conn, cmnd);
1525
1526         iscsi_extracheck_is_wr_thread(conn);
1527
1528         ddigest = conn->ddigest_type != DIGEST_NONE ? 1 : 0;
1529
1530         switch (conn->write_state) {
1531         case TX_INIT:
1532                 sBUG_ON(cmnd != NULL);
1533                 cmnd = conn->write_cmnd = iscsi_get_send_cmnd(conn);
1534                 if (!cmnd)
1535                         goto out;
1536                 cmnd_tx_start(cmnd);
1537                 if (!(conn->hdigest_type & DIGEST_NONE))
1538                         init_tx_hdigest(cmnd);
1539                 conn->write_state = TX_BHS_DATA;
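                     /* fall through */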
1540         case TX_BHS_DATA:
1541                 res = iscsi_do_send(conn, cmnd->pdu.datasize ?
1542                                         TX_INIT_PADDING : TX_END);
1543                 if (res <= 0 || conn->write_state != TX_INIT_PADDING)
1544                         break;
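                     /* fall through */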
1545         case TX_INIT_PADDING:
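                     /*
                      * ((datasize + 3) & -4) rounds the data segment up to the
                      * 4-byte boundary required by iSCSI, so write_size is the
                      * number of pad bytes (0-3), e.g. datasize 13 needs 3.
                      */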
1546                 cmnd->conn->write_size = ((cmnd->pdu.datasize + 3) & -4) -
1547                                                 cmnd->pdu.datasize;
1548                 if (cmnd->conn->write_size != 0)
1549                         conn->write_state = TX_PADDING;
1550                 else if (ddigest)
1551                         conn->write_state = TX_INIT_DDIGEST;
1552                 else
1553                         conn->write_state = TX_END;
1554                 break;
1555         case TX_PADDING:
1556                 res = tx_padding(cmnd, ddigest ? TX_INIT_DDIGEST : TX_END);
1557                 if (res <= 0 || conn->write_state != TX_INIT_DDIGEST)
1558                         break;
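                     /* fall through */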
1559         case TX_INIT_DDIGEST:
1560                 cmnd->conn->write_size = sizeof(u32);
1561                 conn->write_state = TX_DDIGEST;
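                     /* fall through */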
1562         case TX_DDIGEST:
1563                 res = tx_ddigest(cmnd, TX_END);
1564                 break;
1565         default:
1566                 PRINT_CRIT_ERROR("%d %d %x", res, conn->write_state,
1567                         cmnd_opcode(cmnd));
1568                 sBUG();
1569         }
1570
1571         if (res == 0)
1572                 goto out;
1573
1574         if (conn->write_state != TX_END)
1575                 goto out;
1576
1577         if (unlikely(conn->write_size)) {
1578                 PRINT_CRIT_ERROR("%d %x %u", res, cmnd_opcode(cmnd),
1579                         conn->write_size);
1580                 sBUG();
1581         }
1582         cmnd_tx_end(cmnd);
1583
1584         rsp_cmnd_release(cmnd);
1585
1586         conn->write_cmnd = NULL;
1587         conn->write_state = TX_INIT;
1588
1589 out:
1590         TRACE_EXIT_RES(res);
1591         return res;
1592 }
1593
1594 /* No locks, conn is wr processing.
1595  *
1596  * IMPORTANT! Connection conn must be protected by an additional conn_get()
1597  * upon entry to this function, because otherwise it could be destroyed
1598  * from inside it by iscsi_send(), which releases sent commands.
1599  */
1600 static int process_write_queue(struct iscsi_conn *conn)
1601 {
1602         int res = 0;
1603
1604         TRACE_ENTRY();
1605
1606         if (likely(test_write_ready(conn)))
1607                 res = iscsi_send(conn);
1608
1609         TRACE_EXIT_RES(res);
1610         return res;
1611 }
1612
1613 /*
1614  * Called under iscsi_wr_lock and BHs disabled, but will drop it inside,
1615  * then reacquire it.
1616  */
1617 static void scst_do_job_wr(void)
1618         __acquires(&iscsi_wr_lock)
1619         __releases(&iscsi_wr_lock)
1620 {
1621         TRACE_ENTRY();
1622
1623         /*
1624          * We remove connections and re-add them at the tail to maintain fairness between them.
1625          */
1626
1627         while (!list_empty(&iscsi_wr_list)) {
1628                 int rc;
1629                 struct iscsi_conn *conn = list_entry(iscsi_wr_list.next,
1630                         typeof(*conn), wr_list_entry);
1631
1632                 TRACE_DBG("conn %p, wr_state %x, wr_space_ready %d, "
1633                         "write ready %d", conn, conn->wr_state,
1634                         conn->wr_space_ready, test_write_ready(conn));
1635
1636                 list_del(&conn->wr_list_entry);
1637
1638                 sBUG_ON(conn->wr_state == ISCSI_CONN_WR_STATE_PROCESSING);
1639
1640                 conn->wr_state = ISCSI_CONN_WR_STATE_PROCESSING;
1641                 conn->wr_space_ready = 0;
1642 #ifdef CONFIG_SCST_EXTRACHECKS
1643                 conn->wr_task = current;
1644 #endif
1645                 spin_unlock_bh(&iscsi_wr_lock);
1646
1647                 conn_get(conn);
1648
1649                 rc = process_write_queue(conn);
1650
1651                 spin_lock_bh(&iscsi_wr_lock);
1652 #ifdef CONFIG_SCST_EXTRACHECKS
1653                 conn->wr_task = NULL;
1654 #endif
1655                 if ((rc == -EAGAIN) && !conn->wr_space_ready) {
1656                         conn->wr_state = ISCSI_CONN_WR_STATE_SPACE_WAIT;
1657                         goto cont;
1658                 }
1659
1660                 if (test_write_ready(conn)) {
1661                         list_add_tail(&conn->wr_list_entry, &iscsi_wr_list);
1662                         conn->wr_state = ISCSI_CONN_WR_STATE_IN_LIST;
1663                 } else
1664                         conn->wr_state = ISCSI_CONN_WR_STATE_IDLE;
1665
1666 cont:
1667                 conn_put(conn);
1668         }
1669
1670         TRACE_EXIT();
1671         return;
1672 }
1673
1674 static inline int test_wr_list(void)
1675 {
1676         int res = !list_empty(&iscsi_wr_list) ||
1677                   unlikely(kthread_should_stop());
1678         return res;
1679 }
1680
1681 int istwr(void *arg)
1682 {
1683         TRACE_ENTRY();
1684
1685         PRINT_INFO("Write thread started, PID %d", current->pid);
1686
1687         current->flags |= PF_NOFREEZE;
1688
1689         spin_lock_bh(&iscsi_wr_lock);
1690         while (!kthread_should_stop()) {
1691                 wait_queue_t wait;
1692                 init_waitqueue_entry(&wait, current);
1693
1694                 if (!test_wr_list()) {
1695                         add_wait_queue_exclusive_head(&iscsi_wr_waitQ, &wait);
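                             /*
                              * Open-coded wait: mark the task INTERRUPTIBLE,
                              * re-check the condition, and only then drop the
                              * lock and schedule(), so a wake-up between the
                              * check and schedule() cannot be lost.
                              */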
1696                         for (;;) {
1697                                 set_current_state(TASK_INTERRUPTIBLE);
1698                                 if (test_wr_list())
1699                                         break;
1700                                 spin_unlock_bh(&iscsi_wr_lock);
1701                                 schedule();
1702                                 spin_lock_bh(&iscsi_wr_lock);
1703                         }
1704                         set_current_state(TASK_RUNNING);
1705                         remove_wait_queue(&iscsi_wr_waitQ, &wait);
1706                 }
1707                 scst_do_job_wr();
1708         }
1709         spin_unlock_bh(&iscsi_wr_lock);
1710
1711         /*
1712          * If kthread_should_stop() is true, we are guaranteed to be in
1713          * the module unload path, so iscsi_wr_list must be empty.
1714          */
1715         sBUG_ON(!list_empty(&iscsi_wr_list));
1716
1717         PRINT_INFO("Write thread PID %d finished", current->pid);
1718
1719         TRACE_EXIT();
1720         return 0;
1721 }