iscsi-scst/kernel/nthread.c
1 /*
2  *  Network threads.
3  *
4  *  Copyright (C) 2004 - 2005 FUJITA Tomonori <tomof@acm.org>
5  *  Copyright (C) 2007 - 2009 Vladislav Bolkhovitin
6  *  Copyright (C) 2007 - 2009 ID7 Ltd.
7  *
8  *  This program is free software; you can redistribute it and/or
9  *  modify it under the terms of the GNU General Public License
10  *  as published by the Free Software Foundation.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  *  GNU General Public License for more details.
16  */
17
18 #include <linux/sched.h>
19 #include <linux/file.h>
20 #include <linux/kthread.h>
21 #include <asm/ioctls.h>
22 #include <linux/delay.h>
23 #include <net/tcp.h>
24
25 #include "iscsi.h"
26 #include "digest.h"
27
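/*
 * Receive-side states, driven by process_read_io(): RX_INIT_BHS allocates
 * a command and starts reading the fixed-size BHS; RX_AHS and
 * RX_INIT_HDIGEST/RX_CHECK_HDIGEST optionally read the AHS and the header
 * digest; RX_CMD_START/RX_CMD_CONTINUE parse the command and set up its
 * data buffers; RX_DATA, RX_PADDING and RX_INIT_DDIGEST/RX_CHECK_DDIGEST
 * read the payload, the pad bytes and the data digest; RX_END finishes
 * the PDU.
 */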
28 enum rx_state {
29         RX_INIT_BHS, /* Must be zero for better "switch" optimization. */
30         RX_BHS,
31         RX_CMD_START,
32         RX_DATA,
33         RX_END,
34
35         RX_CMD_CONTINUE,
36         RX_INIT_HDIGEST,
37         RX_CHECK_HDIGEST,
38         RX_INIT_DDIGEST,
39         RX_CHECK_DDIGEST,
40         RX_AHS,
41         RX_PADDING,
42 };
43
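/*
 * Transmit-side counterpart: roughly, TX_INIT picks the next response to
 * send (see iscsi_get_send_cmnd()), TX_BHS_DATA pushes the header and
 * payload through write_data(), TX_INIT_PADDING/TX_PADDING and
 * TX_INIT_DDIGEST/TX_DDIGEST append the pad bytes and the data digest,
 * and TX_END completes the PDU.
 */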
44 enum tx_state {
45         TX_INIT = 0, /* Must be zero for better "switch" optimization. */
46         TX_BHS_DATA,
47         TX_INIT_PADDING,
48         TX_PADDING,
49         TX_INIT_DDIGEST,
50         TX_DDIGEST,
51         TX_END,
52 };
53
54 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
55 static void iscsi_check_closewait(struct iscsi_conn *conn)
56 {
57         struct iscsi_cmnd *cmnd;
58
59         TRACE_ENTRY();
60
61         TRACE_CONN_CLOSE_DBG("conn %p, sk_state %d", conn,
62                 conn->sock->sk->sk_state);
63
64         if (conn->sock->sk->sk_state != TCP_CLOSE) {
65                 TRACE_CONN_CLOSE_DBG("conn %p, skipping", conn);
66                 goto out;
67         }
68
69         /*
70          * No data are going to be sent, so all queued buffers can be freed
71          * now. In many cases TCP does that only in close(), but we can't
72          * rely on user space to call it.
73          */
74
75 again:
76         spin_lock_bh(&conn->cmd_list_lock);
77         list_for_each_entry(cmnd, &conn->cmd_list, cmd_list_entry) {
78                 struct iscsi_cmnd *rsp;
79                 int restart = 0;
80
81                 TRACE_CONN_CLOSE_DBG("cmd %p, scst_state %x, data_waiting %d, "
82                         "ref_cnt %d, parent_req %p, net_ref_cnt %d, sg %p",
83                         cmnd, cmnd->scst_state, cmnd->data_waiting,
84                         atomic_read(&cmnd->ref_cnt), cmnd->parent_req,
85                         atomic_read(&cmnd->net_ref_cnt), cmnd->sg);
86
87                 sBUG_ON(cmnd->parent_req != NULL);
88
89                 if (cmnd->sg != NULL) {
90                         int i;
91
92                         if (cmnd_get_check(cmnd))
93                                 continue;
94
95                         for (i = 0; i < cmnd->sg_cnt; i++) {
96                                 struct page *page = sg_page(&cmnd->sg[i]);
97                                 TRACE_CONN_CLOSE_DBG("page %p, net_priv %p, "
98                                         "_count %d", page, page->net_priv,
99                                         atomic_read(&page->_count));
100
101                                 if (page->net_priv != NULL) {
102                                         if (restart == 0) {
103                                                 spin_unlock_bh(&conn->cmd_list_lock);
104                                                 restart = 1;
105                                         }
106                                         while (page->net_priv != NULL)
107                                                 iscsi_put_page_callback(page);
108                                 }
109                         }
110                         cmnd_put(cmnd);
111
112                         if (restart)
113                                 goto again;
114                 }
115
116                 spin_lock_bh(&cmnd->rsp_cmd_lock);
117                 list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
118                                 rsp_cmd_list_entry) {
119                         TRACE_CONN_CLOSE_DBG("  rsp %p, ref_cnt %d, "
120                                 "net_ref_cnt %d, sg %p",
121                                 rsp, atomic_read(&rsp->ref_cnt),
122                                 atomic_read(&rsp->net_ref_cnt), rsp->sg);
123
124                         if ((rsp->sg != cmnd->sg) && (rsp->sg != NULL)) {
125                                 int i;
126
127                                 if (cmnd_get_check(rsp))
128                                         continue;
129
130                                 for (i = 0; i < rsp->sg_cnt; i++) {
131                                         struct page *page =
132                                                 sg_page(&rsp->sg[i]);
133                                         TRACE_CONN_CLOSE_DBG(
134                                                 "    page %p, net_priv %p, "
135                                                 "_count %d",
136                                                 page, page->net_priv,
137                                                 atomic_read(&page->_count));
138
139                                         if (page->net_priv != NULL) {
140                                                 if (restart == 0) {
141                                                         spin_unlock_bh(&cmnd->rsp_cmd_lock);
142                                                         spin_unlock_bh(&conn->cmd_list_lock);
143                                                         restart = 1;
144                                                 }
145                                                 while (page->net_priv != NULL)
146                                                         iscsi_put_page_callback(page);
147                                         }
148                                 }
149                                 cmnd_put(rsp);
150
151                                 if (restart)
152                                         goto again;
153                         }
154                 }
155                 spin_unlock_bh(&cmnd->rsp_cmd_lock);
156         }
157         spin_unlock_bh(&conn->cmd_list_lock);
158
159 out:
160         TRACE_EXIT();
161         return;
162 }
163 #else
164 static inline void iscsi_check_closewait(struct iscsi_conn *conn) {}
165 #endif
166
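/*
 * Release the pending commands of this connection that are next in the
 * session-wide cmd_sn order. sn_lock is dropped around
 * req_cmnd_release_force(), so the list walk restarts from the beginning
 * after every released command.
 */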
167 static void free_pending_commands(struct iscsi_conn *conn)
168 {
169         struct iscsi_session *session = conn->session;
170         struct list_head *pending_list = &session->pending_list;
171         int req_freed;
172         struct iscsi_cmnd *cmnd;
173
174         spin_lock(&session->sn_lock);
175         do {
176                 req_freed = 0;
177                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
178                         TRACE_CONN_CLOSE_DBG("Pending cmd %p "
179                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
180                                 cmnd, conn, cmnd->pdu.bhs.sn,
181                                 session->exp_cmd_sn);
182                         if ((cmnd->conn == conn) &&
183                             (session->exp_cmd_sn == cmnd->pdu.bhs.sn)) {
184                                 TRACE_CONN_CLOSE_DBG("Freeing pending cmd %p",
185                                         cmnd);
186
187                                 list_del(&cmnd->pending_list_entry);
188                                 cmnd->pending = 0;
189
190                                 session->exp_cmd_sn++;
191
192                                 spin_unlock(&session->sn_lock);
193
194                                 req_cmnd_release_force(cmnd, 0);
195
196                                 req_freed = 1;
197                                 spin_lock(&session->sn_lock);
198                                 break;
199                         }
200                 }
201         } while (req_freed);
202         spin_unlock(&session->sn_lock);
203
204         return;
205 }
206
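/*
 * Same as free_pending_commands(), but releases every remaining pending
 * command of this connection regardless of its cmd_sn, reporting each one
 * as orphaned. Called by close_conn() once CONN_PENDING_TIMEOUT has
 * expired.
 */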
207 static void free_orphaned_pending_commands(struct iscsi_conn *conn)
208 {
209         struct iscsi_session *session = conn->session;
210         struct list_head *pending_list = &session->pending_list;
211         int req_freed;
212         struct iscsi_cmnd *cmnd;
213
214         spin_lock(&session->sn_lock);
215         do {
216                 req_freed = 0;
217                 list_for_each_entry(cmnd, pending_list, pending_list_entry) {
218                         TRACE_CONN_CLOSE_DBG("Pending cmd %p "
219                                 "(conn %p, cmd_sn %u, exp_cmd_sn %u)",
220                                 cmnd, conn, cmnd->pdu.bhs.sn,
221                                 session->exp_cmd_sn);
222                         if (cmnd->conn == conn) {
223                                 PRINT_ERROR("Freeing orphaned pending cmd %p",
224                                             cmnd);
225
226                                 list_del(&cmnd->pending_list_entry);
227                                 cmnd->pending = 0;
228
229                                 if (session->exp_cmd_sn == cmnd->pdu.bhs.sn)
230                                         session->exp_cmd_sn++;
231
232                                 spin_unlock(&session->sn_lock);
233
234                                 req_cmnd_release_force(cmnd, 0);
235
236                                 req_freed = 1;
237                                 spin_lock(&session->sn_lock);
238                                 break;
239                         }
240                 }
241         } while (req_freed);
242         spin_unlock(&session->sn_lock);
243
244         return;
245 }
246
247 #ifdef CONFIG_SCST_DEBUG
248 static void trace_conn_close(struct iscsi_conn *conn)
249 {
250         struct iscsi_cmnd *cmnd;
251 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
252         struct iscsi_cmnd *rsp;
253 #endif
254
255 #if 0
256         if (time_after(jiffies, start_waiting + 10*HZ))
257                 trace_flag |= TRACE_CONN_OC_DBG;
258 #endif
259
260         spin_lock_bh(&conn->cmd_list_lock);
261         list_for_each_entry(cmnd, &conn->cmd_list,
262                         cmd_list_entry) {
263                 TRACE_CONN_CLOSE_DBG(
264                         "cmd %p, scst_state %x, scst_cmd state %d, "
265                         "data_waiting %d, ref_cnt %d, sn %u, "
266                         "parent_req %p, pending %d",
267                         cmnd, cmnd->scst_state,
268                         (cmnd->parent_req && cmnd->scst_cmd) ?
269                                 cmnd->scst_cmd->state : -1,
270                         cmnd->data_waiting, atomic_read(&cmnd->ref_cnt),
271                         cmnd->pdu.bhs.sn, cmnd->parent_req, cmnd->pending);
272 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
273                 TRACE_CONN_CLOSE_DBG("net_ref_cnt %d, sg %p",
274                         atomic_read(&cmnd->net_ref_cnt),
275                         cmnd->sg);
276                 if (cmnd->sg != NULL) {
277                         int i;
278                         for (i = 0; i < cmnd->sg_cnt; i++) {
279                                 struct page *page = sg_page(&cmnd->sg[i]);
280                                 TRACE_CONN_CLOSE_DBG("page %p, "
281                                         "net_priv %p, _count %d",
282                                         page, page->net_priv,
283                                         atomic_read(&page->_count));
284                         }
285                 }
286
287                 sBUG_ON(cmnd->parent_req != NULL);
288
289                 spin_lock_bh(&cmnd->rsp_cmd_lock);
290                 list_for_each_entry(rsp, &cmnd->rsp_cmd_list,
291                                 rsp_cmd_list_entry) {
292                         TRACE_CONN_CLOSE_DBG("  rsp %p, "
293                             "ref_cnt %d, net_ref_cnt %d, sg %p",
294                             rsp, atomic_read(&rsp->ref_cnt),
295                             atomic_read(&rsp->net_ref_cnt), rsp->sg);
296                         if (rsp->sg != cmnd->sg && rsp->sg) {
297                                 int i;
298                                 for (i = 0; i < rsp->sg_cnt; i++) {
299                                         TRACE_CONN_CLOSE_DBG("    page %p, "
300                                           "net_priv %p, _count %d",
301                                           sg_page(&rsp->sg[i]),
302                                           sg_page(&rsp->sg[i])->net_priv,
303                                           atomic_read(&sg_page(&rsp->sg[i])->
304                                                 _count));
305                                 }
306                         }
307                 }
308                 spin_unlock_bh(&cmnd->rsp_cmd_lock);
309 #endif /* CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION */
310         }
311         spin_unlock_bh(&conn->cmd_list_lock);
312         return;
313 }
314 #else /* CONFIG_SCST_DEBUG */
315 static void trace_conn_close(struct iscsi_conn *conn) {}
316 #endif /* CONFIG_SCST_DEBUG */
317
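/*
 * Invoked by SCST when all commands affected by a task management function
 * have been finished. For the session-wide functions issued from
 * close_conn() it marks the session as shutting down (unless some other
 * connection is not shutting down yet), finishes a pending connection or
 * session reinstatement, and signals close_conn() via conn->ready_to_free.
 */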
318 void iscsi_task_mgmt_affected_cmds_done(struct scst_mgmt_cmd *scst_mcmd)
319 {
320         int fn = scst_mgmt_cmd_get_fn(scst_mcmd);
321         void *priv = scst_mgmt_cmd_get_tgt_priv(scst_mcmd);
322
323         TRACE_MGMT_DBG("scst_mcmd %p, fn %d, priv %p", scst_mcmd, fn, priv);
324
325         switch (fn) {
326         case SCST_NEXUS_LOSS_SESS:
327         case SCST_ABORT_ALL_TASKS_SESS:
328         {
329                 struct iscsi_conn *conn = (struct iscsi_conn *)priv;
330                 struct iscsi_session *sess = conn->session;
331                 struct iscsi_conn *c;
332
333                 mutex_lock(&sess->target->target_mutex);
334
335                 /*
336                  * We can't mark sess as shutting down earlier, because until
337                  * now it might have had pending commands. Otherwise, in case
338                  * of reinstatement, data corruption could occur, because
339                  * commands from the session being reinstated could be executed
340                  * after commands from the new session.
341                  */
342                 sess->sess_shutting_down = 1;
343                 list_for_each_entry(c, &sess->conn_list, conn_list_entry) {
344                         if (!test_bit(ISCSI_CONN_SHUTTINGDOWN, &c->conn_aflags)) {
345                                 sess->sess_shutting_down = 0;
346                                 break;
347                         }
348                 }
349
350                 if (conn->conn_reinst_successor != NULL) {
351                         sBUG_ON(!test_bit(ISCSI_CONN_REINSTATING,
352                                   &conn->conn_reinst_successor->conn_aflags));
353                         conn_reinst_finished(conn->conn_reinst_successor);
354                         conn->conn_reinst_successor = NULL;
355                 } else if (sess->sess_reinst_successor != NULL) {
356                         sess_reinst_finished(sess->sess_reinst_successor);
357                         sess->sess_reinst_successor = NULL;
358                 }
359                 mutex_unlock(&sess->target->target_mutex);
360
361                 complete_all(&conn->ready_to_free);
362                 break;
363         }
364         default:
365                 /* Nothing to do */
366                 break;
367         }
368
369         return;
370 }
371
372 /* No locks */
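/*
 * Connection teardown, run either in the read thread or in a dedicated
 * "iscsi_conn_cleanup" kthread (see start_close_conn()). It shuts down the
 * socket, aborts all outstanding commands through SCST task management,
 * waits (with the timeouts defined below) for conn_ref_cnt to drain while
 * disposing of pending commands and delayed TM responses, restores the
 * original socket callbacks, waits for the write thread to go idle, and
 * finally notifies user space with an E_CONN_CLOSE event.
 */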
373 static void close_conn(struct iscsi_conn *conn)
374 {
375         struct iscsi_session *session = conn->session;
376         struct iscsi_target *target = conn->target;
377         typeof(jiffies) start_waiting = jiffies;
378         typeof(jiffies) shut_start_waiting = start_waiting;
379         bool pending_reported = 0, wait_expired = 0, shut_expired = 0;
380         bool reinst;
381
382 #define CONN_PENDING_TIMEOUT    ((typeof(jiffies))10*HZ)
383 #define CONN_WAIT_TIMEOUT       ((typeof(jiffies))10*HZ)
384 #define CONN_REG_SHUT_TIMEOUT   ((typeof(jiffies))125*HZ)
385 #define CONN_DEL_SHUT_TIMEOUT   ((typeof(jiffies))10*HZ)
386
387         TRACE_ENTRY();
388
389         TRACE_CONN_CLOSE("Closing connection %p (conn_ref_cnt=%d)", conn,
390                 atomic_read(&conn->conn_ref_cnt));
391
392         iscsi_extracheck_is_rd_thread(conn);
393
394         sBUG_ON(!conn->closing);
395
396         if (conn->active_close) {
397                 /* We want all our already issued send operations to complete */
398                 conn->sock->ops->shutdown(conn->sock, RCV_SHUTDOWN);
399         } else {
400                 conn->sock->ops->shutdown(conn->sock,
401                         RCV_SHUTDOWN|SEND_SHUTDOWN);
402         }
403
404         mutex_lock(&session->target->target_mutex);
405
406         set_bit(ISCSI_CONN_SHUTTINGDOWN, &conn->conn_aflags);
407         reinst = (conn->conn_reinst_successor != NULL);
408
409         mutex_unlock(&session->target->target_mutex);
410
411         if (reinst) {
412                 int rc;
413                 int lun = 0;
414
415                 /* Abort all outstanding commands */
416                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
417                         SCST_ABORT_ALL_TASKS_SESS, (uint8_t *)&lun, sizeof(lun),
418                         SCST_NON_ATOMIC, conn);
419                 if (rc != 0)
420                         PRINT_ERROR("SCST_ABORT_ALL_TASKS_SESS failed %d", rc);
421         } else {
422                 int rc;
423                 int lun = 0;
424
425                 rc = scst_rx_mgmt_fn_lun(session->scst_sess,
426                         SCST_NEXUS_LOSS_SESS, (uint8_t *)&lun, sizeof(lun),
427                         SCST_NON_ATOMIC, conn);
428                 if (rc != 0)
429                         PRINT_ERROR("SCST_NEXUS_LOSS_SESS failed %d", rc);
430         }
431
432         if (conn->read_state != RX_INIT_BHS) {
433                 struct iscsi_cmnd *cmnd = conn->read_cmnd;
434
435                 if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
436                         TRACE_DBG("Going to wait for cmnd %p to change state "
437                                 "from RX_CMD", cmnd);
438                 }
439                 wait_event(conn->read_state_waitQ,
440                         cmnd->scst_state != ISCSI_CMD_STATE_RX_CMD);
441
442                 conn->read_cmnd = NULL;
443                 conn->read_state = RX_INIT_BHS;
444                 req_cmnd_release_force(cmnd, 0);
445         }
446
447         conn_abort(conn);
448
449         /* ToDo: not the best way to wait */
450         while (atomic_read(&conn->conn_ref_cnt) != 0) {
451                 mutex_lock(&target->target_mutex);
452                 spin_lock(&session->sn_lock);
453                 if (session->tm_rsp && session->tm_rsp->conn == conn) {
454                         struct iscsi_cmnd *tm_rsp = session->tm_rsp;
455                         TRACE(TRACE_MGMT_MINOR, "Dropping delayed TM rsp %p",
456                                 tm_rsp);
457                         session->tm_rsp = NULL;
458                         session->tm_active--;
459                         WARN_ON(session->tm_active < 0);
460                         spin_unlock(&session->sn_lock);
461                         mutex_unlock(&target->target_mutex);
462
463                         rsp_cmnd_release(tm_rsp);
464                 } else {
465                         spin_unlock(&session->sn_lock);
466                         mutex_unlock(&target->target_mutex);
467                 }
468
469                 /* It's safe to check it without sn_lock */
470                 if (!list_empty(&session->pending_list)) {
471                         TRACE_CONN_CLOSE_DBG("Disposing pending commands on "
472                                 "connection %p (conn_ref_cnt=%d)", conn,
473                                 atomic_read(&conn->conn_ref_cnt));
474
475                         free_pending_commands(conn);
476
477                         if (time_after(jiffies,
478                                 start_waiting + CONN_PENDING_TIMEOUT)) {
479                                 if (!pending_reported) {
480                                         TRACE_CONN_CLOSE("%s",
481                                                 "Pending wait time expired");
482                                         pending_reported = 1;
483                                 }
484                                 free_orphaned_pending_commands(conn);
485                         }
486                 }
487
488                 iscsi_make_conn_wr_active(conn);
489
490                 /* That's for active close only, actually */
491                 if (time_after(jiffies, start_waiting + CONN_WAIT_TIMEOUT) &&
492                     !wait_expired) {
493                         TRACE_CONN_CLOSE("Wait time expired (conn %p, "
494                                 "sk_state %d)",
495                                 conn, conn->sock->sk->sk_state);
496                         conn->sock->ops->shutdown(conn->sock, SEND_SHUTDOWN);
497                         wait_expired = 1;
498                         shut_start_waiting = jiffies;
499                 }
500
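                /*
                 * If shutdown() did not move things along, force a TCP
                 * disconnect after an additional timeout (a shorter one when
                 * the connection is being deleted).
                 */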
501                 if (wait_expired && !shut_expired &&
502                     time_after(jiffies, shut_start_waiting +
503                                 (conn->deleting ? CONN_DEL_SHUT_TIMEOUT :
504                                                   CONN_REG_SHUT_TIMEOUT))) {
505                         TRACE_CONN_CLOSE("Wait time after shutdown expired "
506                                 "(conn %p, sk_state %d)", conn,
507                                 conn->sock->sk->sk_state);
508                         conn->sock->sk->sk_prot->disconnect(conn->sock->sk, 0);
509                         shut_expired = 1;
510                 }
511
512                 if (conn->deleting)
513                         msleep(200);
514                 else
515                         msleep(1000);
516
517                 TRACE_CONN_CLOSE_DBG("conn %p, conn_ref_cnt %d left, "
518                         "wr_state %d, exp_cmd_sn %u",
519                         conn, atomic_read(&conn->conn_ref_cnt),
520                         conn->wr_state, session->exp_cmd_sn);
521
522                 trace_conn_close(conn);
523
524                 iscsi_check_closewait(conn);
525         }
526
527         write_lock_bh(&conn->sock->sk->sk_callback_lock);
528         conn->sock->sk->sk_state_change = conn->old_state_change;
529         conn->sock->sk->sk_data_ready = conn->old_data_ready;
530         conn->sock->sk->sk_write_space = conn->old_write_space;
531         write_unlock_bh(&conn->sock->sk->sk_callback_lock);
532
533         while (1) {
534                 bool t;
535
536                 spin_lock_bh(&iscsi_wr_lock);
537                 t = (conn->wr_state == ISCSI_CONN_WR_STATE_IDLE);
538                 spin_unlock_bh(&iscsi_wr_lock);
539
540                 if (t && (atomic_read(&conn->conn_ref_cnt) == 0))
541                         break;
542
543                 TRACE_CONN_CLOSE_DBG("Waiting for wr thread (conn %p), "
544                         "wr_state %x", conn, conn->wr_state);
545                 msleep(50);
546         }
547
548         wait_for_completion(&conn->ready_to_free);
549
550         TRACE_CONN_CLOSE("Notifying user space about closing connection %p",
551                          conn);
552         event_send(target->tid, session->sid, conn->cid, E_CONN_CLOSE);
553
554 #ifdef CONFIG_SCST_PROC
555         mutex_lock(&target->target_mutex);
556         conn_free(conn);
557         mutex_unlock(&target->target_mutex);
558 #else
559         kobject_put(&conn->iscsi_conn_kobj);
560 #endif
561
562         TRACE_EXIT();
563         return;
564 }
565
566 static int close_conn_thr(void *arg)
567 {
568         struct iscsi_conn *conn = (struct iscsi_conn *)arg;
569
570         TRACE_ENTRY();
571
572 #ifdef CONFIG_SCST_EXTRACHECKS
573         /*
574          * To satisfy iscsi_extracheck_is_rd_thread() in functions called
575          * during connection close. It is safe, because at this point conn
576          * can't be used by any other thread.
577          */
578         conn->rd_task = current;
579 #endif
580         close_conn(conn);
581
582         TRACE_EXIT();
583         return 0;
584 }
585
586 /* No locks */
587 static void start_close_conn(struct iscsi_conn *conn)
588 {
589         struct task_struct *t;
590
591         TRACE_ENTRY();
592
593         t = kthread_run(close_conn_thr, conn, "iscsi_conn_cleanup");
594         if (IS_ERR(t)) {
595                 PRINT_ERROR("kthread_run() failed (%ld), closing conn %p "
596                         "directly", PTR_ERR(t), conn);
597                 close_conn(conn);
598         }
599
600         TRACE_EXIT();
601         return;
602 }
603
604 static inline void iscsi_conn_init_read(struct iscsi_conn *conn,
605         void __user *data, size_t len)
606 {
607         conn->read_iov[0].iov_base = data;
608         conn->read_iov[0].iov_len = len;
609         conn->read_msg.msg_iov = conn->read_iov;
610         conn->read_msg.msg_iovlen = 1;
611         conn->read_size = len;
612         return;
613 }
614
615 static void iscsi_conn_prepare_read_ahs(struct iscsi_conn *conn,
616         struct iscsi_cmnd *cmnd)
617 {
618         int asize = (cmnd->pdu.ahssize + 3) & -4;
619
620         /* ToDo: __GFP_NOFAIL ?? */
621         cmnd->pdu.ahs = kmalloc(asize, __GFP_NOFAIL|GFP_KERNEL);
622         sBUG_ON(cmnd->pdu.ahs == NULL);
623         iscsi_conn_init_read(conn, (void __force __user *)cmnd->pdu.ahs, asize);
624         return;
625 }
626
627 static struct iscsi_cmnd *iscsi_get_send_cmnd(struct iscsi_conn *conn)
628 {
629         struct iscsi_cmnd *cmnd = NULL;
630
631         spin_lock_bh(&conn->write_list_lock);
632         if (!list_empty(&conn->write_list)) {
633                 cmnd = list_entry(conn->write_list.next, struct iscsi_cmnd,
634                                 write_list_entry);
635                 cmd_del_from_write_list(cmnd);
636                 cmnd->write_processing_started = 1;
637         }
638         spin_unlock_bh(&conn->write_list_lock);
639
640         return cmnd;
641 }
642
643 /* Returns number of bytes left to receive or <0 for error */
644 static int do_recv(struct iscsi_conn *conn)
645 {
646         int res;
647         mm_segment_t oldfs;
648         struct msghdr msg;
649         int first_len;
650
651         EXTRACHECKS_BUG_ON(conn->read_cmnd == NULL);
652
653         if (unlikely(conn->closing)) {
654                 res = -EIO;
655                 goto out;
656         }
657
658         /*
659          * We suppose that if sock_recvmsg() returned less data than requested,
660          * then next time it will return -EAGAIN, so there's no point in
661          * calling it again.
662          */
663
664 restart:
665         memset(&msg, 0, sizeof(msg));
666         msg.msg_iov = conn->read_msg.msg_iov;
667         msg.msg_iovlen = conn->read_msg.msg_iovlen;
668         first_len = msg.msg_iov->iov_len;
669
670         oldfs = get_fs();
671         set_fs(get_ds());
672         res = sock_recvmsg(conn->sock, &msg, conn->read_size,
673                            MSG_DONTWAIT | MSG_NOSIGNAL);
674         set_fs(oldfs);
675
676         if (res > 0) {
677                 /*
678                  * To save considerable effort and CPU power, we assume that
679                  * the TCP functions adjust conn->read_msg.msg_iov and
680                  * conn->read_msg.msg_iovlen by the amount of copied data.
681                  * This BUG_ON is intended to catch it if that behavior
682                  * changes in the future.
683                  */
684                 sBUG_ON((res >= first_len) &&
685                         (conn->read_msg.msg_iov->iov_len != 0));
686                 conn->read_size -= res;
687                 if (conn->read_size != 0) {
688                         if (res >= first_len) {
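                                /*
                                 * Advance our saved iovec past the fully
                                 * consumed entries: the first entry (of
                                 * first_len bytes) plus one entry per full
                                 * page received beyond it. This assumes the
                                 * remaining iov entries each cover a whole
                                 * page, as the data receive setup appears to
                                 * arrange.
                                 */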
689                                 int done = 1 + ((res - first_len) >> PAGE_SHIFT);
690                                 conn->read_msg.msg_iov += done;
691                                 conn->read_msg.msg_iovlen -= done;
692                         }
693                 }
694                 res = conn->read_size;
695         } else {
696                 switch (res) {
697                 case -EAGAIN:
698                         TRACE_DBG("EAGAIN received for conn %p", conn);
699                         res = conn->read_size;
700                         break;
701                 case -ERESTARTSYS:
702                         TRACE_DBG("ERESTARTSYS received for conn %p", conn);
703                         goto restart;
704                 default:
705                         PRINT_ERROR("sock_recvmsg() failed: %d", res);
706                         mark_conn_closed(conn);
707                         if (res == 0)
708                                 res = -EIO;
709                         break;
710                 }
711         }
712
713 out:
714         TRACE_EXIT_RES(res);
715         return res;
716 }
717
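/*
 * Receive and verify the data digest of the current PDU. Small payloads
 * (up to 16 KB) are checked inline while the data is presumably still
 * cache hot; larger SCSI command payloads are queued on the command's rx
 * ddigest list for deferred verification, and NOP-Out payloads are checked
 * inline as well.
 */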
718 static int iscsi_rx_check_ddigest(struct iscsi_conn *conn)
719 {
720         struct iscsi_cmnd *cmnd = conn->read_cmnd;
721         int res;
722
723         res = do_recv(conn);
724         if (res == 0) {
725                 conn->read_state = RX_END;
726
727                 if (cmnd->pdu.datasize <= 16*1024) {
728                         /*
729                          * The data is still cache hot, so let's compute the
730                          * digest inline. The trade-off is which adds more
731                          * latency: possible cache misses or the digest calculation.
732                          */
733                         TRACE_DBG("cmnd %p, opcode %x: checking RX "
734                                 "ddigest inline", cmnd, cmnd_opcode(cmnd));
735                         cmnd->ddigest_checked = 1;
736                         res = digest_rx_data(cmnd);
737                         if (unlikely(res != 0)) {
738                                 mark_conn_closed(conn);
739                                 goto out;
740                         }
741                 } else if (cmnd_opcode(cmnd) == ISCSI_OP_SCSI_CMD) {
742                         cmd_add_on_rx_ddigest_list(cmnd, cmnd);
743                         cmnd_get(cmnd);
744                 } else if (cmnd_opcode(cmnd) != ISCSI_OP_SCSI_DATA_OUT) {
745                         /*
746                          * We can get here only for NOP-Out. The iSCSI RFC
747                          * doesn't specify how to deal with digest errors in
748                          * this case. Is closing the connection correct?
749                          */
750                         TRACE_DBG("cmnd %p, opcode %x: checking NOP RX "
751                                 "ddigest", cmnd, cmnd_opcode(cmnd));
752                         res = digest_rx_data(cmnd);
753                         if (unlikely(res != 0)) {
754                                 mark_conn_closed(conn);
755                                 goto out;
756                         }
757                 }
758         }
759
760 out:
761         return res;
762 }
763
764 /* No locks, conn is rd processing */
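/*
 * Drives the receive state machine for one connection. The loop keeps
 * going while do_recv() reports that everything requested has arrived
 * (res == 0); it returns 0 once a complete PDU has been handed off
 * (RX_END), a positive byte count when the socket has no more data for
 * now, or a negative value on error. If the connection has been marked as
 * closing, close processing is started before returning.
 */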
765 static int process_read_io(struct iscsi_conn *conn, int *closed)
766 {
767         struct iscsi_cmnd *cmnd = conn->read_cmnd;
768         int res;
769
770         TRACE_ENTRY();
771
772         /* In case of error cmnd will be freed in close_conn() */
773
774         do {
775                 switch (conn->read_state) {
776                 case RX_INIT_BHS:
777                         EXTRACHECKS_BUG_ON(conn->read_cmnd != NULL);
778                         cmnd = cmnd_alloc(conn, NULL);
779                         conn->read_cmnd = cmnd;
780                         iscsi_conn_init_read(cmnd->conn,
781                                 (void __force __user *)&cmnd->pdu.bhs,
782                                 sizeof(cmnd->pdu.bhs));
783                         conn->read_state = RX_BHS;
784                         /* fall through */
785
786                 case RX_BHS:
787                         res = do_recv(conn);
788                         if (res == 0) {
789                                 iscsi_cmnd_get_length(&cmnd->pdu);
790                                 if (cmnd->pdu.ahssize == 0) {
791                                         if ((conn->hdigest_type & DIGEST_NONE) == 0)
792                                                 conn->read_state = RX_INIT_HDIGEST;
793                                         else
794                                                 conn->read_state = RX_CMD_START;
795                                 } else {
796                                         iscsi_conn_prepare_read_ahs(conn, cmnd);
797                                         conn->read_state = RX_AHS;
798                                 }
799                         }
800                         break;
801
802                 case RX_CMD_START:
803                         res = cmnd_rx_start(cmnd);
804                         if (res == 0) {
805                                 if (cmnd->pdu.datasize == 0)
806                                         conn->read_state = RX_END;
807                                 else
808                                         conn->read_state = RX_DATA;
809                         } else if (res > 0)
810                                 conn->read_state = RX_CMD_CONTINUE;
811                         else
812                                 sBUG_ON(!conn->closing);
813                         break;
814
815                 case RX_CMD_CONTINUE:
816                         if (cmnd->scst_state == ISCSI_CMD_STATE_RX_CMD) {
817                                 TRACE_DBG("cmnd %p is still in RX_CMD state",
818                                         cmnd);
819                                 res = 1;
820                                 break;
821                         }
822                         res = cmnd_rx_continue(cmnd);
823                         if (unlikely(res != 0))
824                                 sBUG_ON(!conn->closing);
825                         else {
826                                 if (cmnd->pdu.datasize == 0)
827                                         conn->read_state = RX_END;
828                                 else
829                                         conn->read_state = RX_DATA;
830                         }
831                         break;
832
833                 case RX_DATA:
834                         res = do_recv(conn);
835                         if (res == 0) {
836                                 int psz = ((cmnd->pdu.datasize + 3) & -4) - cmnd->pdu.datasize;
837                                 if (psz != 0) {
838                                         TRACE_DBG("padding %d bytes", psz);
839                                         iscsi_conn_init_read(conn,
840                                                 (void __force __user *)&conn->rpadding, psz);
841                                         conn->read_state = RX_PADDING;
842                                 } else if ((conn->ddigest_type & DIGEST_NONE) != 0)
843                                         conn->read_state = RX_END;
844                                 else
845                                         conn->read_state = RX_INIT_DDIGEST;
846                         }
847                         break;
848
849                 case RX_END:
850                         if (unlikely(conn->read_size != 0)) {
851                                 PRINT_CRIT_ERROR("conn read_size !=0 on RX_END "
852                                         "(conn %p, op %x, read_size %d)", conn,
853                                         cmnd_opcode(cmnd), conn->read_size);
854                                 sBUG();
855                         }
856                         conn->read_cmnd = NULL;
857                         conn->read_state = RX_INIT_BHS;
858
859                         cmnd_rx_end(cmnd);
860
861                         EXTRACHECKS_BUG_ON(conn->read_size != 0);
862
863                         /*
864                          * Exit here to maintain fairness between connections.
865                          * Res must be 0 here anyway; the assignment only silences
866                          * a compiler warning about an uninitialized variable.
867                          */
868                         res = 0;
869                         goto out;
870
871                 case RX_INIT_HDIGEST:
872                         iscsi_conn_init_read(conn,
873                                 (void __force __user *)&cmnd->hdigest, sizeof(u32));
874                         conn->read_state = RX_CHECK_HDIGEST;
875                         /* fall through */
876
877                 case RX_CHECK_HDIGEST:
878                         res = do_recv(conn);
879                         if (res == 0) {
880                                 res = digest_rx_header(cmnd);
881                                 if (unlikely(res != 0)) {
882                                         PRINT_ERROR("rx header digest for "
883                                                 "initiator %s failed (%d)",
884                                                 conn->session->initiator_name,
885                                                 res);
886                                         mark_conn_closed(conn);
887                                 } else
888                                         conn->read_state = RX_CMD_START;
889                         }
890                         break;
891
892                 case RX_INIT_DDIGEST:
893                         iscsi_conn_init_read(conn,
894                                 (void __force __user *)&cmnd->ddigest,
895                                 sizeof(u32));
896                         conn->read_state = RX_CHECK_DDIGEST;
897                         /* fall through */
898
899                 case RX_CHECK_DDIGEST:
900                         res = iscsi_rx_check_ddigest(conn);
901                         break;
902
903                 case RX_AHS:
904                         res = do_recv(conn);
905                         if (res == 0) {
906                                 if ((conn->hdigest_type & DIGEST_NONE) == 0)
907                                         conn->read_state = RX_INIT_HDIGEST;
908                                 else
909                                         conn->read_state = RX_CMD_START;
910                         }
911                         break;
912
913                 case RX_PADDING:
914                         res = do_recv(conn);
915                         if (res == 0) {
916                                 if ((conn->ddigest_type & DIGEST_NONE) == 0)
917                                         conn->read_state = RX_INIT_DDIGEST;
918                                 else
919                                         conn->read_state = RX_END;
920                         }
921                         break;
922
923                 default:
924                         PRINT_CRIT_ERROR("%d %x", conn->read_state, cmnd_opcode(cmnd));
925                         res = -1; /* to keep compiler happy */
926                         sBUG();
927                 }
928         } while (res == 0);
929
930         if (unlikely(conn->closing)) {
931                 start_close_conn(conn);
932                 *closed = 1;
933         }
934
935 out:
936         TRACE_EXIT_RES(res);
937         return res;
938 }
939
940 /*
941  * Called under iscsi_rd_lock and BHs disabled, but will drop it inside,
942  * then reacquire it.
943  */
944 static void scst_do_job_rd(void)
945         __acquires(&iscsi_rd_lock)
946         __releases(&iscsi_rd_lock)
947 {
948         TRACE_ENTRY();
949
950         /*
951          * We remove connections and re-add them at the tail to maintain fairness between them.
952          */
953
954         while (!list_empty(&iscsi_rd_list)) {
955                 int closed = 0, rc;
956                 struct iscsi_conn *conn = list_entry(iscsi_rd_list.next,
957                         typeof(*conn), rd_list_entry);
958
959                 list_del(&conn->rd_list_entry);
960
961                 sBUG_ON(conn->rd_state == ISCSI_CONN_RD_STATE_PROCESSING);
962                 conn->rd_data_ready = 0;
963                 conn->rd_state = ISCSI_CONN_RD_STATE_PROCESSING;
964 #ifdef CONFIG_SCST_EXTRACHECKS
965                 conn->rd_task = current;
966 #endif
967                 spin_unlock_bh(&iscsi_rd_lock);
968
969                 rc = process_read_io(conn, &closed);
970
971                 spin_lock_bh(&iscsi_rd_lock);
972
973                 if (closed)
974                         continue;
975
976 #ifdef CONFIG_SCST_EXTRACHECKS
977                 conn->rd_task = NULL;
978 #endif
979                 if ((rc == 0) || conn->rd_data_ready) {
980                         list_add_tail(&conn->rd_list_entry, &iscsi_rd_list);
981                         conn->rd_state = ISCSI_CONN_RD_STATE_IN_LIST;
982                 } else
983                         conn->rd_state = ISCSI_CONN_RD_STATE_IDLE;
984         }
985
986         TRACE_EXIT();
987         return;
988 }
989
990 static inline int test_rd_list(void)
991 {
992         int res = !list_empty(&iscsi_rd_list) ||
993                   unlikely(kthread_should_stop());
994         return res;
995 }
996
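/*
 * Main loop of a read (RX) thread. It sleeps on iscsi_rd_waitQ until a
 * connection is queued on iscsi_rd_list (typically from the socket
 * data-ready callback), then processes queued connections via
 * scst_do_job_rd() until told to stop at module unload.
 */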
997 int istrd(void *arg)
998 {
999         TRACE_ENTRY();
1000
1001         PRINT_INFO("Read thread started, PID %d", current->pid);
1002
1003         current->flags |= PF_NOFREEZE;
1004
1005         spin_lock_bh(&iscsi_rd_lock);
1006         while (!kthread_should_stop()) {
1007                 wait_queue_t wait;
1008                 init_waitqueue_entry(&wait, current);
1009
1010                 if (!test_rd_list()) {
1011                         add_wait_queue_exclusive_head(&iscsi_rd_waitQ, &wait);
1012                         for (;;) {
1013                                 set_current_state(TASK_INTERRUPTIBLE);
1014                                 if (test_rd_list())
1015                                         break;
1016                                 spin_unlock_bh(&iscsi_rd_lock);
1017                                 schedule();
1018                                 spin_lock_bh(&iscsi_rd_lock);
1019                         }
1020                         set_current_state(TASK_RUNNING);
1021                         remove_wait_queue(&iscsi_rd_waitQ, &wait);
1022                 }
1023                 scst_do_job_rd();
1024         }
1025         spin_unlock_bh(&iscsi_rd_lock);
1026
1027         /*
1028          * If kthread_should_stop() is true, we are guaranteed to be in
1029          * the module unload path, so iscsi_rd_list must be empty.
1030          */
1031         sBUG_ON(!list_empty(&iscsi_rd_list));
1032
1033         PRINT_INFO("Read thread PID %d finished", current->pid);
1034
1035         TRACE_EXIT();
1036         return 0;
1037 }
1038
1039 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
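/*
 * Zero-copy completion tracking: while a command's pages are handed to the
 * network stack via sendpage(), page->net_priv points back to the owning
 * command and cmd->net_ref_cnt counts the outstanding network references.
 * The get/put callbacks below keep the command (and hence its pages) alive
 * until the data has actually left the stack; they are expected to be
 * called by the zero-copy transfer completion notification support for
 * every page reference taken and dropped.
 */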
1040 static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd)
1041 {
1042         int v;
1043
1044         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d",
1045                 cmd, atomic_read(&cmd->net_ref_cnt)+1);
1046
1047         v = atomic_inc_return(&cmd->net_ref_cnt);
1048         if (v == 1) {
1049                 TRACE_NET_PAGE("getting cmd %p", cmd);
1050                 cmnd_get(cmd);
1051         }
1052         return;
1053 }
1054
1055 void iscsi_get_page_callback(struct page *page)
1056 {
1057         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1058
1059         TRACE_NET_PAGE("page %p, _count %d", page,
1060                 atomic_read(&page->_count));
1061
1062         __iscsi_get_page_callback(cmd);
1063         return;
1064 }
1065
1066 static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd)
1067 {
1068         TRACE_NET_PAGE("cmd %p, new net_ref_cnt %d", cmd,
1069                 atomic_read(&cmd->net_ref_cnt)-1);
1070
1071         if (atomic_dec_and_test(&cmd->net_ref_cnt)) {
1072                 int i, sg_cnt = cmd->sg_cnt;
1073                 for (i = 0; i < sg_cnt; i++) {
1074                         struct page *page = sg_page(&cmd->sg[i]);
1075                         TRACE_NET_PAGE("Clearing page %p", page);
1076                         if (page->net_priv == cmd)
1077                                 page->net_priv = NULL;
1078                 }
1079                 cmnd_put(cmd);
1080         }
1081         return;
1082 }
1083
1084 void iscsi_put_page_callback(struct page *page)
1085 {
1086         struct iscsi_cmnd *cmd = (struct iscsi_cmnd *)page->net_priv;
1087
1088         TRACE_NET_PAGE("page %p, _count %d", page,
1089                 atomic_read(&page->_count));
1090
1091         __iscsi_put_page_callback(cmd);
1092         return;
1093 }
1094
1095 static void check_net_priv(struct iscsi_cmnd *cmd, struct page *page)
1096 {
1097         if ((atomic_read(&cmd->net_ref_cnt) == 1) && (page->net_priv == cmd)) {
1098                 TRACE_DBG("sendpage() didn't call get_page(), zeroing net_priv "
1099                         "%p (page %p)", page->net_priv, page);
1100                 page->net_priv = NULL;
1101         }
1102         return;
1103 }
1104 #else
1105 static inline void check_net_priv(struct iscsi_cmnd *cmd, struct page *page) {}
1106 static inline void __iscsi_get_page_callback(struct iscsi_cmnd *cmd) {}
1107 static inline void __iscsi_put_page_callback(struct iscsi_cmnd *cmd) {}
1108 #endif
1109
1110 /* This is partially taken from the Ardis code. */
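/*
 * Send as much of conn->write_cmnd as the socket will take without
 * blocking: first any remaining header iovecs via vfs_writev(), then the
 * data pages via ->sendpage() (falling back to sock_no_sendpage() when a
 * page is already owned by another command or zero-copy is unsafe).
 * Progress is kept in conn->write_iop, conn->write_offset and
 * conn->write_size so the call can be resumed later. Returns the number of
 * bytes sent, or a negative error (-EAGAIN when nothing could be sent).
 */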
1111 static int write_data(struct iscsi_conn *conn)
1112 {
1113         mm_segment_t oldfs;
1114         struct file *file;
1115         struct iovec *iop;
1116         struct socket *sock;
1117         ssize_t (*sock_sendpage)(struct socket *, struct page *, int, size_t,
1118                                  int);
1119         ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
1120         struct iscsi_cmnd *write_cmnd = conn->write_cmnd;
1121         struct iscsi_cmnd *ref_cmd;
1122         struct page *page;
1123         struct scatterlist *sg;
1124         int saved_size, size, sendsize;
1125         int length, offset, idx;
1126         int flags, res, count, sg_size;
1127         bool do_put = false, ref_cmd_to_parent;
1128
1129         TRACE_ENTRY();
1130
1131         iscsi_extracheck_is_wr_thread(conn);
1132
1133         if (write_cmnd->own_sg == 0) {
1134                 ref_cmd = write_cmnd->parent_req;
1135                 ref_cmd_to_parent = true;
1136         } else {
1137                 ref_cmd = write_cmnd;
1138                 ref_cmd_to_parent = false;
1139         }
1140
1141         if (!ref_cmd->on_written_list) {
1142                 TRACE_DBG("Adding cmd %p to conn %p written_list", ref_cmd,
1143                         conn);
1144                 spin_lock_bh(&conn->write_list_lock);
1145                 ref_cmd->on_written_list = 1;
1146                 ref_cmd->write_timeout = jiffies + ISCSI_RSP_TIMEOUT;
1147                 list_add_tail(&ref_cmd->written_list_entry,
1148                         &conn->written_list);
1149                 spin_unlock_bh(&conn->write_list_lock);
1150         }
1151
1152         if (!timer_pending(&conn->rsp_timer)) {
1153                 sBUG_ON(!ref_cmd->on_written_list);
1154                 spin_lock_bh(&conn->write_list_lock);
1155                 if (likely(!timer_pending(&conn->rsp_timer))) {
1156                         TRACE_DBG("Starting timer on %ld (conn %p)",
1157                                 ref_cmd->write_timeout, conn);
1158                         conn->rsp_timer.expires = ref_cmd->write_timeout;
1159                         add_timer(&conn->rsp_timer);
1160                 }
1161                 spin_unlock_bh(&conn->write_list_lock);
1162         }
1163
1164         file = conn->file;
1165         size = conn->write_size;
1166         saved_size = size;
1167         iop = conn->write_iop;
1168         count = conn->write_iop_used;
1169
1170         if (iop) {
1171                 while (1) {
1172                         loff_t off = 0;
1173                         int rest;
1174
1175                         sBUG_ON(count > (signed)(sizeof(conn->write_iov) /
1176                                                 sizeof(conn->write_iov[0])));
1177 retry:
1178                         oldfs = get_fs();
1179                         set_fs(KERNEL_DS);
1180                         res = vfs_writev(file,
1181                                          (struct iovec __force __user *)iop,
1182                                          count, &off);
1183                         set_fs(oldfs);
1184                         TRACE_WRITE("sid %#Lx, cid %u, res %d, iov_len %ld",
1185                                     (long long unsigned int)conn->session->sid,
1186                                     conn->cid, res, (long)iop->iov_len);
1187                         if (unlikely(res <= 0)) {
1188                                 if (res == -EAGAIN) {
1189                                         conn->write_iop = iop;
1190                                         conn->write_iop_used = count;
1191                                         goto out_iov;
1192                                 } else if (res == -EINTR)
1193                                         goto retry;
1194                                 goto out_err;
1195                         }
1196
1197                         rest = res;
1198                         size -= res;
1199                         while ((typeof(rest))iop->iov_len <= rest && rest) {
1200                                 rest -= iop->iov_len;
1201                                 iop++;
1202                                 count--;
1203                         }
1204                         if (count == 0) {
1205                                 conn->write_iop = NULL;
1206                                 conn->write_iop_used = 0;
1207                                 if (size)
1208                                         break;
1209                                 goto out_iov;
1210                         }
1211                         sBUG_ON(iop > conn->write_iov + sizeof(conn->write_iov)
1212                                                   /sizeof(conn->write_iov[0]));
1213                         iop->iov_base += rest;
1214                         iop->iov_len -= rest;
1215                 }
1216         }
1217
1218         sg = write_cmnd->sg;
1219         if (unlikely(sg == NULL)) {
1220                 PRINT_INFO("WARNING: Data missed (cmd %p)!", write_cmnd);
1221                 res = 0;
1222                 goto out;
1223         }
1224
1225         /* To protect against a premature transfer completion race */
1226         __iscsi_get_page_callback(ref_cmd);
1227         do_put = true;
1228
1229         sock = conn->sock;
1230
1231 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1232         sock_sendpage = sock->ops->sendpage;
1233 #else
1234         if ((write_cmnd->parent_req->scst_cmd != NULL) &&
1235             scst_cmd_get_dh_data_buff_alloced(write_cmnd->parent_req->scst_cmd))
1236                 sock_sendpage = sock_no_sendpage;
1237         else
1238                 sock_sendpage = sock->ops->sendpage;
1239 #endif
1240
1241         flags = MSG_DONTWAIT;
1242         sg_size = size;
1243
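        /*
         * For a regular data SG vector the pages are assumed to be laid
         * out one full page per entry, so the starting entry index and
         * in-page offset can be derived arithmetically from write_offset.
         * The rsp_sg case below walks entries of arbitrary length instead
         * and always uses sock_no_sendpage().
         */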
1244         if (sg != write_cmnd->rsp_sg) {
1245                 offset = conn->write_offset + sg[0].offset;
1246                 idx = offset >> PAGE_SHIFT;
1247                 offset &= ~PAGE_MASK;
1248                 length = min(size, (int)PAGE_SIZE - offset);
1249                 TRACE_WRITE("write_offset %d, sg_size %d, idx %d, offset %d, "
1250                         "length %d", conn->write_offset, sg_size, idx, offset,
1251                         length);
1252         } else {
1253                 idx = 0;
1254                 offset = conn->write_offset;
1255                 while (offset >= sg[idx].length) {
1256                         offset -= sg[idx].length;
1257                         idx++;
1258                 }
1259                 length = sg[idx].length - offset;
1260                 offset += sg[idx].offset;
1261                 sock_sendpage = sock_no_sendpage;
1262                 TRACE_WRITE("rsp_sg: write_offset %d, sg_size %d, idx %d, "
1263                         "offset %d, length %d", conn->write_offset, sg_size,
1264                         idx, offset, length);
1265         }
1266         page = sg_page(&sg[idx]);
1267
1268         while (1) {
1269                 sendpage = sock_sendpage;
1270
1271 #if defined(CONFIG_TCP_ZERO_COPY_TRANSFER_COMPLETION_NOTIFICATION)
1272                 {
1273                         static DEFINE_SPINLOCK(net_priv_lock);
1274                         spin_lock(&net_priv_lock);
1275                         if (unlikely(page->net_priv != NULL)) {
1276                                 if (page->net_priv != ref_cmd) {
1277                                         /*
1278                                          * This might happen if user space
1279                                          * supplies the same pages to
1280                                          * scst_user in different commands,
1281                                          * or with zero-copy FILEIO, when
1282                                          * several initiators request the
1283                                          * same data simultaneously.
1284                                          */
1285                                         TRACE_DBG("net_priv isn't NULL and != "
1286                                             "ref_cmd (write_cmnd %p, ref_cmd "
1287                                             "%p, sg %p, idx %d, page %p, "
1288                                             "net_priv %p)",
1289                                             write_cmnd, ref_cmd, sg, idx,
1290                                             page, page->net_priv);
1291                                         sendpage = sock_no_sendpage;
1292                                 }
1293                         } else
1294                                 page->net_priv = ref_cmd;
1295                         spin_unlock(&net_priv_lock);
1296                 }
1297 #endif
1298                 sendsize = min(size, length);
1299                 if (size <= sendsize) {
1300 retry2:
1301                         res = sendpage(sock, page, offset, size, flags);
1302                         TRACE_WRITE("Final %s sid %#Lx, cid %u, res %d (page "
1303                                 "index %lu, offset %u, size %u, cmd %p, "
1304                                 "page %p)", (sendpage != sock_no_sendpage) ?
1305                                                 "sendpage" : "sock_no_sendpage",
1306                                 (long long unsigned int)conn->session->sid,
1307                                 conn->cid, res, page->index,
1308                                 offset, size, write_cmnd, page);
1309                         if (unlikely(res <= 0)) {
1310                                 if (res == -EINTR)
1311                                         goto retry2;
1312                                 else
1313                                         goto out_res;
1314                         }
1315
1316                         check_net_priv(ref_cmd, page);
1317                         if (res == size) {
1318                                 conn->write_size = 0;
1319                                 res = saved_size;
1320                                 goto out_put;
1321                         }
1322
1323                         offset += res;
1324                         size -= res;
1325                         goto retry2;
1326                 }
1327
1328 retry1:
1329                 res = sendpage(sock, page, offset, sendsize, flags | MSG_MORE);
1330                 TRACE_WRITE("%s sid %#Lx, cid %u, res %d (page index %lu, "
1331                         "offset %u, sendsize %u, size %u, cmd %p, page %p)",
1332                         (sendpage != sock_no_sendpage) ? "sendpage" :
1333                                                          "sock_no_sendpage",
1334                         (unsigned long long)conn->session->sid, conn->cid,
1335                         res, page->index, offset, sendsize, size,
1336                         write_cmnd, page);
1337                 if (unlikely(res <= 0)) {
1338                         if (res == -EINTR)
1339                                 goto retry1;
1340                         else
1341                                 goto out_res;
1342                 }
1343
1344                 check_net_priv(ref_cmd, page);
1345
1346                 size -= res;
1347
1348                 if (res == sendsize) {
1349                         idx++;
1350                         EXTRACHECKS_BUG_ON(idx >= ref_cmd->sg_cnt);
1351                         page = sg_page(&sg[idx]);
1352                         length = sg[idx].length;
1353                         offset = sg[idx].offset;
1354                 } else {
1355                         offset += res;
1356                         sendsize -= res;
1357                         goto retry1;
1358                 }
1359         }
1360
1361 out_off:
1362         conn->write_offset += sg_size - size;
1363
1364 out_iov:
1365         conn->write_size = size;
1366         if ((saved_size == size) && res == -EAGAIN)
1367                 goto out_put;
1368
1369         res = saved_size - size;
1370
1371 out_put:
1372         if (do_put)
1373                 __iscsi_put_page_callback(ref_cmd);
1374
1375 out:
1376         TRACE_EXIT_RES(res);
1377         return res;
1378
1379 out_res:
1380         check_net_priv(ref_cmd, page);
1381         if (res == -EAGAIN)
1382                 goto out_off;
1383         /* else fall through */
1384
1385 out_err:
1386 #ifndef CONFIG_SCST_DEBUG
1387         if (!conn->closing)
1388 #endif
1389         {
1390                 PRINT_ERROR("error %d at sid:cid %#Lx:%u, cmnd %p", res,
1391                             (long long unsigned int)conn->session->sid,
1392                             conn->cid, conn->write_cmnd);
1393         }
1394         if (ref_cmd_to_parent &&
1395             ((ref_cmd->scst_cmd != NULL) || (ref_cmd->scst_aen != NULL))) {
1396                 if (ref_cmd->scst_state == ISCSI_CMD_STATE_AEN)
1397                         scst_set_aen_delivery_status(ref_cmd->scst_aen,
1398                                 SCST_AEN_RES_FAILED);
1399                 else
1400                         scst_set_delivery_status(ref_cmd->scst_cmd,
1401                                 SCST_CMD_DELIVERY_FAILED);
1402         }
1403         goto out_put;
1404 }
1405
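     /*
      * Common error handling for the TX path: -EAGAIN and -ERESTARTSYS are
      * treated as transient and converted to 0, so the caller simply retries
      * later; any other error is reported (unless the connection is already
      * closing) and the connection is marked closed.
      */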
1406 static int exit_tx(struct iscsi_conn *conn, int res)
1407 {
1408         iscsi_extracheck_is_wr_thread(conn);
1409
1410         switch (res) {
1411         case -EAGAIN:
1412         case -ERESTARTSYS:
1413                 res = 0;
1414                 break;
1415         default:
1416 #ifndef CONFIG_SCST_DEBUG
1417                 if (!conn->closing)
1418 #endif
1419                 {
1420                         PRINT_ERROR("Sending data failed: initiator %s, "
1421                                 "write_size %d, write_state %d, res %d",
1422                                 conn->session->initiator_name,
1423                                 conn->write_size,
1424                                 conn->write_state, res);
1425                 }
1426                 conn->write_state = TX_END;
1427                 conn->write_size = 0;
1428                 mark_conn_closed(conn);
1429                 break;
1430         }
1431         return res;
1432 }
1433
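     /*
      * Sends the remaining bytes of cmnd's data digest with a non-blocking
      * sendmsg(); once the digest has been sent completely, the connection's
      * write_state advances to "state".
      */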
1434 static int tx_ddigest(struct iscsi_cmnd *cmnd, int state)
1435 {
1436         int res, rest = cmnd->conn->write_size;
1437         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1438         struct kvec iov;
1439
1440         iscsi_extracheck_is_wr_thread(cmnd->conn);
1441
1442         TRACE_DBG("Sending data digest %x (cmd %p)", cmnd->ddigest, cmnd);
1443
1444         iov.iov_base = (char *)(&cmnd->ddigest) + (sizeof(u32) - rest);
1445         iov.iov_len = rest;
1446
1447         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1448         if (res > 0) {
1449                 cmnd->conn->write_size -= res;
1450                 if (!cmnd->conn->write_size)
1451                         cmnd->conn->write_state = state;
1452         } else
1453                 res = exit_tx(cmnd->conn, res);
1454
1455         return res;
1456 }
1457
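     /*
      * Computes the header digest of cmnd and queues it as an additional
      * entry in conn->write_iov, so it is transmitted together with the PDU
      * header.
      */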
1458 static void init_tx_hdigest(struct iscsi_cmnd *cmnd)
1459 {
1460         struct iscsi_conn *conn = cmnd->conn;
1461         struct iovec *iop;
1462
1463         iscsi_extracheck_is_wr_thread(conn);
1464
1465         digest_tx_header(cmnd);
1466
1467         sBUG_ON(conn->write_iop_used >=
1468                 (signed)(sizeof(conn->write_iov)/sizeof(conn->write_iov[0])));
1469
1470         iop = &conn->write_iop[conn->write_iop_used];
1471         conn->write_iop_used++;
1472         iop->iov_base = (void __force __user *)&(cmnd->hdigest);
1473         iop->iov_len = sizeof(u32);
1474         conn->write_size += sizeof(u32);
1475
1476         return;
1477 }
1478
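     /*
      * Sends the remaining PDU padding (zero bytes) with a non-blocking
      * sendmsg(); once the padding has been sent completely, the connection's
      * write_state advances to "state".
      */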
1479 static int tx_padding(struct iscsi_cmnd *cmnd, int state)
1480 {
1481         int res, rest = cmnd->conn->write_size;
1482         struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
1483         struct kvec iov;
1484         static const uint32_t padding;
1485
1486         iscsi_extracheck_is_wr_thread(cmnd->conn);
1487
1488         TRACE_DBG("Sending %d padding bytes (cmd %p)", rest, cmnd);
1489
1490         iov.iov_base = (char *)(&padding) + (sizeof(uint32_t) - rest);
1491         iov.iov_len = rest;
1492
1493         res = kernel_sendmsg(cmnd->conn->sock, &msg, &iov, 1, rest);
1494         if (res > 0) {
1495                 cmnd->conn->write_size -= res;
1496                 if (!cmnd->conn->write_size)
1497                         cmnd->conn->write_state = state;
1498         } else
1499                 res = exit_tx(cmnd->conn, res);
1500
1501         return res;
1502 }
1503
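     /*
      * Sends the pending data of the current write command via write_data();
      * once conn->write_size drops to zero, the connection's write_state
      * advances to "state".
      */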
1504 static int iscsi_do_send(struct iscsi_conn *conn, int state)
1505 {
1506         int res;
1507
1508         iscsi_extracheck_is_wr_thread(conn);
1509
1510         res = write_data(conn);
1511         if (res > 0) {
1512                 if (!conn->write_size)
1513                         conn->write_state = state;
1514         } else
1515                 res = exit_tx(conn, res);
1516
1517         return res;
1518 }
1519
1520 /*
1521  * No locks; conn is being processed by the write thread.
1522  *
1523  * IMPORTANT! Connection conn must be protected by an additional conn_get()
1524  * before entering this function, because otherwise it could be destroyed
1525  * inside it as a result of cmnd release.
1526  */
1527 int iscsi_send(struct iscsi_conn *conn)
1528 {
1529         struct iscsi_cmnd *cmnd = conn->write_cmnd;
1530         int ddigest, res = 0;
1531
1532         TRACE_ENTRY();
1533
1534         TRACE_DBG("conn %p, write_cmnd %p", conn, cmnd);
1535
1536         iscsi_extracheck_is_wr_thread(conn);
1537
1538         ddigest = conn->ddigest_type != DIGEST_NONE ? 1 : 0;
1539
1540         switch (conn->write_state) {
1541         case TX_INIT:
1542                 sBUG_ON(cmnd != NULL);
1543                 cmnd = conn->write_cmnd = iscsi_get_send_cmnd(conn);
1544                 if (!cmnd)
1545                         goto out;
1546                 cmnd_tx_start(cmnd);
1547                 if (!(conn->hdigest_type & DIGEST_NONE))
1548                         init_tx_hdigest(cmnd);
1549                 conn->write_state = TX_BHS_DATA;
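                     /* fall through */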
1550         case TX_BHS_DATA:
1551                 res = iscsi_do_send(conn, cmnd->pdu.datasize ?
1552                                         TX_INIT_PADDING : TX_END);
1553                 if (res <= 0 || conn->write_state != TX_INIT_PADDING)
1554                         break;
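                     /* else fall through */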
1555         case TX_INIT_PADDING:
1556                 cmnd->conn->write_size = ((cmnd->pdu.datasize + 3) & -4) -
1557                                                 cmnd->pdu.datasize;
1558                 if (cmnd->conn->write_size != 0)
1559                         conn->write_state = TX_PADDING;
1560                 else if (ddigest)
1561                         conn->write_state = TX_INIT_DDIGEST;
1562                 else
1563                         conn->write_state = TX_END;
1564                 break;
1565         case TX_PADDING:
1566                 res = tx_padding(cmnd, ddigest ? TX_INIT_DDIGEST : TX_END);
1567                 if (res <= 0 || conn->write_state != TX_INIT_DDIGEST)
1568                         break;
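                     /* else fall through */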
1569         case TX_INIT_DDIGEST:
1570                 cmnd->conn->write_size = sizeof(u32);
1571                 conn->write_state = TX_DDIGEST;
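                     /* fall through */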
1572         case TX_DDIGEST:
1573                 res = tx_ddigest(cmnd, TX_END);
1574                 break;
1575         default:
1576                 PRINT_CRIT_ERROR("%d %d %x", res, conn->write_state,
1577                         cmnd_opcode(cmnd));
1578                 sBUG();
1579         }
1580
1581         if (res == 0)
1582                 goto out;
1583
1584         if (conn->write_state != TX_END)
1585                 goto out;
1586
1587         if (unlikely(conn->write_size)) {
1588                 PRINT_CRIT_ERROR("%d %x %u", res, cmnd_opcode(cmnd),
1589                         conn->write_size);
1590                 sBUG();
1591         }
1592         cmnd_tx_end(cmnd);
1593
1594         rsp_cmnd_release(cmnd);
1595
1596         conn->write_cmnd = NULL;
1597         conn->write_state = TX_INIT;
1598
1599 out:
1600         TRACE_EXIT_RES(res);
1601         return res;
1602 }
1603
1604 /* No locks; conn is being processed by the write thread.
1605  *
1606  * IMPORTANT! Connection conn must be protected by an additional conn_get()
1607  * before entering this function, because otherwise it could be destroyed
1608  * inside it as a result of iscsi_send(), which releases sent commands.
1609  */
1610 static int process_write_queue(struct iscsi_conn *conn)
1611 {
1612         int res = 0;
1613
1614         TRACE_ENTRY();
1615
1616         if (likely(test_write_ready(conn)))
1617                 res = iscsi_send(conn);
1618
1619         TRACE_EXIT_RES(res);
1620         return res;
1621 }
1622
1623 /*
1624  * Called under iscsi_wr_lock with BHs disabled, but drops the lock inside,
1625  * then reacquires it.
1626  */
1627 static void scst_do_job_wr(void)
1628         __acquires(&iscsi_wr_lock)
1629         __releases(&iscsi_wr_lock)
1630 {
1631         TRACE_ENTRY();
1632
1633         /*
1634          * We remove connections and re-add them to the tail for fairness.
1635          */
1636
1637         while (!list_empty(&iscsi_wr_list)) {
1638                 int rc;
1639                 struct iscsi_conn *conn = list_entry(iscsi_wr_list.next,
1640                         typeof(*conn), wr_list_entry);
1641
1642                 TRACE_DBG("conn %p, wr_state %x, wr_space_ready %d, "
1643                         "write ready %d", conn, conn->wr_state,
1644                         conn->wr_space_ready, test_write_ready(conn));
1645
1646                 list_del(&conn->wr_list_entry);
1647
1648                 sBUG_ON(conn->wr_state == ISCSI_CONN_WR_STATE_PROCESSING);
1649
1650                 conn->wr_state = ISCSI_CONN_WR_STATE_PROCESSING;
1651                 conn->wr_space_ready = 0;
1652 #ifdef CONFIG_SCST_EXTRACHECKS
1653                 conn->wr_task = current;
1654 #endif
1655                 spin_unlock_bh(&iscsi_wr_lock);
1656
1657                 conn_get(conn);
1658
1659                 rc = process_write_queue(conn);
1660
1661                 spin_lock_bh(&iscsi_wr_lock);
1662 #ifdef CONFIG_SCST_EXTRACHECKS
1663                 conn->wr_task = NULL;
1664 #endif
1665                 if ((rc == -EAGAIN) && !conn->wr_space_ready) {
1666                         conn->wr_state = ISCSI_CONN_WR_STATE_SPACE_WAIT;
1667                         goto cont;
1668                 }
1669
1670                 if (test_write_ready(conn)) {
1671                         list_add_tail(&conn->wr_list_entry, &iscsi_wr_list);
1672                         conn->wr_state = ISCSI_CONN_WR_STATE_IN_LIST;
1673                 } else
1674                         conn->wr_state = ISCSI_CONN_WR_STATE_IDLE;
1675
1676 cont:
1677                 conn_put(conn);
1678         }
1679
1680         TRACE_EXIT();
1681         return;
1682 }
1683
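     /*
      * Wake-up condition for the write threads: work is queued on
      * iscsi_wr_list or the thread is being stopped.
      */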
1684 static inline int test_wr_list(void)
1685 {
1686         int res = !list_empty(&iscsi_wr_list) ||
1687                   unlikely(kthread_should_stop());
1688         return res;
1689 }
1690
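     /*
      * Main loop of an iSCSI write (TX) kernel thread: sleeps on
      * iscsi_wr_waitQ until test_wr_list() becomes true, then processes the
      * queued connections via scst_do_job_wr(), until asked to stop.
      */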
1691 int istwr(void *arg)
1692 {
1693         TRACE_ENTRY();
1694
1695         PRINT_INFO("Write thread started, PID %d", current->pid);
1696
1697         current->flags |= PF_NOFREEZE;
1698
1699         spin_lock_bh(&iscsi_wr_lock);
1700         while (!kthread_should_stop()) {
1701                 wait_queue_t wait;
1702                 init_waitqueue_entry(&wait, current);
1703
1704                 if (!test_wr_list()) {
1705                         add_wait_queue_exclusive_head(&iscsi_wr_waitQ, &wait);
1706                         for (;;) {
1707                                 set_current_state(TASK_INTERRUPTIBLE);
1708                                 if (test_wr_list())
1709                                         break;
1710                                 spin_unlock_bh(&iscsi_wr_lock);
1711                                 schedule();
1712                                 spin_lock_bh(&iscsi_wr_lock);
1713                         }
1714                         set_current_state(TASK_RUNNING);
1715                         remove_wait_queue(&iscsi_wr_waitQ, &wait);
1716                 }
1717                 scst_do_job_wr();
1718         }
1719         spin_unlock_bh(&iscsi_wr_lock);
1720
1721         /*
1722          * If kthread_should_stop() is true, we are guaranteed to be in
1723          * module unload, so iscsi_wr_list must be empty.
1724          */
1725         sBUG_ON(!list_empty(&iscsi_wr_list));
1726
1727         PRINT_INFO("Write thread PID %d finished", current->pid);
1728
1729         TRACE_EXIT();
1730         return 0;
1731 }