- Memory barriers cleanup; comments for them improved
authorvlnb <vlnb@d57e44dd-8a1f-0410-8b47-8ef2f437770f>
Wed, 10 Dec 2008 10:55:01 +0000 (10:55 +0000)
committervlnb <vlnb@d57e44dd-8a1f-0410-8b47-8ef2f437770f>
Wed, 10 Dec 2008 10:55:01 +0000 (10:55 +0000)
 - Small docs update
 - srpt/README_in-tree added

git-svn-id: https://scst.svn.sourceforge.net/svnroot/scst/trunk@614 d57e44dd-8a1f-0410-8b47-8ef2f437770f

12 files changed:
iscsi-scst/kernel/iscsi.c
iscsi-scst/kernel/iscsi.h
scripts/generate-kernel-patch
scst/README
scst/README_in-tree
scst/src/dev_handlers/scst_user.c
scst/src/dev_handlers/scst_vdisk.c
scst/src/scst_lib.c
scst/src/scst_main.c
scst/src/scst_priv.h
scst/src/scst_targ.c
srpt/README_in-tree [new file with mode: 0644]

index 6f6833a..64a38d9 100644 (file)
@@ -2512,7 +2512,9 @@ static inline void iscsi_set_state_wake_up(struct iscsi_cmnd *req,
         * We use wait_event() to wait for the state change, but it checks its
         * condition without any protection, so without cmnd_get() it is
         * possible that req will die "immediately" after the state assignment
-        * and wake_up() will operate on dead data.
+        * and wake_up() will operate on dead data. We use the ordered version
+        * of cmnd_get(), because "get" must be done before the state
+        * assignment.
         */
        cmnd_get_ordered(req);
        req->scst_state = new_state;
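
For reference, the pattern enforced here is "take the reference before
publishing the state": wait_event() on the other side polls scst_state without
any protection, so the get must be ordered before the store, or the waiter
could drop the last reference and free req while the waker still touches it.
A minimal sketch of the pattern (simplified stand-in types, not the actual
SCST structures):

    #include <linux/atomic.h>
    #include <linux/wait.h>
    #include <linux/slab.h>

    struct obj {
            atomic_t refcnt;
            int state;
            wait_queue_head_t waitq;
    };

    static void obj_set_state_wake_up(struct obj *o, int new_state)
    {
            atomic_inc(&o->refcnt);         /* our "get" keeps o alive ... */
            smp_mb__after_atomic_inc();     /* ... and is ordered before ... */
            o->state = new_state;           /* ... the store the waiter polls */
            wake_up(&o->waitq);
            if (atomic_dec_and_test(&o->refcnt))
                    kfree(o);               /* "put": last user frees o */
    }

    static int obj_wait(struct obj *o)
    {
            /* The waiter polls state without locks; the reference taken in
             * obj_set_state_wake_up() keeps o alive across the wake_up(). */
            return wait_event_interruptible(o->waitq, o->state != 0);
    }
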
@@ -2710,6 +2712,12 @@ static int iscsi_xmit_response(struct scst_cmd *scst_cmd)
                sBUG();
 #endif
 
+       /*
+        * "_ordered" here to protect from reorder, which can lead to
+        * preliminary connection destroy in req_cmnd_release(). Just in
+        * case, actually, because reordering shouldn't go so far, but who
+        * knows..
+        */
        conn_get_ordered(conn);
        req_cmnd_release(req);
        iscsi_try_local_processing(conn);
index 52693c3..750db9c 100644 (file)
@@ -475,6 +475,7 @@ static inline void cmnd_get(struct iscsi_cmnd *cmnd)
 static inline void cmnd_get_ordered(struct iscsi_cmnd *cmnd)
 {
        cmnd_get(cmnd);
+       /* See comments for each cmnd_get_ordered() use */
        smp_mb__after_atomic_inc();
 }
 
@@ -546,6 +547,7 @@ static inline void conn_get(struct iscsi_conn *conn)
 static inline void conn_get_ordered(struct iscsi_conn *conn)
 {
        conn_get(conn);
+       /* See comments for each conn_get_ordered() use */
        smp_mb__after_atomic_inc();
 }
 
@@ -556,8 +558,9 @@ static inline void conn_put(struct iscsi_conn *conn)
        sBUG_ON(atomic_read(&conn->conn_ref_cnt) == 0);
 
        /*
-        * It always ordered to protect from undesired side effects like
-        * accessing just destroyed object because of this *_dec() reordering.
+        * Make it always ordered to protect from undesired side effects,
+        * like accessing a conn just destroyed by close_conn(), caused by
+        * reordering of this atomic_dec().
         */
        smp_mb__before_atomic_dec();
        atomic_dec(&conn->conn_ref_cnt);
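
Together the ordered get and put form a pair: the ordered get makes the
reference visible before any later action that could trigger a release, while
the ordered put makes all accesses to the conn complete before the decrement
that may let close_conn() free it. Condensed from the code above (schematic,
error paths omitted):

    /* get side, iscsi_xmit_response(): */
    conn_get_ordered(conn);             /* conn_get() + smp_mb__after_atomic_inc() */
    req_cmnd_release(req);              /* must not be reordered before the get */

    /* put side, conn_put(): */
    smp_mb__before_atomic_dec();        /* finish touching conn ... */
    atomic_dec(&conn->conn_ref_cnt);    /* ... before the count can reach 0 */
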
index 5837c69..4c07cdd 100755 (executable)
@@ -359,7 +359,7 @@ else
 fi \
 | process_patch "srpt.diff"
 
-add_file "srpt/README" "Documentation/scst/README.srpt" \
+add_file "srpt/README_in-tree" "Documentation/scst/README.srpt" \
 | process_patch "srpt-doc.diff"
 
 
index 0c9e6a9..f9a02e5 100644 (file)
@@ -274,6 +274,8 @@ For changing VMSPLIT option (CONFIG_VMSPLIT to be precise) you should in
 
  - General setup->Configure standard kernel features (for small systems): ON
 
+ - General setup->Prompt for development and/or incomplete code/drivers: ON
+
  - Processor type and features->High Memory Support: OFF
 
  - Processor type and features->Memory split: according to amount of
index fb09671..5fb3adf 100644 (file)
@@ -222,6 +222,8 @@ For changing VMSPLIT option (CONFIG_VMSPLIT to be precise) you should in
 
  - General setup->Configure standard kernel features (for small systems): ON
 
+ - General setup->Prompt for development and/or incomplete code/drivers: ON
+
  - Processor type and features->High Memory Support: OFF
 
  - Processor type and features->Memory split: according to amount of
index 04bbd00..df9b6bd 100644 (file)
@@ -1270,6 +1270,11 @@ static int dev_user_process_reply_exec(struct scst_user_cmd *ucmd,
                if (unlikely((cmd->data_direction == SCST_DATA_READ) ||
                             (cmd->resp_data_len != 0)))
                        goto out_inval;
+               /*
+                * The background_exec assignment must come after the ucmd
+                * get. Otherwise, due to reordering, dev_user_process_reply()
+                * could destroy ucmd before it was "got" here.
+                */
                ucmd_get_ordered(ucmd);
                ucmd->background_exec = 1;
                TRACE_DBG("Background ucmd %p", ucmd);
@@ -1383,6 +1388,8 @@ static int dev_user_process_reply(struct scst_user_dev *dev,
                goto out_unlock;
        }
 
+       /* To synchronize with dev_user_process_reply_exec(). See comment there. */
+       smp_mb();
        if (ucmd->background_exec) {
                state = UCMD_STATE_EXECING;
                goto unlock_process;
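
The two barriers pair up across these functions: the writer makes the ucmd
reference visible before setting background_exec, and the reader separates
its earlier accesses from the background_exec read. Condensed (schematic):

    /* dev_user_process_reply_exec(), writer side: */
    ucmd_get_ordered(ucmd);     /* get + smp_mb__after_atomic_inc() */
    ucmd->background_exec = 1;  /* publish the flag only after the get */

    /* dev_user_process_reply(), reader side: */
    smp_mb();                   /* pairs with the barrier above */
    if (ucmd->background_exec) {
            /* safe: the reference above is already visible here */
    }
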
index 47231a6..8445a30 100644 (file)
@@ -2450,7 +2450,6 @@ static void blockio_exec_rw(struct scst_cmd *cmd, struct scst_vdisk_thr *thr,
 
        /* +1 to prevent erroneous too early command completion */
        atomic_set(&blockio_work->bios_inflight, bios+1);
-       smp_wmb();
 
        while (hbio) {
                bio = hbio;
index 37a3a53..9cf80fa 100644 (file)
@@ -1528,9 +1528,10 @@ void scst_check_retries(struct scst_tgt *tgt)
 
        /*
         * We don't worry about overflow of finished_cmds, because we check
-        * only for its change
+        * only for its change.
         */
        atomic_inc(&tgt->finished_cmds);
+       /* See comment in scst_queue_retry_cmd() */
        smp_mb__after_atomic_inc();
        if (unlikely(tgt->retry_cmds > 0)) {
                struct scst_cmd *c, *tc;
@@ -1541,8 +1542,7 @@ void scst_check_retries(struct scst_tgt *tgt)
 
                spin_lock_irqsave(&tgt->tgt_lock, flags);
                list_for_each_entry_safe(c, tc, &tgt->retry_cmd_list,
-                               cmd_list_entry)
-               {
+                               cmd_list_entry) {
                        tgt->retry_cmds--;
 
                        TRACE_RETRY("Moving retry cmd %p to head of active "
@@ -3125,7 +3125,13 @@ static void scst_block_dev(struct scst_device *dev, int outstanding)
        __scst_block_dev(dev);
        spin_unlock_bh(&dev->dev_lock);
 
-       /* spin_unlock_bh() doesn't provide the necessary memory barrier */
+       /*
+        * A memory barrier is necessary here, because we need to read
+        * on_dev_count in wait_event() below after we have increased
+        * block_count. Otherwise, we can miss the wake up in
+        * scst_dec_on_dev_cmd(). We use an explicit barrier, because
+        * spin_unlock_bh() doesn't provide the necessary memory barrier
+        * functionality.
+        */
        smp_mb();
 
        TRACE_MGMT_DBG("Waiting during blocking outstanding %d (on_dev_count "
@@ -3211,7 +3217,6 @@ repeat:
                spin_lock_bh(&dev->dev_lock);
                if (unlikely(test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags)))
                        goto out_unlock;
-               barrier(); /* to reread block_count */
                if (dev->block_count > 0) {
                        scst_dec_on_dev_cmd(cmd);
                        TRACE_MGMT_DBG("Delaying cmd %p due to blocking or "
@@ -3230,7 +3235,6 @@ repeat:
        }
        if (unlikely(dev->dev_serialized)) {
                spin_lock_bh(&dev->dev_lock);
-               barrier(); /* to reread block_count */
                if (dev->block_count == 0) {
                        TRACE_MGMT_DBG("cmd %p (tag %llu), blocking further "
                                "cmds due to serializing (dev %p)", cmd,
@@ -3405,7 +3409,6 @@ void scst_xmit_process_aborted_cmd(struct scst_cmd *cmd)
 
        scst_done_cmd_mgmt(cmd);
 
-       smp_rmb();
        if (test_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags)) {
                if (cmd->completed) {
                        /* It's completed and it's OK to return its result */
@@ -3548,6 +3551,7 @@ static void tm_dbg_timer_fn(unsigned long arg)
 {
        TRACE_MGMT_DBG("%s", "delayed cmd timer expired");
        tm_dbg_flags.tm_dbg_release = 1;
+       /* Used to make sure that all woken up threads see the new value */
        smp_wmb();
        wake_up_all(tm_dbg_p_cmd_list_waitQ);
 }
@@ -3763,6 +3767,10 @@ void tm_dbg_task_mgmt(struct scst_device *dev, const char *fn, int force)
                        tm_dbg_delayed_cmds_count);
                tm_dbg_change_state();
                tm_dbg_flags.tm_dbg_release = 1;
+               /*
+                * Used to make sure that all woken up threads see the new
+                * value.
+                */
                smp_wmb();
                if (tm_dbg_p_cmd_list_waitQ != NULL)
                        wake_up_all(tm_dbg_p_cmd_list_waitQ);
index b671c0b..173fac1 100644 (file)
@@ -501,13 +501,18 @@ int scst_suspend_activity(bool interruptible)
 
        set_bit(SCST_FLAG_SUSPENDING, &scst_flags);
        set_bit(SCST_FLAG_SUSPENDED, &scst_flags);
+       /*
+        * Assignment of SCST_FLAG_SUSPENDING and SCST_FLAG_SUSPENDED must be
+        * ordered with scst_cmd_count. Otherwise, the lockless logic in
+        * scst_translate_lun() and scst_mgmt_translate_lun() won't work.
+        */
        smp_mb__after_set_bit();
 
        /*
         * See comment in scst_user.c::dev_user_task_mgmt_fn() for more
         * information about scst_user behavior.
         *
-        * ToDo: make the global suspending unneeded (Switch to per-device
+        * ToDo: make the global suspending unneeded (switch to per-device
         * reference counting? That would mean to switch off from lockless
         * implementation of scst_translate_lun().. )
         */
@@ -530,6 +535,7 @@ int scst_suspend_activity(bool interruptible)
                goto out_clear;
 
        clear_bit(SCST_FLAG_SUSPENDING, &scst_flags);
+       /* See comment about smp_mb() above */
        smp_mb__after_clear_bit();
 
        TRACE_MGMT_DBG("Waiting for %d active commands finally to complete",
@@ -551,6 +557,7 @@ out:
 
 out_clear:
        clear_bit(SCST_FLAG_SUSPENDING, &scst_flags);
+       /* See comment about smp_mb() above */
        smp_mb__after_clear_bit();
        goto out_up;
 }
@@ -568,6 +575,10 @@ static void __scst_resume_activity(void)
                goto out;
 
        clear_bit(SCST_FLAG_SUSPENDED, &scst_flags);
+       /*
+        * The barrier is needed to make sure all woken up threads see the
+        * cleared flag. Not sure if it's really needed, but let's be safe.
+        */
        smp_mb__after_clear_bit();
 
        list_for_each_entry(l, &scst_cmd_lists_list, lists_list_entry) {
index 84b0698..b14dbf9 100644 (file)
@@ -420,6 +420,7 @@ static inline void scst_dec_on_dev_cmd(struct scst_cmd *cmd)
                scst_unblock_dev(dev);
 
        atomic_dec(&dev->on_dev_count);
+       /* See comment in scst_block_dev() */
        smp_mb__after_atomic_dec();
 
        TRACE_DBG("New on_dev_count %d", atomic_read(&dev->on_dev_count));
@@ -438,6 +439,7 @@ static inline void __scst_get(int barrier)
        TRACE_DBG("Incrementing scst_cmd_count(%d)",
                atomic_read(&scst_cmd_count));
 
+       /* See comment about smp_mb() in scst_suspend_activity() */
        if (barrier)
                smp_mb__after_atomic_inc();
 }
@@ -446,6 +448,7 @@ static inline void __scst_put(void)
 {
        int f;
        f = atomic_dec_and_test(&scst_cmd_count);
+       /* See comment about smp_mb() in scst_suspend_activity() */
        if (f && unlikely(test_bit(SCST_FLAG_SUSPENDED, &scst_flags))) {
                TRACE_MGMT_DBG("%s", "Waking up scst_dev_cmd_waitQ");
                wake_up_all(&scst_dev_cmd_waitQ);
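
The suspend side and the command side thus form the usual store/load pairing.
Condensed from scst_suspend_activity() and scst_translate_lun()/__scst_get()
(schematic):

    /* suspend side: */
    set_bit(SCST_FLAG_SUSPENDED, &scst_flags);
    smp_mb__after_set_bit();            /* flag store before scst_cmd_count read */
    /* then waits for atomic_read(&scst_cmd_count) to drain to 0 */

    /* command side: */
    atomic_inc(&scst_cmd_count);        /* __scst_get(1) */
    smp_mb__after_atomic_inc();         /* count store before the flag read */
    if (test_bit(SCST_FLAG_SUSPENDED, &scst_flags)) {
            /* back off; __scst_put() wakes scst_dev_cmd_waitQ when last */
    }
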
index 43bb002..4b470c2 100644 (file)
@@ -120,7 +120,8 @@ static int scst_init_cmd(struct scst_cmd *cmd, enum scst_exec_context *context)
        }
        /*
         * Memory barrier isn't necessary here, because CPU appears to
-        * be self-consistent
+        * be self-consistent and we don't care about the race described
+        * in the comment in scst_do_job_init().
         */
 
        rc = __scst_init_cmd(cmd);
@@ -885,6 +886,12 @@ static int scst_queue_retry_cmd(struct scst_cmd *cmd, int finished_cmds)
 
        spin_lock_irqsave(&tgt->tgt_lock, flags);
        tgt->retry_cmds++;
+       /*
+        * A memory barrier is needed here, because we need the exact
+        * ordering of the retry_cmds write and the finished_cmds read, so
+        * that we don't miss the case when a command finishes while we are
+        * queuing this one for retry after the finished_cmds check.
+        */
        smp_mb();
        TRACE_RETRY("TGT QUEUE FULL: incrementing retry_cmds %d",
              tgt->retry_cmds);
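
This pairs with scst_check_retries() above: each function stores to its own
counter before loading the other's, so whichever way the two race, at least
one side notices the other. Condensed (schematic):

    /* scst_queue_retry_cmd() side: */
    tgt->retry_cmds++;
    smp_mb();                           /* retry_cmds store before finished_cmds read */
    if (finished_cmds != atomic_read(&tgt->finished_cmds)) {
            /* a command finished meanwhile: don't park this cmd */
    }

    /* scst_check_retries() side: */
    atomic_inc(&tgt->finished_cmds);
    smp_mb__after_atomic_inc();         /* finished_cmds store before retry_cmds read */
    if (unlikely(tgt->retry_cmds > 0)) {
            /* move the queued retry commands back to the active list */
    }
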
@@ -1684,7 +1691,6 @@ int scst_check_local_events(struct scst_cmd *cmd)
                         * was_reset.
                         */
                        spin_lock_bh(&dev->dev_lock);
-                       barrier(); /* to reread was_reset */
                        if (dev->scsi_dev->was_reset) {
                                TRACE(TRACE_MGMT, "was_reset is %d", 1);
                                scst_set_cmd_error(cmd,
@@ -1764,12 +1770,16 @@ void scst_inc_expected_sn(struct scst_tgt_dev *tgt_dev, atomic_t *slot)
 
 inc:
        /*
-        * No locks is needed, because only one thread at time can
-        * be here (serialized by sn). Also it is supposed that there
-        * could not be half-incremented halves.
+        * No protection of expected_sn is needed, because only one thread
+        * at a time can be here (serialized by sn). Also, it is assumed
+        * that there can be no half-incremented halves.
         */
        tgt_dev->expected_sn++;
-       smp_mb(); /* write must be before def_cmd_count read */
+       /*
+        * The write must happen before the def_cmd_count read to stay in
+        * sync with scst_post_exec_sn(). See comment in
+        * scst_send_for_exec().
+        */
+       smp_mb();
        TRACE_SN("Next expected_sn: %ld", tgt_dev->expected_sn);
 
 out:
@@ -2168,6 +2178,17 @@ static int scst_send_for_exec(struct scst_cmd **active_cmd)
                spin_lock_irq(&tgt_dev->sn_lock);
 
                tgt_dev->def_cmd_count++;
+               /*
+                * A memory barrier is needed here to implement the lockless
+                * fast path. We need the exact ordering of the def_cmd_count
+                * write and the expected_sn read. Otherwise, we could miss
+                * the case when expected_sn was changed to be equal to
+                * cmd->sn while we are queuing cmd on the deferred list
+                * after the expected_sn check below, which would leave the
+                * command stuck forever. With the barrier, in such a case
+                * __scst_check_deferred_commands() will be called and it
+                * will take sn_lock, so we will be synchronized.
+                */
                smp_mb();
 
                expected_sn = tgt_dev->expected_sn;
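
Completion and submission form the same two-sided pairing here. Condensed
from scst_inc_expected_sn() and scst_send_for_exec() (schematic; the
deferred-list handling is simplified):

    /* completion side, scst_inc_expected_sn(): */
    tgt_dev->expected_sn++;
    smp_mb();                   /* expected_sn store before def_cmd_count read */
    /* a non-zero def_cmd_count is now seen, so
     * __scst_check_deferred_commands() runs under sn_lock */

    /* submission side, scst_send_for_exec(): */
    tgt_dev->def_cmd_count++;
    smp_mb();                   /* def_cmd_count store before expected_sn read */
    expected_sn = tgt_dev->expected_sn;
    if (cmd->sn == expected_sn) {
            /* fast path: execute now instead of deferring */
    }
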
@@ -3044,6 +3065,7 @@ static int scst_translate_lun(struct scst_cmd *cmd)
 
        TRACE_ENTRY();
 
+       /* See comment about smp_mb() in scst_suspend_activity() */
        __scst_get(1);
 
        if (likely(!test_bit(SCST_FLAG_SUSPENDED, &scst_flags))) {
@@ -3209,11 +3231,11 @@ restart:
                 * it's inserting to it, but another command at the same time
                 * seeing init cmd list empty and goes directly, because it
                 * could affect only commands from the same initiator to the
-                * same tgt_dev, but init_cmd_done() doesn't guarantee the order
-                * in case of simultaneous such calls anyway.
+                * same tgt_dev, but scst_cmd_init_done*() doesn't guarantee
+                * the order of such simultaneous calls anyway.
                 */
                TRACE_MGMT_DBG("Deleting cmd %p from init cmd list", cmd);
-               smp_wmb();
+               smp_wmb(); /* enforce the required order */
                list_del(&cmd->cmd_list_entry);
                spin_unlock(&scst_init_lock);
 
@@ -3571,6 +3593,7 @@ static int scst_mgmt_translate_lun(struct scst_mgmt_cmd *mcmd)
        TRACE_DBG("Finding tgt_dev for mgmt cmd %p (lun %lld)", mcmd,
              (long long unsigned int)mcmd->lun);
 
+       /* See comment about smp_mb() in scst_suspend_activity() */
        __scst_get(1);
 
        if (unlikely(test_bit(SCST_FLAG_SUSPENDED, &scst_flags) &&
@@ -3817,10 +3840,8 @@ void scst_abort_cmd(struct scst_cmd *cmd, struct scst_mgmt_cmd *mcmd,
 
        if (other_ini) {
                /* Might be necessary if command aborted several times */
-               if (!test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags)) {
+               if (!test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags))
                        set_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags);
-                       smp_mb__after_set_bit();
-               }
        } else {
                /* Might be necessary if command aborted several times */
                clear_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags);
diff --git a/srpt/README_in-tree b/srpt/README_in-tree
new file mode 100644 (file)
index 0000000..327ed09
--- /dev/null
@@ -0,0 +1,85 @@
+SCSI RDMA Protocol (SRP) Target driver for Linux
+=================================================
+
+The SRP Target driver is designed to work directly on top of the
+OpenFabrics OFED-1.x software stack (http://www.openfabrics.org) or
+the InfiniBand drivers in the Linux kernel tree
+(http://www.kernel.org). The SRP target driver also interfaces with
+the generic SCSI target mid-level driver called SCST
+(http://scst.sourceforge.net).
+
+How-to run
+-----------
+
+A. On the SRP target machine
+1. Please refer to SCST's README for loading the scst driver and its
+dev_handlers drivers (scst_disk, scst_vdisk in block or file I/O mode, nullio, ...)
+
+Example 1: working with real back-end SCSI disks
+a. modprobe scst
+b. modprobe scst_disk
+c. cat /proc/scsi_tgt/scsi_tgt
+
+ibstor00:~ # cat /proc/scsi_tgt/scsi_tgt
+Device (host:ch:id:lun or name)                             Device handler
+0:0:0:0                                                     dev_disk
+4:0:0:0                                                     dev_disk
+5:0:0:0                                                     dev_disk
+6:0:0:0                                                     dev_disk
+7:0:0:0                                                     dev_disk
+
+Now, to exclude the first SCSI disk and expose the last 4 SCSI disks as
+IB/SRP LUNs for I/O:
+echo "add 4:0:0:0 0" >/proc/scsi_tgt/groups/Default/devices
+echo "add 5:0:0:0 1" >/proc/scsi_tgt/groups/Default/devices
+echo "add 6:0:0:0 2" >/proc/scsi_tgt/groups/Default/devices
+echo "add 7:0:0:0 3" >/proc/scsi_tgt/groups/Default/devices
+
+Example 2: working with VDISK FILEIO mode (using md0 device and file 10G-file)
+a. modprobe scst
+b. modprobe scst_vdisk
+c. echo "open vdisk0 /dev/md0" > /proc/scsi_tgt/vdisk/vdisk
+d. echo "open vdisk1 /10G-file" > /proc/scsi_tgt/vdisk/vdisk
+e. echo "add vdisk0 0" >/proc/scsi_tgt/groups/Default/devices
+f. echo "add vdisk1 1" >/proc/scsi_tgt/groups/Default/devices
+
+Example 3: working with VDISK BLOCKIO mode (using md0 device, sda, and cciss/c1d0)
+a. modprobe scst
+b. modprobe scst_vdisk
+c. echo "open vdisk0 /dev/md0 BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+d. echo "open vdisk1 /dev/sda BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+e. echo "open vdisk2 /dev/cciss/c1d0 BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+f. echo "add vdisk0 0" >/proc/scsi_tgt/groups/Default/devices
+g. echo "add vdisk1 1" >/proc/scsi_tgt/groups/Default/devices
+h. echo "add vdisk2 2" >/proc/scsi_tgt/groups/Default/devices
+
+2. modprobe ib_srpt
+
+
+B. On initiator machines you can manually do the following steps:
+1. modprobe ib_srp
+2. ibsrpdm -c (to discover the new SRP target)
+3. echo <new target info> > /sys/class/infiniband_srp/srp-mthca0-1/add_target
+4. fdisk -l (will show the newly discovered SCSI disks)
+
+Example:
+Assume that you use port 1 of the first HCA in the system, i.e. mthca0.
+
+[root@lab104 ~]# ibsrpdm -c -d /dev/infiniband/umad0
+id_ext=0002c90200226cf4,ioc_guid=0002c90200226cf4,
+dgid=fe800000000000000002c90200226cf5,pkey=ffff,service_id=0002c90200226cf4
+[root@lab104 ~]# echo id_ext=0002c90200226cf4,ioc_guid=0002c90200226cf4,
+dgid=fe800000000000000002c90200226cf5,pkey=ffff,service_id=0002c90200226cf4 >
+/sys/class/infiniband_srp/srp-mthca0-1/add_target
+
+OR
+
++ You can edit /etc/infiniband/openib.conf to load the srp driver and the srp
+HA daemon automatically, i.e. set SRP_LOAD=yes and SRPHA_ENABLE=yes (see the
+example below)
++ To set up and use the high availability feature you need the dm-multipath
+driver and the multipath tool
++ Please refer to the OFED-1.x SRP user manual for more detailed instructions
+on how to enable/use the HA feature
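+
+For example, the relevant openib.conf settings would look like this
+(illustrative excerpt; other settings in the file are left unchanged):
+
+  # /etc/infiniband/openib.conf
+  SRP_LOAD=yes        # load the srp initiator driver at boot
+  SRPHA_ENABLE=yes    # start the SRP high availability daemon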
+
+To minimize QUEUE FULL conditions, you can apply the scst_increase_max_tgt_cmds
+patch from the SRPT package at
+http://sourceforge.net/project/showfiles.php?group_id=110471