* We use wait_event() to wait for the state change, but it checks its
* condition without any protection, so without cmnd_get() it is
* possible that req will die "immediately" after the state assignment
- * and wake_up() will operate on dead data.
+ * and wake_up() will operate on dead data. We use the ordered version
+ * of cmnd_get(), because "get" must be done before the state
+ * assignment.
*/
cmnd_get_ordered(req);
req->scst_state = new_state;
sBUG();
#endif
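+/*
+ * Illustrative sketch of the hazard (simplified, not the code above
+ * verbatim; the waitqueue name is invented for this sketch):
+ *
+ *	cmnd_get_ordered(req);		<-- ref taken and made visible
+ *	req->scst_state = new_state;	<-- waiter may now see the state
+ *	wake_up(&some_waitQ);		<-- safe: our ref keeps req alive
+ *	cmnd_put(req);
+ *
+ * If the get could be reordered after the state store, the waiter
+ * could see the new state, complete req and drop the last reference
+ * before our get lands, so wake_up() would touch freed memory.
+ */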
+ /*
+ * "_ordered" here to protect from reorder, which can lead to
+ * preliminary connection destroy in req_cmnd_release(). Just in
+ * case, actually, because reordering shouldn't go so far, but who
+ * knows..
+ */
conn_get_ordered(conn);
req_cmnd_release(req);
iscsi_try_local_processing(conn);
static inline void cmnd_get_ordered(struct iscsi_cmnd *cmnd)
{
cmnd_get(cmnd);
+ /* See comments for each cmnd_get_ordered() use */
smp_mb__after_atomic_inc();
}
static inline void conn_get_ordered(struct iscsi_conn *conn)
{
conn_get(conn);
+ /* See comments for each conn_get_ordered() use */
smp_mb__after_atomic_inc();
}
sBUG_ON(atomic_read(&conn->conn_ref_cnt) == 0);
/*
- * It always ordered to protect from undesired side effects like
- * accessing just destroyed object because of this *_dec() reordering.
+ * Make it always ordered to protect against undesired side effects,
+ * like accessing a conn just destroyed by close_conn() because of
+ * reordering of this atomic_dec().
*/
smp_mb__before_atomic_dec();
atomic_dec(&conn->conn_ref_cnt);
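+/*
+ * Illustrative sketch (simplified):
+ *
+ *	this CPU			CPU releasing the last ref
+ *	--------			--------------------------
+ *	last accesses to conn
+ *	smp_mb__before_atomic_dec();
+ *	atomic_dec(&conn_ref_cnt);	sees conn_ref_cnt == 0,
+ *					close_conn() frees conn
+ *
+ * Without the barrier, the dec could become visible before our last
+ * accesses to conn have completed, and those accesses would then
+ * race with the destruction of conn.
+ */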
fi \
| process_patch "srpt.diff"
-add_file "srpt/README" "Documentation/scst/README.srpt" \
+add_file "srpt/README_in-tree" "Documentation/scst/README.srpt" \
| process_patch "srpt-doc.diff"
- General setup->Configure standard kernel features (for small systems): ON
+ - General setup->Prompt for development and/or incomplete code/drivers: ON
+
- Processor type and features->High Memory Support: OFF
- Processor type and features->Memory split: according to amount of
- General setup->Configure standard kernel features (for small systems): ON
+ - General setup->Prompt for development and/or incomplete code/drivers: ON
+
- Processor type and features->High Memory Support: OFF
- Processor type and features->Memory split: according to amount of
if (unlikely((cmd->data_direction == SCST_DATA_READ) ||
(cmd->resp_data_len != 0)))
goto out_inval;
+ /*
+ * The background_exec assignment must be after the ucmd get.
+ * Otherwise, due to reordering, it is possible that in
+ * dev_user_process_reply() ucmd is destroyed before the
+ * reference taken here becomes visible.
+ */
ucmd_get_ordered(ucmd);
ucmd->background_exec = 1;
TRACE_DBG("Background ucmd %p", ucmd);
goto out_unlock;
}
+ /* To sync. with dev_user_process_reply_exec(). See comment there. */
+ smp_mb();
if (ucmd->background_exec) {
state = UCMD_STATE_EXECING;
goto unlock_process;
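+/*
+ * The two paths, sketched side by side (simplified from the code
+ * above):
+ *
+ *	dev_user_process_reply_exec()	dev_user_process_reply()
+ *	-----------------------------	------------------------
+ *	ucmd_get_ordered(ucmd);
+ *	ucmd->background_exec = 1;
+ *					smp_mb();
+ *					if (ucmd->background_exec)
+ *						use ucmd: the ref is
+ *						guaranteed visible
+ */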
/* +1 to prevent erroneous too early command completion */
atomic_set(&blockio_work->bios_inflight, bios+1);
- smp_wmb();
while (hbio) {
bio = hbio;
/*
* We don't worry about overflow of finished_cmds, because we check
- * only for its change
+ * only for its change.
*/
atomic_inc(&tgt->finished_cmds);
+ /* See comment in scst_queue_retry_cmd() */
smp_mb__after_atomic_inc();
if (unlikely(tgt->retry_cmds > 0)) {
struct scst_cmd *c, *tc;
spin_lock_irqsave(&tgt->tgt_lock, flags);
list_for_each_entry_safe(c, tc, &tgt->retry_cmd_list,
- cmd_list_entry)
- {
+ cmd_list_entry) {
tgt->retry_cmds--;
TRACE_RETRY("Moving retry cmd %p to head of active "
__scst_block_dev(dev);
spin_unlock_bh(&dev->dev_lock);
- /* spin_unlock_bh() doesn't provide the necessary memory barrier */
+ /*
+ * A memory barrier is necessary here, because the read of
+ * on_dev_count in wait_event() below must happen after we have
+ * increased block_count; otherwise, we can miss a wake-up from
+ * scst_dec_on_dev_cmd(). An explicit barrier is used, because
+ * spin_unlock_bh() doesn't provide the necessary memory barrier
+ * functionality.
+ */
smp_mb();
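+/*
+ * The pairing, sketched (simplified; the other side is
+ * scst_dec_on_dev_cmd(), also touched by this patch):
+ *
+ *	scst_block_dev()		scst_dec_on_dev_cmd()
+ *	----------------		---------------------
+ *	block_count++;			atomic_dec(&on_dev_count);
+ *	smp_mb();			smp_mb__after_atomic_dec();
+ *	read on_dev_count		read block_count, wake up
+ *
+ * At least one side must see the other's write: either we observe
+ * on_dev_count already dropped, or the decrementing side observes
+ * the raised block_count and issues the wake-up.
+ */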
TRACE_MGMT_DBG("Waiting during blocking outstanding %d (on_dev_count "
spin_lock_bh(&dev->dev_lock);
if (unlikely(test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags)))
goto out_unlock;
- barrier(); /* to reread block_count */
if (dev->block_count > 0) {
scst_dec_on_dev_cmd(cmd);
TRACE_MGMT_DBG("Delaying cmd %p due to blocking or "
}
if (unlikely(dev->dev_serialized)) {
spin_lock_bh(&dev->dev_lock);
- barrier(); /* to reread block_count */
if (dev->block_count == 0) {
TRACE_MGMT_DBG("cmd %p (tag %llu), blocking further "
"cmds due to serializing (dev %p)", cmd,
scst_done_cmd_mgmt(cmd);
- smp_rmb();
if (test_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags)) {
if (cmd->completed) {
/* It's completed and it's OK to return its result */
{
TRACE_MGMT_DBG("%s", "delayed cmd timer expired");
tm_dbg_flags.tm_dbg_release = 1;
+ /* Used to make sure that all woken up threads see the new value */
smp_wmb();
wake_up_all(tm_dbg_p_cmd_list_waitQ);
}
tm_dbg_delayed_cmds_count);
tm_dbg_change_state();
tm_dbg_flags.tm_dbg_release = 1;
+ /*
+ * Used to make sure that all woken up threads see the new
+ * value.
+ */
smp_wmb();
if (tm_dbg_p_cmd_list_waitQ != NULL)
wake_up_all(tm_dbg_p_cmd_list_waitQ);
set_bit(SCST_FLAG_SUSPENDING, &scst_flags);
set_bit(SCST_FLAG_SUSPENDED, &scst_flags);
+ /*
+ * Assignment of SCST_FLAG_SUSPENDING and SCST_FLAG_SUSPENDED must be
+ * ordered with scst_cmd_count. Otherwise lockless logic in
+ * scst_translate_lun() and scst_mgmt_translate_lun() won't work.
+ */
smp_mb__after_set_bit();
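+/*
+ * The pairing with the command-arrival side, sketched (simplified;
+ * the inc side is __scst_get(), called from scst_translate_lun()
+ * and scst_mgmt_translate_lun()):
+ *
+ *	scst_suspend_activity()		__scst_get()
+ *	-----------------------		------------
+ *	set_bit(SUSPEND flags);		atomic_inc(&scst_cmd_count);
+ *	smp_mb__after_set_bit();	smp_mb__after_atomic_inc();
+ *	read scst_cmd_count		test_bit(SUSPEND flags)
+ *
+ * Whatever the interleaving, either the suspender counts the new
+ * command, or the new command sees the suspend flag set.
+ */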
/*
* See comment in scst_user.c::dev_user_task_mgmt_fn() for more
* information about scst_user behavior.
*
- * ToDo: make the global suspending unneeded (Switch to per-device
+ * ToDo: make the global suspending unneeded (switch to per-device
* reference counting? That would mean to switch off from lockless
* implementation of scst_translate_lun().. )
*/
goto out_clear;
clear_bit(SCST_FLAG_SUSPENDING, &scst_flags);
+ /* See comment about smp_mb() above */
smp_mb__after_clear_bit();
TRACE_MGMT_DBG("Waiting for %d active commands finally to complete",
out_clear:
clear_bit(SCST_FLAG_SUSPENDING, &scst_flags);
+ /* See comment about smp_mb() above */
smp_mb__after_clear_bit();
goto out_up;
}
goto out;
clear_bit(SCST_FLAG_SUSPENDED, &scst_flags);
+ /*
+ * The barrier is needed to make sure all woken up threads see the
+ * cleared flag. Not sure if it's really needed, but let's be safe.
+ */
smp_mb__after_clear_bit();
list_for_each_entry(l, &scst_cmd_lists_list, lists_list_entry) {
scst_unblock_dev(dev);
atomic_dec(&dev->on_dev_count);
+ /* See comment in scst_block_dev() */
smp_mb__after_atomic_dec();
TRACE_DBG("New on_dev_count %d", atomic_read(&dev->on_dev_count));
TRACE_DBG("Incrementing scst_cmd_count(%d)",
atomic_read(&scst_cmd_count));
+ /* See comment about smp_mb() in scst_suspend_activity() */
if (barrier)
smp_mb__after_atomic_inc();
}
{
int f;
f = atomic_dec_and_test(&scst_cmd_count);
+ /* See comment about smp_mb() in scst_suspend_activity() */
if (f && unlikely(test_bit(SCST_FLAG_SUSPENDED, &scst_flags))) {
TRACE_MGMT_DBG("%s", "Waking up scst_dev_cmd_waitQ");
wake_up_all(&scst_dev_cmd_waitQ);
}
/*
* Memory barrier isn't necessary here, because CPU appears to
- * be self-consistent
+ * be self-consistent and we don't care about the race described
+ * in the comment in scst_do_job_init().
*/
rc = __scst_init_cmd(cmd);
spin_lock_irqsave(&tgt->tgt_lock, flags);
tgt->retry_cmds++;
+ /*
+ * A memory barrier is needed here, because the write to retry_cmds
+ * must be ordered before the read of finished_cmds, so that we
+ * don't miss the case when a command finishes while we are queuing
+ * this one for retry after the finished_cmds check.
+ */
smp_mb();
TRACE_RETRY("TGT QUEUE FULL: incrementing retry_cmds %d",
tgt->retry_cmds);
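+/*
+ * The intended pairing, sketched (the finishing side is the
+ * finished_cmds hunk above; its enclosing function is not visible
+ * in this patch):
+ *
+ *	scst_queue_retry_cmd()		finishing side
+ *	----------------------		--------------
+ *	retry_cmds++;			atomic_inc(&finished_cmds);
+ *	smp_mb();			smp_mb__after_atomic_inc();
+ *	read finished_cmds		read retry_cmds
+ *
+ * One side must see the other's increment, so a concurrently
+ * finishing command cannot be missed by both.
+ */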
* was_reset.
*/
spin_lock_bh(&dev->dev_lock);
- barrier(); /* to reread was_reset */
if (dev->scsi_dev->was_reset) {
TRACE(TRACE_MGMT, "was_reset is %d", 1);
scst_set_cmd_error(cmd,
inc:
/*
- * No locks is needed, because only one thread at time can
- * be here (serialized by sn). Also it is supposed that there
- * could not be half-incremented halves.
+ * No protection of expected_sn is needed, because only one thread
+ * at a time can be here (serialized by sn). It is also assumed
+ * that the increment can not be observed half-done (no torn
+ * writes).
*/
tgt_dev->expected_sn++;
- smp_mb(); /* write must be before def_cmd_count read */
+ /*
+ * The write must happen before the def_cmd_count read to stay in
+ * sync with scst_post_exec_sn(). See comment in scst_send_for_exec().
+ */
+ smp_mb();
TRACE_SN("Next expected_sn: %ld", tgt_dev->expected_sn);
out:
spin_lock_irq(&tgt_dev->sn_lock);
tgt_dev->def_cmd_count++;
+ /*
+ * A memory barrier is needed here to implement the lockless fast
+ * path: the write to def_cmd_count must be ordered before the
+ * read of expected_sn. Otherwise, we could miss the case when
+ * expected_sn is changed to be equal to cmd->sn while we are
+ * queuing cmd on the deferred list after the expected_sn check
+ * below, which would leave the command stuck forever. With the
+ * barrier, in such a case __scst_check_deferred_commands() will
+ * be called, and it will take sn_lock, so we will be
+ * synchronized.
+ */
smp_mb();
expected_sn = tgt_dev->expected_sn;
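+/*
+ * Sketched against the expected_sn increment above (simplified; per
+ * the comments, this deferring path is in scst_send_for_exec()):
+ *
+ *	deferring side			expected_sn inc side
+ *	--------------			--------------------
+ *	def_cmd_count++;		expected_sn++;
+ *	smp_mb();			smp_mb();
+ *	read expected_sn		read def_cmd_count
+ *
+ * Either this side sees the just-incremented expected_sn and doesn't
+ * defer, or the incrementing side sees def_cmd_count raised and
+ * calls __scst_check_deferred_commands().
+ */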
TRACE_ENTRY();
+ /* See comment about smp_mb() in scst_suspend_activity() */
__scst_get(1);
if (likely(!test_bit(SCST_FLAG_SUSPENDED, &scst_flags))) {
* it's inserting to it, but another command at the same time
* seeing init cmd list empty and goes directly, because it
* could affect only commands from the same initiator to the
- * same tgt_dev, but init_cmd_done() doesn't guarantee the order
- * in case of simultaneous such calls anyway.
+ * same tgt_dev, but scst_cmd_init_done*() doesn't guarantee
+ * the order in case of simultaneous such calls anyway.
*/
TRACE_MGMT_DBG("Deleting cmd %p from init cmd list", cmd);
- smp_wmb();
+ smp_wmb(); /* enforce the required order */
list_del(&cmd->cmd_list_entry);
spin_unlock(&scst_init_lock);
TRACE_DBG("Finding tgt_dev for mgmt cmd %p (lun %lld)", mcmd,
(long long unsigned int)mcmd->lun);
+ /* See comment about smp_mb() in scst_suspend_activity() */
__scst_get(1);
if (unlikely(test_bit(SCST_FLAG_SUSPENDED, &scst_flags) &&
if (other_ini) {
/* Might be necessary if command aborted several times */
- if (!test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags)) {
+ if (!test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags))
set_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags);
- smp_mb__after_set_bit();
- }
} else {
/* Might be necessary if command aborted several times */
clear_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags);
--- /dev/null
+SCSI RDMA Protocol (SRP) Target driver for Linux
+=================================================
+
+The SRP Target driver is designed to work directly on top of the
+OpenFabrics OFED-1.x software stack (http://www.openfabrics.org) or
+the Infiniband drivers in the Linux kernel tree
+(http://www.kernel.org). The SRP target driver also interfaces with
+the generic SCSI target mid-level driver called SCST
+(http://scst.sourceforge.net).
+
+How-to run
+-----------
+
+A. On srp target machine
+1. Please refer to SCST's README for loading the scst driver and its
+dev_handlers drivers (scst_disk, scst_vdisk in block or file IO mode,
+nullio, ...)
+
+Example 1: working with real back-end scsi disks
+a. modprobe scst
+b. modprobe scst_disk
+c. cat /proc/scsi_tgt/scsi_tgt
+
+ibstor00:~ # cat /proc/scsi_tgt/scsi_tgt
+Device (host:ch:id:lun or name) Device handler
+0:0:0:0 dev_disk
+4:0:0:0 dev_disk
+5:0:0:0 dev_disk
+6:0:0:0 dev_disk
+7:0:0:0 dev_disk
+
+Now, to exclude the first scsi disk and expose the last 4 scsi disks as
+IB/SRP luns for I/O:
+echo "add 4:0:0:0 0" >/proc/scsi_tgt/groups/Default/devices
+echo "add 5:0:0:0 1" >/proc/scsi_tgt/groups/Default/devices
+echo "add 6:0:0:0 2" >/proc/scsi_tgt/groups/Default/devices
+echo "add 7:0:0:0 3" >/proc/scsi_tgt/groups/Default/devices
+
+Example 2: working with VDISK FILEIO mode (using md0 device and file 10G-file)
+a. modprobe scst
+b. modprobe scst_vdisk
+c. echo "open vdisk0 /dev/md0" > /proc/scsi_tgt/vdisk/vdisk
+d. echo "open vdisk1 /10G-file" > /proc/scsi_tgt/vdisk/vdisk
+e. echo "add vdisk0 0" >/proc/scsi_tgt/groups/Default/devices
+f. echo "add vdisk1 1" >/proc/scsi_tgt/groups/Default/devices
+
+Example 3: working with VDISK BLOCKIO mode (using md0 device, sda, and cciss/c1d0)
+a. modprobe scst
+b. modprobe scst_vdisk
+c. echo "open vdisk0 /dev/md0 BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+d. echo "open vdisk1 /dev/sda BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+e. echo "open vdisk2 /dev/cciss/c1d0 BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+f. echo "add vdisk0 0" >/proc/scsi_tgt/groups/Default/devices
+g. echo "add vdisk1 1" >/proc/scsi_tgt/groups/Default/devices
+h. echo "add vdisk2 2" >/proc/scsi_tgt/groups/Default/devices
+
+2. modprobe ib_srpt
+
+
+B. On initiator machines you can manually do the following steps:
+1. modprobe ib_srp
+2. ibsrpdm -c (to discover new SRP targets)
+3. echo <new target info> > /sys/class/infiniband_srp/srp-mthca0-1/add_target
+4. fdisk -l (will show new discovered scsi disks)
+
+Example:
+Assume that you use port 1 of the first HCA in the system, i.e. mthca0
+
+[root@lab104 ~]# ibsrpdm -c -d /dev/infiniband/umad0
+id_ext=0002c90200226cf4,ioc_guid=0002c90200226cf4,
+dgid=fe800000000000000002c90200226cf5,pkey=ffff,service_id=0002c90200226cf4
+[root@lab104 ~]# echo id_ext=0002c90200226cf4,ioc_guid=0002c90200226cf4,
+dgid=fe800000000000000002c90200226cf5,pkey=ffff,service_id=0002c90200226cf4 >
+/sys/class/infiniband_srp/srp-mthca0-1/add_target
+
+OR
+
++ You can edit /etc/infiniband/openib.conf to load the srp driver and the
+srp HA daemon automatically, i.e. set SRP_LOAD=yes and SRPHA_ENABLE=yes
++ To set up and use the high availability feature you need the dm-multipath
+driver and the multipath tool
++ Please refer to the OFED-1.x SRP user manual for more detailed
+instructions on how to enable/use the HA feature
+
+To minimize QUEUEFULL conditions, you can apply the scst_increase_max_tgt_cmds
+patch from the SRPT package at
+http://sourceforge.net/project/showfiles.php?group_id=110471