(target mode)
There is a race between the following:
CPU 1:
scst_hw_pending_work_fn() ->
sqa_on_hw_pending_cmd_timeout() ->
qlt_abort_cmd() ->
qlt_unmap_sg()
CPU 2:
qla_do_work() ->
qla24xx_process_response_queue() ->
qlt_do_ctio_completion() ->
qlt_unmap_sg()
Two CPUs calling qlt_unmap_sg() on the same cmd at the same time
results in an oops:
dma_unmap_sg_attrs()
BUG_ON(!valid_dma_direction(dir));
This race is more likely to happen because qlt_abort_cmd() may cause the
hardware to send a CTIO.
The solution is to lock cmd->qpair->qp_lock_ptr when aborting a command.
This makes it possible to check the cmd state and make decisions about
what to do without racing with the CTIO handler and other code.
- Lock cmd->qpair->qp_lock_ptr when aborting a cmd.
- Eliminate cmd->cmd_lock and change cmd->aborted back to a bitfield
since it is now protected by qp_lock_ptr just like all the other
flags.
- Add another command state QLA_TGT_STATE_DONE to avoid any possible
races between qlt_abort_cmd() and tgt_ops->free_cmd().
- Add the cmd->sent_term_exchg flag to indicate if
qlt_send_term_exchange() has already been called.
- For SCST (scst_hw_pending_work_fn()), export qlt_send_term_exchange()
and qlt_unmap_sg() so that they can be called directly instead of
trying to make qlt_abort_cmd() work for both HW timeout and TMR abort.
- Add TRC_CTIO_IGNORED for scst_hw_pending_work_fn().
Fixes: 26f9ce53817a ("scsi: qla2xxx: Fix missed DMA unmap for aborted commands")
Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
---
drivers/scsi/qla2xxx/qla_target.c | 204 +++++++++++++----------------
drivers/scsi/qla2xxx/qla_target.h | 19 ++-
drivers/scsi/qla2xxx/tcm_qla2xxx.c | 2 +
3 files changed, 105 insertions(+), 120 deletions(-)
diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c
index b700bfc642b3..1160ca4c118f 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -104,8 +104,6 @@ static void qlt_response_pkt(struct scsi_qla_host *ha, struct rsp_que *rsp,
response_t *pkt);
static int qlt_issue_task_mgmt(struct fc_port *sess, u64 lun,
int fn, void *iocb, int flags);
-static void qlt_send_term_exchange(struct qla_qpair *, struct qla_tgt_cmd
- *cmd, struct atio_from_isp *atio, int ha_locked, int ul_abort);
static void qlt_alloc_qfull_cmd(struct scsi_qla_host *vha,
struct atio_from_isp *atio, uint16_t status, int qfull);
static void qlt_disable_vha(struct scsi_qla_host *vha);
@@ -136,20 +134,6 @@ static struct workqueue_struct *qla_tgt_wq;
static DEFINE_MUTEX(qla_tgt_mutex);
static LIST_HEAD(qla_tgt_glist);
-static const char *prot_op_str(u32 prot_op)
-{
- switch (prot_op) {
- case TARGET_PROT_NORMAL: return "NORMAL";
- case TARGET_PROT_DIN_INSERT: return "DIN_INSERT";
- case TARGET_PROT_DOUT_INSERT: return "DOUT_INSERT";
- case TARGET_PROT_DIN_STRIP: return "DIN_STRIP";
- case TARGET_PROT_DOUT_STRIP: return "DOUT_STRIP";
- case TARGET_PROT_DIN_PASS: return "DIN_PASS";
- case TARGET_PROT_DOUT_PASS: return "DOUT_PASS";
- default: return "UNKNOWN";
- }
-}
-
/* This API intentionally takes dest as a parameter, rather than returning
* int value to avoid caller forgetting to issue wmb() after the store */
void qlt_do_generation_tick(struct scsi_qla_host *vha, int *dest)
@@ -252,7 +236,7 @@ static void qlt_queue_unknown_atio(scsi_qla_host_t *vha,
return;
out_term:
- qlt_send_term_exchange(vha->hw->base_qpair, NULL, atio, ha_locked, 0);
+ qlt_send_term_exchange(vha->hw->base_qpair, NULL, atio, ha_locked);
goto out;
}
@@ -271,7 +255,7 @@ static void qlt_try_to_dequeue_unknown_atios(struct scsi_qla_host *vha,
"Freeing unknown %s %p, because of Abort\n",
"ATIO_TYPE7", u);
qlt_send_term_exchange(vha->hw->base_qpair, NULL,
- &u->atio, ha_locked, 0);
+ &u->atio, ha_locked);
goto abort;
}
@@ -285,7 +269,7 @@ static void qlt_try_to_dequeue_unknown_atios(struct scsi_qla_host *vha,
"Freeing unknown %s %p, because tgt is being stopped\n",
"ATIO_TYPE7", u);
qlt_send_term_exchange(vha->hw->base_qpair, NULL,
- &u->atio, ha_locked, 0);
+ &u->atio, ha_locked);
} else {
ql_dbg(ql_dbg_async + ql_dbg_verbose, vha, 0x503d,
"Reschedule u %p, vha %p, host %p\n", u, vha, host);
@@ -2447,7 +2431,7 @@ static int qlt_pci_map_calc_cnt(struct qla_tgt_prm *prm)
return -1;
}
-static void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd)
+void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd)
{
struct qla_hw_data *ha;
struct qla_qpair *qpair;
@@ -2473,6 +2457,7 @@ static void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd)
dma_pool_free(ha->dl_dma_pool, cmd->ctx, cmd->ctx->crc_ctx_dma);
}
+EXPORT_SYMBOL(qlt_unmap_sg);
static int qlt_check_reserve_free_req(struct qla_qpair *qpair,
uint32_t req_cnt)
@@ -3461,7 +3446,6 @@ qlt_handle_dif_error(struct qla_qpair *qpair, struct qla_tgt_cmd *cmd,
uint8_t *ep = &sts->expected_dif[0];
uint64_t lba = cmd->se_cmd.t_task_lba;
uint8_t scsi_status, sense_key, asc, ascq;
- unsigned long flags;
struct scsi_qla_host *vha = cmd->vha;
cmd->trc_flags |= TRC_DIF_ERR;
@@ -3535,13 +3519,10 @@ qlt_handle_dif_error(struct qla_qpair *qpair, struct qla_tgt_cmd *cmd,
vha->hw->tgt.tgt_ops->handle_data(cmd);
break;
default:
- spin_lock_irqsave(&cmd->cmd_lock, flags);
- if (cmd->aborted) {
- spin_unlock_irqrestore(&cmd->cmd_lock, flags);
+ if (cmd->sent_term_exchg) {
vha->hw->tgt.tgt_ops->free_cmd(cmd);
break;
}
- spin_unlock_irqrestore(&cmd->cmd_lock, flags);
qlt_send_resp_ctio(qpair, cmd, scsi_status, sense_key, asc,
ascq);
@@ -3696,9 +3677,22 @@ static int __qlt_send_term_exchange(struct qla_qpair *qpair,
return 0;
}
-static void qlt_send_term_exchange(struct qla_qpair *qpair,
- struct qla_tgt_cmd *cmd, struct atio_from_isp *atio, int ha_locked,
- int ul_abort)
+/*
+ * Aborting a command that is active in the FW (i.e. cmd->cmd_sent_to_fw == 1)
+ * will usually trigger the FW to send a completion CTIO with error status,
+ * and the driver will then call the ->handle_data() or ->free_cmd() callbacks.
+ * This can be used to clear a command that is locked up in the FW unless there
+ * is something more seriously wrong.
+ *
+ * Aborting a command that is not active in the FW (i.e.
+ * cmd->cmd_sent_to_fw == 0) will not directly trigger any callbacks. Instead,
+ * when the target mode midlevel calls qlt_rdy_to_xfer() or
+ * qlt_xmit_response(), the driver will see that the cmd has been aborted and
+ * call the appropriate callback immediately without performing the requested
+ * operation.
+ */
+void qlt_send_term_exchange(struct qla_qpair *qpair,
+ struct qla_tgt_cmd *cmd, struct atio_from_isp *atio, int ha_locked)
{
struct scsi_qla_host *vha;
unsigned long flags = 0;
@@ -3722,10 +3716,14 @@ static void qlt_send_term_exchange(struct qla_qpair *qpair,
qlt_alloc_qfull_cmd(vha, atio, 0, 0);
done:
- if (cmd && !ul_abort && !cmd->aborted) {
- if (cmd->sg_mapped)
- qlt_unmap_sg(vha, cmd);
- vha->hw->tgt.tgt_ops->free_cmd(cmd);
+ if (cmd) {
+ /*
+ * Set this even if -ENOMEM above, since term exchange will be
+ * sent eventually...
+ */
+ cmd->sent_term_exchg = 1;
+ cmd->aborted = 1;
+ cmd->jiffies_at_term_exchg = jiffies;
}
if (!ha_locked)
@@ -3733,6 +3731,7 @@ static void qlt_send_term_exchange(struct qla_qpair *qpair,
return;
}
+EXPORT_SYMBOL(qlt_send_term_exchange);
static void qlt_init_term_exchange(struct scsi_qla_host *vha)
{
@@ -3783,38 +3782,35 @@ static void qlt_chk_exch_leak_thresh_hold(struct scsi_qla_host *vha)
int qlt_abort_cmd(struct qla_tgt_cmd *cmd)
{
- struct qla_tgt *tgt = cmd->tgt;
- struct scsi_qla_host *vha = tgt->vha;
- struct se_cmd *se_cmd = &cmd->se_cmd;
+ struct scsi_qla_host *vha = cmd->vha;
+ struct qla_qpair *qpair = cmd->qpair;
unsigned long flags;
- ql_dbg(ql_dbg_tgt_mgt, vha, 0xf014,
- "qla_target(%d): terminating exchange for aborted cmd=%p "
- "(se_cmd=%p, tag=%llu)", vha->vp_idx, cmd, &cmd->se_cmd,
- se_cmd->tag);
-
- spin_lock_irqsave(&cmd->cmd_lock, flags);
- if (cmd->aborted) {
- if (cmd->sg_mapped)
- qlt_unmap_sg(vha, cmd);
+ spin_lock_irqsave(qpair->qp_lock_ptr, flags);
- spin_unlock_irqrestore(&cmd->cmd_lock, flags);
- /*
- * It's normal to see 2 calls in this path:
- * 1) XFER Rdy completion + CMD_T_ABORT
- * 2) TCM TMR - drain_state_list
- */
- ql_dbg(ql_dbg_tgt_mgt, vha, 0xf016,
- "multiple abort. %p transport_state %x, t_state %x, "
- "se_cmd_flags %x\n", cmd, cmd->se_cmd.transport_state,
- cmd->se_cmd.t_state, cmd->se_cmd.se_cmd_flags);
- return -EIO;
+ ql_dbg(ql_dbg_tgt_mgt, vha, 0xf014,
+ "qla_target(%d): tag %lld: cmd being aborted (state %d) %s; %s\n",
+ vha->vp_idx, cmd->se_cmd.tag, cmd->state,
+ cmd->cmd_sent_to_fw ? "sent to fw" : "not sent to fw",
+ cmd->aborted ? "aborted" : "not aborted");
+
+ if (cmd->state != QLA_TGT_STATE_DONE && !cmd->sent_term_exchg) {
+ if (!qpair->fw_started ||
+ cmd->reset_count != qpair->chip_reset) {
+ /*
+ * Chip was reset; just pretend that we sent the term
+ * exchange.
+ */
+ cmd->sent_term_exchg = 1;
+ cmd->aborted = 1;
+ cmd->jiffies_at_term_exchg = jiffies;
+ } else {
+ qlt_send_term_exchange(qpair, cmd, &cmd->atio, 1);
+ }
}
- cmd->aborted = 1;
- cmd->trc_flags |= TRC_ABORT;
- spin_unlock_irqrestore(&cmd->cmd_lock, flags);
- qlt_send_term_exchange(cmd->qpair, cmd, &cmd->atio, 0, 1);
+ spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
+
return 0;
}
EXPORT_SYMBOL(qlt_abort_cmd);
@@ -3845,40 +3841,6 @@ void qlt_free_cmd(struct qla_tgt_cmd *cmd)
}
EXPORT_SYMBOL(qlt_free_cmd);
-/*
- * ha->hardware_lock supposed to be held on entry. Might drop it, then reaquire
- */
-static int qlt_term_ctio_exchange(struct qla_qpair *qpair, void *ctio,
- struct qla_tgt_cmd *cmd, uint32_t status)
-{
- int term = 0;
- struct scsi_qla_host *vha = qpair->vha;
-
- if (cmd->se_cmd.prot_op)
- ql_dbg(ql_dbg_tgt_dif, vha, 0xe013,
- "Term DIF cmd: lba[0x%llx|%lld] len[0x%x] "
- "se_cmd=%p tag[%x] op %#x/%s",
- cmd->lba, cmd->lba,
- cmd->num_blks, &cmd->se_cmd,
- cmd->atio.u.isp24.exchange_addr,
- cmd->se_cmd.prot_op,
- prot_op_str(cmd->se_cmd.prot_op));
-
- if (ctio != NULL) {
- struct ctio7_from_24xx *c = (struct ctio7_from_24xx *)ctio;
-
- term = !(c->flags &
- cpu_to_le16(OF_TERM_EXCH));
- } else
- term = 1;
-
- if (term)
- qlt_send_term_exchange(qpair, cmd, &cmd->atio, 1, 0);
-
- return term;
-}
-
-
/* ha->hardware_lock supposed to be held on entry */
static void *qlt_ctio_to_cmd(struct scsi_qla_host *vha,
struct rsp_que *rsp, uint32_t handle, void *ctio)
@@ -3984,22 +3946,35 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha,
qlt_unmap_sg(vha, cmd);
if (unlikely(status != CTIO_SUCCESS)) {
+ bool term_exchg = false;
+
+ /*
+ * If the hardware terminated the exchange, then we don't need
+ * to send an explicit term exchange message.
+ */
+ if (ctio_flags & OF_TERM_EXCH) {
+ cmd->sent_term_exchg = 1;
+ cmd->aborted = 1;
+ cmd->jiffies_at_term_exchg = jiffies;
+ }
+
switch (status & 0xFFFF) {
case CTIO_INVALID_RX_ID:
+ term_exchg = true;
if (printk_ratelimit())
dev_info(&vha->hw->pdev->dev,
"qla_target(%d): CTIO with INVALID_RX_ID ATIO attr %x CTIO Flags %x|%x\n",
vha->vp_idx, cmd->atio.u.isp24.attr,
((cmd->ctio_flags >> 9) & 0xf),
cmd->ctio_flags);
-
break;
+
case CTIO_LIP_RESET:
case CTIO_TARGET_RESET:
case CTIO_ABORTED:
- /* driver request abort via Terminate exchange */
+ term_exchg = true;
+ fallthrough;
case CTIO_TIMEOUT:
- /* They are OK */
ql_dbg(ql_dbg_tgt_mgt, vha, 0xf058,
"qla_target(%d): CTIO with "
"status %#x received, state %x, se_cmd %p, "
@@ -4020,6 +3995,7 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha,
logged_out ? "PORT LOGGED OUT" : "PORT UNAVAILABLE",
status, cmd->state, se_cmd);
+ term_exchg = true;
if (logged_out && cmd->sess) {
/*
* Session is already logged out, but we need
@@ -4065,19 +4041,21 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha,
break;
}
+ cmd->trc_flags |= TRC_CTIO_ERR;
- /* "cmd->aborted" means
- * cmd is already aborted/terminated, we don't
- * need to terminate again. The exchange is already
- * cleaned up/freed at FW level. Just cleanup at driver
- * level.
+ /*
+ * In state QLA_TGT_STATE_NEED_DATA the failed CTIO was for
+ * Data-Out, so either abort the exchange or try sending check
+ * condition with sense data depending on the severity of
+ * the error. In state QLA_TGT_STATE_PROCESSED the failed CTIO
+ * was for status (and possibly Data-In), so don't try sending
+ * an error status again in that case (if the error was for
+ * Data-In with status, we could try sending status without
+ * Data-In, but we don't do that currently).
*/
- if ((cmd->state != QLA_TGT_STATE_NEED_DATA) &&
- (!cmd->aborted)) {
- cmd->trc_flags |= TRC_CTIO_ERR;
- if (qlt_term_ctio_exchange(qpair, ctio, cmd, status))
- return;
- }
+ if (!cmd->sent_term_exchg &&
+ (term_exchg || cmd->state != QLA_TGT_STATE_NEED_DATA))
+ qlt_send_term_exchange(qpair, cmd, &cmd->atio, 1);
}
if (cmd->state == QLA_TGT_STATE_PROCESSED) {
@@ -4167,7 +4145,6 @@ static void __qlt_do_work(struct qla_tgt_cmd *cmd)
goto out_term;
}
- spin_lock_init(&cmd->cmd_lock);
cdb = &atio->u.isp24.fcp_cmnd.cdb[0];
cmd->se_cmd.tag = le32_to_cpu(atio->u.isp24.exchange_addr);
@@ -4204,7 +4181,7 @@ static void __qlt_do_work(struct qla_tgt_cmd *cmd)
*/
cmd->trc_flags |= TRC_DO_WORK_ERR;
spin_lock_irqsave(qpair->qp_lock_ptr, flags);
- qlt_send_term_exchange(qpair, NULL, &cmd->atio, 1, 0);
+ qlt_send_term_exchange(qpair, NULL, &cmd->atio, 1);
qlt_decr_num_pend_cmds(vha);
cmd->vha->hw->tgt.tgt_ops->rel_cmd(cmd);
@@ -5363,6 +5340,7 @@ static void qlt_handle_imm_notify(struct scsi_qla_host *vha,
if (qlt_24xx_handle_els(vha, iocb) == 0)
send_notify_ack = 0;
break;
+
default:
ql_dbg(ql_dbg_tgt_mgt, vha, 0xf06d,
"qla_target(%d): Received unknown immediate "
@@ -5397,7 +5375,7 @@ static int __qlt_send_busy(struct qla_qpair *qpair,
sess = qla2x00_find_fcport_by_nportid(vha, &id, 1);
spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
if (!sess) {
- qlt_send_term_exchange(qpair, NULL, atio, 1, 0);
+ qlt_send_term_exchange(qpair, NULL, atio, 1);
return 0;
}
/* Sending marker isn't necessary, since we called from ISR */
@@ -5626,7 +5604,7 @@ static void qlt_24xx_atio_pkt(struct scsi_qla_host *vha,
ql_dbg(ql_dbg_tgt, vha, 0xe05f,
"qla_target: Unable to send command to target, sending TERM EXCHANGE for rsp\n");
qlt_send_term_exchange(ha->base_qpair, NULL,
- atio, 1, 0);
+ atio, 1);
break;
case -EBUSY:
ql_dbg(ql_dbg_tgt, vha, 0xe060,
@@ -5833,7 +5811,7 @@ static void qlt_response_pkt(struct scsi_qla_host *vha,
ql_dbg(ql_dbg_tgt, vha, 0xe05f,
"qla_target: Unable to send command to target, sending TERM EXCHANGE for rsp\n");
qlt_send_term_exchange(rsp->qpair, NULL,
- atio, 1, 0);
+ atio, 1);
break;
case -EBUSY:
ql_dbg(ql_dbg_tgt, vha, 0xe060,
@@ -6723,7 +6701,7 @@ qlt_24xx_process_atio_queue(struct scsi_qla_host *vha, uint8_t ha_locked)
adjust_corrupted_atio(pkt);
qlt_send_term_exchange(ha->base_qpair, NULL, pkt,
- ha_locked, 0);
+ ha_locked);
} else {
qlt_24xx_atio_pkt_all_vps(vha,
(struct atio_from_isp *)pkt, ha_locked);
diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h
index 15a59c125c53..adbc2a05b159 100644
--- a/drivers/scsi/qla2xxx/qla_target.h
+++ b/drivers/scsi/qla2xxx/qla_target.h
@@ -754,6 +754,7 @@ int qla2x00_wait_for_hba_online(struct scsi_qla_host *);
#define QLA_TGT_STATE_NEED_DATA 1 /* target needs data to continue */
#define QLA_TGT_STATE_DATA_IN 2 /* Data arrived + target processing */
#define QLA_TGT_STATE_PROCESSED 3 /* target done processing */
+#define QLA_TGT_STATE_DONE 4 /* cmd being freed */
/* ATIO task_codes field */
#define ATIO_SIMPLE_QUEUE 0
@@ -858,6 +859,7 @@ enum trace_flags {
TRC_DATA_IN = BIT_18,
TRC_ABORT = BIT_19,
TRC_DIF_ERR = BIT_20,
+ TRC_CTIO_IGNORED = BIT_21,
};
struct qla_tgt_cmd {
@@ -876,8 +878,6 @@ struct qla_tgt_cmd {
/* Sense buffer that will be mapped into outgoing status */
unsigned char sense_buffer[TRANSPORT_SENSE_BUFFER];
- spinlock_t cmd_lock;
- /* to save extra sess dereferences */
unsigned int conf_compl_supported:1;
unsigned int sg_mapped:1;
unsigned int write_data_transferred:1;
@@ -887,13 +887,14 @@ struct qla_tgt_cmd {
unsigned int cmd_in_wq:1;
unsigned int edif:1;
+ /* Set if the exchange has been terminated. */
+ unsigned int sent_term_exchg:1;
+
/*
- * This variable may be set from outside the LIO and I/O completion
- * callback functions. Do not declare this member variable as a
- * bitfield to avoid a read-modify-write operation when this variable
- * is set.
+ * Set if sent_term_exchg is set, or if the cmd was aborted by a TMR,
+ * or if some other error prevents normal processing of the command.
*/
- unsigned int aborted;
+ unsigned int aborted:1;
struct scatterlist *sg; /* cmd data buffer SG vector */
int sg_cnt; /* SG segments count */
@@ -932,6 +933,7 @@ struct qla_tgt_cmd {
#define DIF_BUNDL_DMA_VALID 1
uint16_t prot_flags;
+ unsigned long jiffies_at_term_exchg;
uint64_t jiffies_at_alloc;
uint64_t jiffies_at_free;
@@ -1055,6 +1057,9 @@ extern void qlt_response_pkt_all_vps(struct scsi_qla_host *, struct rsp_que *,
extern int qlt_rdy_to_xfer(struct qla_tgt_cmd *);
extern int qlt_xmit_response(struct qla_tgt_cmd *, int, uint8_t);
extern int qlt_abort_cmd(struct qla_tgt_cmd *);
+void qlt_send_term_exchange(struct qla_qpair *qpair,
+ struct qla_tgt_cmd *cmd, struct atio_from_isp *atio, int ha_locked);
+void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd);
extern void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *);
extern void qlt_free_mcmd(struct qla_tgt_mgmt_cmd *);
extern void qlt_free_cmd(struct qla_tgt_cmd *cmd);
diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
index ceaf1c7b1d17..169219fe47a2 100644
--- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
+++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
@@ -303,6 +303,8 @@ static void tcm_qla2xxx_rel_cmd(struct qla_tgt_cmd *cmd)
*/
static void tcm_qla2xxx_free_cmd(struct qla_tgt_cmd *cmd)
{
+ cmd->state = QLA_TGT_STATE_DONE;
+
cmd->qpair->tgt_counters.core_qla_free_cmd++;
cmd->cmd_in_wq = 1;
--
2.43.0
On Mon, Sep 08, 2025 at 02:58:06PM -0400, Tony Battersby wrote:
>
> (target mode)
>
> There is a race between the following:
>
> CPU 1:
> scst_hw_pending_work_fn() ->
> sqa_on_hw_pending_cmd_timeout() ->
> qlt_abort_cmd() ->
> qlt_unmap_sg()
>
> CPU 2:
> qla_do_work() ->
> qla24xx_process_response_queue() ->
> qlt_do_ctio_completion() ->
> qlt_unmap_sg()
>
> Two CPUs calling qlt_unmap_sg() on the same cmd at the same time
> results in an oops:
>
> dma_unmap_sg_attrs()
> BUG_ON(!valid_dma_direction(dir));
>
> This race is more likely to happen because qlt_abort_cmd() may cause the
> hardware to send a CTIO.
>
> The solution is to lock cmd->qpair->qp_lock_ptr when aborting a command.
> This makes it possible to check the cmd state and make decisions about
> what to do without racing with the CTIO handler and other code.
>
> - Lock cmd->qpair->qp_lock_ptr when aborting a cmd.
> - Eliminate cmd->cmd_lock and change cmd->aborted back to a bitfield
> since it is now protected by qp_lock_ptr just like all the other
> flags.
> - Add another command state QLA_TGT_STATE_DONE to avoid any possible
> races between qlt_abort_cmd() and tgt_ops->free_cmd().
> - Add the cmd->sent_term_exchg flag to indicate if
> qlt_send_term_exchange() has already been called.
> - For SCST (scst_hw_pending_work_fn()), export qlt_send_term_exchange()
> and qlt_unmap_sg() so that they can be called directly instead of
> trying to make qlt_abort_cmd() work for both HW timeout and TMR abort.
> - Add TRC_CTIO_IGNORED for scst_hw_pending_work_fn().
>
> Fixes: 26f9ce53817a ("scsi: qla2xxx: Fix missed DMA unmap for aborted commands")
You are trying to fix that commit using its approach, but actually that
approach is the root cause itself. It is not ok to unmap dma while that
memory is owned by HW.
We use this patch 4 years already instead of 26f9ce53817a and never
faced with such crashes.
From: Dmitry Bogdanov <d.bogdanov@yadro.com>
Date: Wed, 20 Oct 2021 15:57:31 +0300
Subject: [PATCH] scsi: qla2xxx: clear cmds after chip reset
Commands sent to FW, after chip reset got stuck and never freed as FW is
not going to response to them anymore.
This patch partially reverts aefed3e5548f at __qla2x00_abort_all_cmds.
Fixes: aefed3e5548f ("scsi: qla2xxx: target: Fix offline port handling and host reset handling")
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
drivers/scsi/qla2xxx/qla_os.c | 20 ++++++++++++++++++--
drivers/scsi/qla2xxx/qla_target.c | 2 +-
drivers/scsi/qla2xxx/qla_target.h | 1 +
3 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 407047e8b42b..04b0d3eb97e7 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1809,10 +1809,26 @@ __qla2x00_abort_all_cmds(struct qla_qpair *qp, int res)
continue;
}
cmd = (struct qla_tgt_cmd *)sp;
- cmd->aborted = 1;
+
+ if (cmd->sg_mapped)
+ qlt_unmap_sg(vha, cmd);
+
+ if (cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) {
+ cmd->aborted = 1;
+ cmd->write_data_transferred = 0;
+ cmd->state = QLA_TGT_STATE_DATA_IN;
+ ha->tgt.tgt_ops->handle_data(cmd);
+ } else {
+ ha->tgt.tgt_ops->free_cmd(cmd);
+ }
break;
case TYPE_TGT_TMCMD:
- /* Skip task management functions. */
+ /*
+ * Currently, only ABTS response gets on the
+ * outstanding_cmds[]
+ */
+ ha->tgt.tgt_ops->free_mcmd(
+ (struct qla_tgt_mgmt_cmd *) sp);
break;
default:
break;
diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c
index 8dfd13af48ea..c3b1a1426253 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -2463,7 +2463,7 @@ static int qlt_pci_map_calc_cnt(struct qla_tgt_prm *prm)
return -1;
}
-static void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd)
+void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd)
{
struct qla_hw_data *ha;
struct qla_qpair *qpair;
diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h
index 10e5e6c8087d..76e80208d731 100644
--- a/drivers/scsi/qla2xxx/qla_target.h
+++ b/drivers/scsi/qla2xxx/qla_target.h
@@ -1050,6 +1050,7 @@ extern int qlt_abort_cmd(struct qla_tgt_cmd *);
extern void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *);
extern void qlt_free_mcmd(struct qla_tgt_mgmt_cmd *);
extern void qlt_free_cmd(struct qla_tgt_cmd *cmd);
+extern void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd);
extern void qlt_async_event(uint16_t, struct scsi_qla_host *, uint16_t *);
extern void qlt_enable_vha(struct scsi_qla_host *);
extern void qlt_vport_create(struct scsi_qla_host *, struct qla_hw_data *);
--
2.25.1
This patch applies to the out-of-tree SCST project, not to the Linux
kernel. Apply when importing the upstream patch with the same title.
SCST addendum:
- In sqa_on_hw_pending_cmd_timeout(), call qlt_send_term_exchange()
first and then restart the timeout. After another timeout, force the
cmd to finish without waiting for a response from the HW.
Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
---
qla2x00t-32gbit/qla2x00-target/scst_qla2xxx.c | 172 +++++++++++++-----
1 file changed, 122 insertions(+), 50 deletions(-)
diff --git a/qla2x00t-32gbit/qla2x00-target/scst_qla2xxx.c b/qla2x00t-32gbit/qla2x00-target/scst_qla2xxx.c
index e885b9711..9371dad06 100644
--- a/qla2x00t-32gbit/qla2x00-target/scst_qla2xxx.c
+++ b/qla2x00t-32gbit/qla2x00-target/scst_qla2xxx.c
@@ -187,6 +187,7 @@ static struct cmd_state_name {
{QLA_TGT_STATE_NEED_DATA, "NeedData"},
{QLA_TGT_STATE_DATA_IN, "DataIn"},
{QLA_TGT_STATE_PROCESSED, "Processed"},
+ {QLA_TGT_STATE_DONE, "Done"},
};
static char *cmdstate_to_str(uint8_t state)
@@ -497,23 +498,14 @@ static void sqa_qla2xxx_handle_data(struct qla_tgt_cmd *cmd)
{
struct scst_cmd *scst_cmd = cmd->scst_cmd;
int rx_status;
- unsigned long flags;
TRACE_ENTRY();
- spin_lock_irqsave(&cmd->cmd_lock, flags);
- if (cmd->aborted) {
- spin_unlock_irqrestore(&cmd->cmd_lock, flags);
-
+ if (unlikely(cmd->aborted)) {
scst_set_cmd_error(scst_cmd,
SCST_LOAD_SENSE(scst_sense_internal_failure));
- scst_rx_data(scst_cmd, SCST_RX_STATUS_ERROR_SENSE_SET,
- SCST_CONTEXT_THREAD);
- return;
- }
- spin_unlock_irqrestore(&cmd->cmd_lock, flags);
-
- if (cmd->write_data_transferred) {
+ rx_status = SCST_RX_STATUS_ERROR_SENSE_SET;
+ } else if (likely(cmd->write_data_transferred)) {
rx_status = SCST_RX_STATUS_SUCCESS;
} else {
rx_status = SCST_RX_STATUS_ERROR_SENSE_SET;
@@ -691,6 +683,7 @@ static void sqa_qla2xxx_free_cmd(struct qla_tgt_cmd *cmd)
TRACE_ENTRY();
+ cmd->state = QLA_TGT_STATE_DONE;
cmd->trc_flags |= TRC_CMD_DONE;
scst_tgt_cmd_done(scst_cmd, scst_work_context);
@@ -1522,9 +1515,10 @@ static int sqa_xmit_response(struct scst_cmd *scst_cmd)
cmd = scst_cmd_get_tgt_priv(scst_cmd);
if (scst_cmd_aborted_on_xmit(scst_cmd)) {
- TRACE_MGMT_DBG("sqatgt(%ld/%d): CMD_ABORTED cmd[%p]",
- cmd->vha->host_no, cmd->vha->vp_idx,
- cmd);
+ TRACE_MGMT_DBG(
+ "sqatgt(%ld/%d): tag %lld: skipping send response for aborted cmd",
+ cmd->vha->host_no, cmd->vha->vp_idx,
+ scst_cmd_get_tag(scst_cmd));
qlt_abort_cmd(cmd);
scst_set_delivery_status(scst_cmd, SCST_CMD_DELIVERY_ABORTED);
scst_tgt_cmd_done(scst_cmd, SCST_CONTEXT_DIRECT);
@@ -1841,18 +1835,26 @@ static int sqa_qla2xxx_dif_tags(struct qla_tgt_cmd *cmd,
return t32;
}
+/*
+ * Prevent the CTIO completion handler from finding the given cmd, which will
+ * cause the CTIO to be ignored.
+ */
static void sqa_cleanup_hw_pending_cmd(scsi_qla_host_t *vha,
struct qla_tgt_cmd *cmd)
{
+ struct req_que *req = cmd->qpair->req;
uint32_t h;
- struct qla_qpair *qpair = cmd->qpair;
- for (h = 1; h < qpair->req->num_outstanding_cmds; h++) {
- if (qpair->req->outstanding_cmds[h] == (srb_t *)cmd) {
- printk(KERN_INFO "Clearing handle %d for cmd %p", h,
- cmd);
- //TRACE_DBG("Clearing handle %d for cmd %p", h, cmd);
- qpair->req->outstanding_cmds[h] = NULL;
+ cmd->cmd_sent_to_fw = 0;
+ cmd->trc_flags |= TRC_CTIO_IGNORED;
+
+ for (h = 1; h < req->num_outstanding_cmds; h++) {
+ if (req->outstanding_cmds[h] == (srb_t *)cmd) {
+ PRINT_INFO(
+ "sqatgt(%ld/%d): tag %lld: handle %x: detaching cmd from handle; CTIO will be ignored",
+ vha->host_no, vha->vp_idx,
+ se_cmd_tag(&cmd->se_cmd), h);
+ req->outstanding_cmds[h] = NULL;
break;
}
}
@@ -1863,50 +1865,120 @@ static void sqa_on_hw_pending_cmd_timeout(struct scst_cmd *scst_cmd)
struct qla_tgt_cmd *cmd = scst_cmd_get_tgt_priv(scst_cmd);
struct scsi_qla_host *vha = cmd->vha;
struct qla_qpair *qpair = cmd->qpair;
- uint8_t aborted = cmd->aborted;
unsigned long flags;
+ bool advance_cmd = false;
TRACE_ENTRY();
- TRACE_MGMT_DBG("sqatgt(%ld/%d): Cmd %p HW pending for too long (state %s) %s; %s;",
- vha->host_no, vha->vp_idx, cmd,
- cmdstate_to_str((uint8_t)cmd->state),
- cmd->cmd_sent_to_fw ? "sent to fw" : "not sent to fw",
- aborted ? "aborted" : "not aborted");
-
- qlt_abort_cmd(cmd);
+ scst_cmd_get(scst_cmd);
spin_lock_irqsave(qpair->qp_lock_ptr, flags);
+
+ TRACE_MGMT_DBG(
+ "sqatgt(%ld/%d): tag %lld: HW pending for too long (state %s) %s; %s",
+ vha->host_no, vha->vp_idx, scst_cmd_get_tag(scst_cmd),
+ cmdstate_to_str((uint8_t)cmd->state),
+ cmd->cmd_sent_to_fw ? "sent to fw" : "not sent to fw",
+ cmd->aborted ? "aborted" : "not aborted");
+
switch (cmd->state) {
case QLA_TGT_STATE_NEW:
case QLA_TGT_STATE_DATA_IN:
- PRINT_ERROR("sqa(%ld): A command in state (%s) should not be HW pending. %s",
- vha->host_no, cmdstate_to_str((uint8_t)cmd->state),
- aborted ? "aborted" : "not aborted");
- break;
+ case QLA_TGT_STATE_DONE:
+ PRINT_ERROR(
+ "sqatgt(%ld/%d): tag %lld: A command in state (%s) should not be HW pending. %s",
+ vha->host_no, vha->vp_idx, scst_cmd_get_tag(scst_cmd),
+ cmdstate_to_str((uint8_t)cmd->state),
+ cmd->aborted ? "aborted" : "not aborted");
+ goto out_unlock;
case QLA_TGT_STATE_NEED_DATA:
- /* the abort will nudge it out of FW */
- TRACE_MGMT_DBG("Force rx_data cmd %p", cmd);
- sqa_cleanup_hw_pending_cmd(vha, cmd);
- scst_set_cmd_error(scst_cmd,
- SCST_LOAD_SENSE(scst_sense_internal_failure));
- scst_rx_data(scst_cmd, SCST_RX_STATUS_ERROR_SENSE_SET,
- SCST_CONTEXT_THREAD);
- break;
case QLA_TGT_STATE_PROCESSED:
- if (!cmd->cmd_sent_to_fw)
- PRINT_ERROR("sqa(%ld): command should not be in HW pending. It's already processed. ",
- vha->host_no);
- else
- TRACE_MGMT_DBG("Force finishing cmd %p", cmd);
- sqa_cleanup_hw_pending_cmd(vha, cmd);
- scst_set_delivery_status(scst_cmd, SCST_CMD_DELIVERY_FAILED);
- scst_tgt_cmd_done(scst_cmd, SCST_CONTEXT_THREAD);
break;
}
+
+ /* Handle race with normal CTIO completion. */
+ if (!cmd->cmd_sent_to_fw) {
+ TRACE_MGMT_DBG(
+ "sqatgt(%ld/%d): tag %lld: cmd not sent to fw; assuming just completed",
+ vha->host_no, vha->vp_idx,
+ scst_cmd_get_tag(scst_cmd));
+ goto out_unlock;
+ }
+
+ /* Check for chip reset or a timeout after sending a term exchange. */
+ if (!qpair->fw_started ||
+ cmd->reset_count != qpair->chip_reset ||
+ (cmd->sent_term_exchg &&
+ time_is_before_jiffies(cmd->jiffies_at_term_exchg +
+ SQA_MAX_HW_PENDING_TIME * HZ / 2))) {
+ /*
+ * Timeout waiting for a response from the hw, so force the cmd
+ * to finish without waiting any longer. Note that this is
+ * slightly dangerous if the hw is still using the DMA
+ * mapping, so try term exchange first and see if that works.
+ */
+
+ sqa_cleanup_hw_pending_cmd(vha, cmd);
+
+ qlt_unmap_sg(vha, cmd);
+
+ advance_cmd = true;
+ } else {
+ /*
+ * We still expect a CTIO response from the hw. Terminating the
+ * exchange should force the CTIO response to happen sooner.
+ */
+ if (!cmd->sent_term_exchg)
+ qlt_send_term_exchange(qpair, cmd, &cmd->atio, 1);
+ }
+
+ if (advance_cmd) {
+ switch (cmd->state) {
+ case QLA_TGT_STATE_NEED_DATA:
+ TRACE_MGMT_DBG(
+ "sqatgt(%ld/%d): tag %lld: force rx_data",
+ vha->host_no, vha->vp_idx,
+ scst_cmd_get_tag(scst_cmd));
+ cmd->state = QLA_TGT_STATE_DATA_IN;
+ scst_set_cmd_error(scst_cmd,
+ SCST_LOAD_SENSE(scst_sense_internal_failure));
+ scst_rx_data(scst_cmd, SCST_RX_STATUS_ERROR_SENSE_SET,
+ SCST_CONTEXT_THREAD);
+ break;
+
+ case QLA_TGT_STATE_PROCESSED:
+ TRACE_MGMT_DBG(
+ "sqatgt(%ld/%d): tag %lld: force finishing cmd",
+ vha->host_no, vha->vp_idx,
+ scst_cmd_get_tag(scst_cmd));
+ scst_set_delivery_status(scst_cmd, SCST_CMD_DELIVERY_FAILED);
+ scst_tgt_cmd_done(scst_cmd, SCST_CONTEXT_THREAD);
+ break;
+
+ default:
+ break;
+ }
+ } else {
+ /*
+ * Restart the timer so that this function is called again
+ * after another timeout. This is similar to
+ * scst_update_hw_pending_start() except that we also set
+ * cmd_hw_pending to 1.
+ *
+ * IRQs are already OFF.
+ */
+ spin_lock(&scst_cmd->sess->sess_list_lock);
+ scst_cmd->cmd_hw_pending = 1;
+ scst_cmd->hw_pending_start = jiffies;
+ spin_unlock(&scst_cmd->sess->sess_list_lock);
+ }
+
+out_unlock:
spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
+ scst_cmd_put(scst_cmd);
+
TRACE_EXIT();
}
--
2.43.0
© 2016 - 2026 Red Hat, Inc.