[ewg] [PATCH] RDMA/nes: multiple disconnect cause crash during AE handling
Faisal Latif
faisal.latif at intel.com
Fri Feb 12 12:05:23 PST 2010
There is a double CM disconnect during AE processing, causing a crash. While fixing
the crash, also simplify the AE handling code.
Signed-off-by: Faisal Latif <faisal.latif at intel.com>
---
kernel_patches/fixes/nes_0025_ae_handling.patch | 164 +++++++++++++++++++++++
1 files changed, 164 insertions(+), 0 deletions(-)
create mode 100644 kernel_patches/fixes/nes_0025_ae_handling.patch
diff --git a/kernel_patches/fixes/nes_0025_ae_handling.patch b/kernel_patches/fixes/nes_0025_ae_handling.patch
new file mode 100644
index 0000000..0e541ca
--- /dev/null
+++ b/kernel_patches/fixes/nes_0025_ae_handling.patch
@@ -0,0 +1,164 @@
+diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
+index b1c2cbb..310cc7c 100644
+--- a/drivers/infiniband/hw/nes/nes_hw.c
++++ b/drivers/infiniband/hw/nes/nes_hw.c
+@@ -3352,8 +3352,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+ u16 async_event_id;
+ u8 tcp_state;
+ u8 iwarp_state;
+- int must_disconn = 1;
+- int must_terminate = 0;
+ struct ib_event ibevent;
+
+ nes_debug(NES_DBG_AEQ, "\n");
+@@ -3367,6 +3365,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+ BUG_ON(!context);
+ }
+
++ /* context is nesqp unless async_event_id == CQ ERROR */
++ nesqp = (struct nes_qp *)(unsigned long)context;
+ async_event_id = (u16)aeq_info;
+ tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
+ iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
+@@ -3378,8 +3378,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+
+ switch (async_event_id) {
+ case NES_AEQE_AEID_LLP_FIN_RECEIVED:
+- nesqp = (struct nes_qp *)(unsigned long)context;
+-
+ if (nesqp->term_flags)
+ return; /* Ignore it, wait for close complete */
+
+@@ -3394,79 +3392,48 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+ async_event_id, nesqp->last_aeq, tcp_state);
+ }
+
+- if ((tcp_state != NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
+- (nesqp->ibqp_state != IB_QPS_RTS)) {
+- /* FIN Received but tcp state or IB state moved on,
+- should expect a close complete */
+- return;
+- }
+-
++ break;
+ case NES_AEQE_AEID_LLP_CLOSE_COMPLETE:
+- nesqp = (struct nes_qp *)(unsigned long)context;
+ if (nesqp->term_flags) {
+ nes_terminate_done(nesqp, 0);
+ return;
+ }
++ spin_lock_irqsave(&nesqp->lock, flags);
++ nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
++ spin_unlock_irqrestore(&nesqp->lock, flags);
++ nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_CLOSING, 0, 0);
++ nes_cm_disconn(nesqp);
++ break;
+
+- case NES_AEQE_AEID_LLP_CONNECTION_RESET:
+ case NES_AEQE_AEID_RESET_SENT:
+- nesqp = (struct nes_qp *)(unsigned long)context;
+- if (async_event_id == NES_AEQE_AEID_RESET_SENT) {
+- tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+- }
++ tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+ spin_lock_irqsave(&nesqp->lock, flags);
+ nesqp->hw_iwarp_state = iwarp_state;
+ nesqp->hw_tcp_state = tcp_state;
+ nesqp->last_aeq = async_event_id;
+-
+- if ((tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
+- (tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT)) {
+- nesqp->hte_added = 0;
+- next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
+- }
+-
+- if ((nesqp->ibqp_state == IB_QPS_RTS) &&
+- ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
+- (async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+- switch (nesqp->hw_iwarp_state) {
+- case NES_AEQE_IWARP_STATE_RTS:
+- next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+- nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+- break;
+- case NES_AEQE_IWARP_STATE_TERMINATE:
+- must_disconn = 0; /* terminate path takes care of disconn */
+- if (nesqp->term_flags == 0)
+- must_terminate = 1;
+- break;
+- }
+- } else {
+- if (async_event_id == NES_AEQE_AEID_LLP_FIN_RECEIVED) {
+- /* FIN Received but ib state not RTS,
+- close complete will be on its way */
+- must_disconn = 0;
+- }
+- }
++ nesqp->hte_added = 0;
+ spin_unlock_irqrestore(&nesqp->lock, flags);
++ next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
++ nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
++ nes_cm_disconn(nesqp);
++ break;
+
+- if (must_terminate)
+- nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
+- else if (must_disconn) {
+- if (next_iwarp_state) {
+- nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X\n",
+- nesqp->hwqp.qp_id, next_iwarp_state);
+- nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+- }
+- nes_cm_disconn(nesqp);
+- }
++ case NES_AEQE_AEID_LLP_CONNECTION_RESET:
++ if (atomic_read(&nesqp->close_timer_started))
++ return;
++ spin_lock_irqsave(&nesqp->lock, flags);
++ nesqp->hw_iwarp_state = iwarp_state;
++ nesqp->hw_tcp_state = tcp_state;
++ nesqp->last_aeq = async_event_id;
++ spin_unlock_irqrestore(&nesqp->lock, flags);
++ nes_cm_disconn(nesqp);
+ break;
+
+ case NES_AEQE_AEID_TERMINATE_SENT:
+- nesqp = (struct nes_qp *)(unsigned long)context;
+ nes_terminate_send_fin(nesdev, nesqp, aeqe);
+ break;
+
+ case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED:
+- nesqp = (struct nes_qp *)(unsigned long)context;
+ nes_terminate_received(nesdev, nesqp, aeqe);
+ break;
+
+@@ -3480,7 +3447,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+ case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
+ case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
+ case NES_AEQE_AEID_AMP_TO_WRAP:
+- nesqp = (struct nes_qp *)(unsigned long)context;
++ printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n",
++ nesqp->hwqp.qp_id, async_event_id);
+ nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR);
+ break;
+
+@@ -3488,7 +3456,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+ case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
+ case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
+ case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
+- nesqp = (struct nes_qp *)(unsigned long)context;
+ if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) {
+ aeq_info &= 0xffff0000;
+ aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE;
+@@ -3530,7 +3497,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+ case NES_AEQE_AEID_STAG_ZERO_INVALID:
+ case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST:
+ case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
+- nesqp = (struct nes_qp *)(unsigned long)context;
++ printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n",
++ nesqp->hwqp.qp_id, async_event_id);
+ nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
+ break;
+
+--
+1.6.0
+
--
1.6.0
More information about the ewg
mailing list