[ofa-general] [PATCH] RDMA/nes: application hang during large cluster test

Faisal Latif faisal.latif at intel.com
Mon Apr 13 09:09:47 PDT 2009


When running on a large cluster setup, tests sometimes hang during a
long testing cycle.

Fixing this required the following changes in the nes_cm.[ch] code.

* Under heavy load, it sometimes takes longer to receive the response from
the application to the MPA request. The rexmit timeout value is too low.

* In handle_fin_pkt(), we are calling cleanup_retrans_entry() for all
conditions, even if the packet needs to be dropped.

* check_seq() does not handle the case where the seq# has wrapped.

* handle_ack_pkt() needs to return an error value so that, in case of
error, handle_fin_pkt() is not called.

* In handle_rst_pkt(), handling of the cm_node's NES_CM_STATE_LAST_ACK
state is missing.

* In process_packet(), when a FIN-only packet is received, call
check_seq() before processing it.

* nes_connect() must not set the apbvt bit for a loopback connection.
The apbvt bit is only set for non-loopback connections because, for
loopback, all of the connection setup is done by the driver.

Signed-off-by: Faisal Latif <faisal.latif at intel.com>
---
 drivers/infiniband/hw/nes/nes_cm.c |   74 ++++++++++++++++++------------------
 drivers/infiniband/hw/nes/nes_cm.h |    1 +
 2 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index dbd9a75..61da9d3 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -56,6 +56,7 @@
 #include <net/neighbour.h>
 #include <net/route.h>
 #include <net/ip_fib.h>
+#include <net/tcp.h>
 
 #include "nes.h"
 
@@ -540,6 +541,7 @@ static void nes_cm_timer_tick(unsigned long pass)
 	struct list_head *list_node;
 	struct nes_cm_core *cm_core = g_cm_core;
 	u32 settimer = 0;
+	unsigned long timetosend;
 	int ret = NETDEV_TX_OK;
 
 	struct list_head timer_list;
@@ -644,8 +646,10 @@ static void nes_cm_timer_tick(unsigned long pass)
 				send_entry->retrycount);
 			if (send_entry->send_retrans) {
 				send_entry->retranscount--;
+				timetosend = (NES_RETRY_TIMEOUT <<
+					(NES_DEFAULT_RETRANS - send_entry->retranscount));
 				send_entry->timetosend = jiffies +
-					NES_RETRY_TIMEOUT;
+					min(timetosend, NES_MAX_TIMEOUT);
 				if (nexttimeout > send_entry->timetosend ||
 					!settimer) {
 					nexttimeout = send_entry->timetosend;
@@ -1325,18 +1329,20 @@ static void handle_fin_pkt(struct nes_cm_node *cm_node)
 	nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. "
 		"refcnt=%d\n", cm_node, cm_node->state,
 		atomic_read(&cm_node->ref_count));
-	cm_node->tcp_cntxt.rcv_nxt++;
-	cleanup_retrans_entry(cm_node);
 	switch (cm_node->state) {
 	case NES_CM_STATE_SYN_RCVD:
 	case NES_CM_STATE_SYN_SENT:
 	case NES_CM_STATE_ESTABLISHED:
 	case NES_CM_STATE_MPAREQ_SENT:
 	case NES_CM_STATE_MPAREJ_RCVD:
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
 		cm_node->state = NES_CM_STATE_LAST_ACK;
 		send_fin(cm_node, NULL);
 		break;
 	case NES_CM_STATE_FIN_WAIT1:
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
 		cm_node->state = NES_CM_STATE_CLOSING;
 		send_ack(cm_node, NULL);
 		/* Wait for ACK as this is simultanous close..
@@ -1344,11 +1350,15 @@ static void handle_fin_pkt(struct nes_cm_node *cm_node)
 		* Just rm the node.. Done.. */
 		break;
 	case NES_CM_STATE_FIN_WAIT2:
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
 		cm_node->state = NES_CM_STATE_TIME_WAIT;
 		send_ack(cm_node, NULL);
 		schedule_nes_timer(cm_node, NULL,  NES_TIMER_TYPE_CLOSE, 1, 0);
 		break;
 	case NES_CM_STATE_TIME_WAIT:
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
 		cm_node->state = NES_CM_STATE_CLOSED;
 		rem_ref_cm_node(cm_node->cm_core, cm_node);
 		break;
@@ -1384,7 +1394,6 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
 		passive_state = atomic_add_return(1, &cm_node->passive_state);
 		if (passive_state ==  NES_SEND_RESET_EVENT)
 			create_event(cm_node, NES_CM_EVENT_RESET);
-		cleanup_retrans_entry(cm_node);
 		cm_node->state = NES_CM_STATE_CLOSED;
 		dev_kfree_skb_any(skb);
 		break;
@@ -1398,17 +1407,16 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
 		active_open_err(cm_node, skb, reset);
 		break;
 	case NES_CM_STATE_CLOSED:
-		cleanup_retrans_entry(cm_node);
 		drop_packet(skb);
 		break;
+	case NES_CM_STATE_LAST_ACK:
+		cm_node->cm_id->rem_ref(cm_node->cm_id);
 	case NES_CM_STATE_TIME_WAIT:
-		cleanup_retrans_entry(cm_node);
 		cm_node->state = NES_CM_STATE_CLOSED;
 		rem_ref_cm_node(cm_node->cm_core, cm_node);
 		drop_packet(skb);
 		break;
 	case NES_CM_STATE_FIN_WAIT1:
-		cleanup_retrans_entry(cm_node);
 		nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__);
 	default:
 		drop_packet(skb);
@@ -1455,6 +1463,7 @@ static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb)
 				NES_PASSIVE_STATE_INDICATED);
 		break;
 	case NES_CM_STATE_MPAREQ_SENT:
+		cleanup_retrans_entry(cm_node);
 		if (res_type == NES_MPA_REQUEST_REJECT) {
 			type = NES_CM_EVENT_MPA_REJECT;
 			cm_node->state = NES_CM_STATE_MPAREJ_RCVD;
@@ -1518,7 +1527,7 @@ static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph,
 	rcv_wnd = cm_node->tcp_cntxt.rcv_wnd;
 	if (ack_seq != loc_seq_num)
 		err = 1;
-	else if ((seq + rcv_wnd) < rcv_nxt)
+	else if (!between(seq, rcv_nxt, (rcv_nxt+rcv_wnd)))
 		err = 1;
 	if (err) {
 		nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
@@ -1652,49 +1661,39 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
 	}
 }
 
-static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
 	struct tcphdr *tcph)
 {
 	int datasize = 0;
 	u32 inc_sequence;
 	u32 rem_seq_ack;
 	u32 rem_seq;
-	int ret;
+	int ret = 0;
 	int optionsize;
 	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
 
 	if (check_seq(cm_node, tcph, skb))
-		return;
+		return -EINVAL;
 
 	skb_pull(skb, tcph->doff << 2);
 	inc_sequence = ntohl(tcph->seq);
 	rem_seq = ntohl(tcph->seq);
 	rem_seq_ack =  ntohl(tcph->ack_seq);
 	datasize = skb->len;
-	cleanup_retrans_entry(cm_node);
 	switch (cm_node->state) {
 	case NES_CM_STATE_SYN_RCVD:
 		/* Passive OPEN */
+		cleanup_retrans_entry(cm_node);
 		ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1);
 		if (ret)
 			break;
 		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-		if (cm_node->tcp_cntxt.rem_ack_num !=
-		    cm_node->tcp_cntxt.loc_seq_num) {
-			nes_debug(NES_DBG_CM, "rem_ack_num != loc_seq_num\n");
-			cleanup_retrans_entry(cm_node);
-			send_reset(cm_node, skb);
-			return;
-		}
 		cm_node->state = NES_CM_STATE_ESTABLISHED;
-		cleanup_retrans_entry(cm_node);
 		if (datasize) {
 			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
 			handle_rcv_mpa(cm_node, skb);
-		} else { /* rcvd ACK only */
+		} else  /* rcvd ACK only */
 			dev_kfree_skb_any(skb);
-			cleanup_retrans_entry(cm_node);
-		 }
 		break;
 	case NES_CM_STATE_ESTABLISHED:
 		/* Passive OPEN */
@@ -1706,15 +1705,12 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
 			drop_packet(skb);
 		break;
 	case NES_CM_STATE_MPAREQ_SENT:
-		cleanup_retrans_entry(cm_node);
 		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
 		if (datasize) {
 			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
 			handle_rcv_mpa(cm_node, skb);
-		} else { /* Could be just an ack pkt.. */
-			cleanup_retrans_entry(cm_node);
+		} else  /* Could be just an ack pkt.. */
 			dev_kfree_skb_any(skb);
-		}
 		break;
 	case NES_CM_STATE_LISTENING:
 	case NES_CM_STATE_CLOSED:
@@ -1722,11 +1718,10 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
 		send_reset(cm_node, skb);
 		break;
 	case NES_CM_STATE_LAST_ACK:
+	case NES_CM_STATE_CLOSING:
 		cleanup_retrans_entry(cm_node);
 		cm_node->state = NES_CM_STATE_CLOSED;
 		cm_node->cm_id->rem_ref(cm_node->cm_id);
-	case NES_CM_STATE_CLOSING:
-		cleanup_retrans_entry(cm_node);
 		rem_ref_cm_node(cm_node->cm_core, cm_node);
 		drop_packet(skb);
 		break;
@@ -1741,9 +1736,11 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
 	case NES_CM_STATE_MPAREQ_RCVD:
 	case NES_CM_STATE_UNKNOWN:
 	default:
+		cleanup_retrans_entry(cm_node);
 		drop_packet(skb);
 		break;
 	}
+	return ret;
 }
 
 
@@ -1849,6 +1846,7 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
 	enum nes_tcpip_pkt_type	pkt_type = NES_PKT_TYPE_UNKNOWN;
 	struct tcphdr *tcph = tcp_hdr(skb);
 	u32     fin_set = 0;
+	int ret = 0;
 	skb_pull(skb, ip_hdr(skb)->ihl << 2);
 
 	nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d "
@@ -1874,17 +1872,17 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
 		handle_synack_pkt(cm_node, skb, tcph);
 		break;
 	case NES_PKT_TYPE_ACK:
-		handle_ack_pkt(cm_node, skb, tcph);
-		if (fin_set)
+		ret = handle_ack_pkt(cm_node, skb, tcph);
+		if (fin_set && !ret)
 			handle_fin_pkt(cm_node);
 		break;
 	case NES_PKT_TYPE_RST:
 		handle_rst_pkt(cm_node, skb, tcph);
 		break;
 	default:
-		drop_packet(skb);
-		if (fin_set)
+		if ((fin_set) && (!check_seq(cm_node, tcph, skb)))
 			handle_fin_pkt(cm_node);
+		drop_packet(skb);
 		break;
 	}
 }
@@ -2959,6 +2957,7 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	struct nes_device *nesdev;
 	struct nes_cm_node *cm_node;
 	struct nes_cm_info cm_info;
+	int apbvt_set = 0;
 
 	ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
 	if (!ibqp)
@@ -2996,9 +2995,11 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 		conn_param->private_data_len);
 
 	if (cm_id->local_addr.sin_addr.s_addr !=
-		cm_id->remote_addr.sin_addr.s_addr)
+		cm_id->remote_addr.sin_addr.s_addr) {
 		nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port),
 			PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD);
+		apbvt_set = 1;
+	}
 
 	/* set up the connection params for the node */
 	cm_info.loc_addr = htonl(cm_id->local_addr.sin_addr.s_addr);
@@ -3015,8 +3016,7 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 		conn_param->private_data_len, (void *)conn_param->private_data,
 		&cm_info);
 	if (!cm_node) {
-		if (cm_id->local_addr.sin_addr.s_addr !=
-				cm_id->remote_addr.sin_addr.s_addr)
+		if (apbvt_set)
 			nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port),
 				PCI_FUNC(nesdev->pcidev->devfn),
 				NES_MANAGE_APBVT_DEL);
@@ -3025,7 +3025,7 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 		return -ENOMEM;
 	}
 
-	cm_node->apbvt_set = 1;
+	cm_node->apbvt_set = apbvt_set;
 	nesqp->cm_node = cm_node;
 	cm_node->nesqp = nesqp;
 	nes_add_ref(&nesqp->ibqp);
diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h
index 80bba18..8b7e7c0 100644
--- a/drivers/infiniband/hw/nes/nes_cm.h
+++ b/drivers/infiniband/hw/nes/nes_cm.h
@@ -149,6 +149,7 @@ struct nes_timer_entry {
 #endif
 #define NES_SHORT_TIME      (10)
 #define NES_LONG_TIME       (2000*HZ/1000)
+#define NES_MAX_TIMEOUT     ((unsigned long) (12*HZ))
 
 #define NES_CM_HASHTABLE_SIZE         1024
 #define NES_CM_TCP_TIMER_INTERVAL     3000
-- 
1.5.3.3




More information about the general mailing list