[ofa-general] SDP memory allocation policy problem?

Jim Mott jimmott at austin.rr.com
Wed Sep 26 17:47:39 PDT 2007


I have reworked your patch slightly and run my simple unit tests on it.  No correctness problems detected in latency or bandwidth
paths.  No performance regressions either.

If your proposed patch worked for you, then this one ought to work too.  Could you please give it a go and let me know?  

Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/sdp/sdp_bcopy.c
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/sdp/sdp_bcopy.c	2007-09-26 13:27:43.000000000 -0500
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/sdp/sdp_bcopy.c	2007-09-26 17:52:12.000000000 -0500
@@ -221,16 +221,26 @@ static void sdp_post_recv(struct sdp_soc
 	skb_frag_t *frag;
 	struct sdp_bsdh *h;
 	int id = ssk->rx_head;
+	unsigned int gfp_page;
 
 	/* Now, allocate and repost recv */
 	/* TODO: allocate from cache */
-	skb = sk_stream_alloc_skb(&ssk->isk.sk, SDP_HEAD_SIZE,
-				  GFP_KERNEL);
+
+	if (unlikely(ssk->isk.sk.sk_allocation)) {
+		skb = sk_stream_alloc_skb(&ssk->isk.sk, SDP_HEAD_SIZE,
+					  ssk->isk.sk.sk_allocation);
+		gfp_page = ssk->isk.sk.sk_allocation | __GFP_HIGHMEM;
+	} else {
+		skb = sk_stream_alloc_skb(&ssk->isk.sk, SDP_HEAD_SIZE,
+					  GFP_KERNEL);
+		gfp_page = GFP_HIGHUSER;
+	}
+
 	/* FIXME */
 	BUG_ON(!skb);
 	h = (struct sdp_bsdh *)skb->head;
 	for (i = 0; i < ssk->recv_frags; ++i) {
-		page = alloc_pages(GFP_HIGHUSER, 0);
+		page = alloc_pages(gfp_page, 0);
 		BUG_ON(!page);
 		frag = &skb_shinfo(skb)->frags[i];
 		frag->page                = page;
@@ -404,6 +414,7 @@ void sdp_post_sends(struct sdp_sock *ssk
 	/* TODO: nonagle? */
 	struct sk_buff *skb;
 	int c;
+	int gfp_page;
 
 	if (unlikely(!ssk->id)) {
 		if (ssk->isk.sk.sk_send_head) {
@@ -415,6 +426,11 @@ void sdp_post_sends(struct sdp_sock *ssk
 		return;
 	}
 
+	if (unlikely(ssk->isk.sk.sk_allocation))
+		gfp_page = ssk->isk.sk.sk_allocation;
+	else
+		gfp_page = GFP_KERNEL;
+
 	if (ssk->recv_request &&
 	    ssk->rx_tail >= ssk->recv_request_head &&
 	    ssk->bufs >= SDP_MIN_BUFS &&
@@ -424,7 +440,7 @@ void sdp_post_sends(struct sdp_sock *ssk
 		skb = sk_stream_alloc_skb(&ssk->isk.sk,
 					  sizeof(struct sdp_bsdh) +
 					  sizeof(*resp_size),
-					  GFP_KERNEL);
+					  gfp_page);
 		/* FIXME */
 		BUG_ON(!skb);
 		resp_size = (struct sdp_chrecvbuf *)skb_put(skb, sizeof *resp_size);
@@ -449,7 +465,7 @@ void sdp_post_sends(struct sdp_sock *ssk
 		skb = sk_stream_alloc_skb(&ssk->isk.sk,
 					  sizeof(struct sdp_bsdh) +
 					  sizeof(*req_size),
-					  GFP_KERNEL);
+					  gfp_page);
 		/* FIXME */
 		BUG_ON(!skb);
 		ssk->sent_request = SDP_MAX_SEND_SKB_FRAGS * PAGE_SIZE;
@@ -480,7 +496,7 @@ void sdp_post_sends(struct sdp_sock *ssk
 		ssk->bufs) {
 		skb = sk_stream_alloc_skb(&ssk->isk.sk,
 					  sizeof(struct sdp_bsdh),
-					  GFP_KERNEL);
+					  gfp_page);
 		/* FIXME */
 		BUG_ON(!skb);
 		sdp_post_send(ssk, skb, SDP_MID_DISCONN);

-----Original Message-----
From: general-bounces at lists.openfabrics.org [mailto:general-bounces at lists.openfabrics.org] On Behalf Of Nathan Dauchy
Sent: Tuesday, September 25, 2007 5:50 PM
To: general at lists.openfabrics.org
Subject: Re: [ofa-general] SDP memory allocation policy problem?

Is there anyone among the OFED development team that is looking into
this issue?  I believe that it is causing nodes to hang at our site.  We
are running ofed-1.2 and the 2.6.9-55.ELsmp kernel.

Workarounds or even untested patches would be appreciated.

Thanks!

-Nathan


Ken Phillips wrote:
> Greetings,
> 
> Teammates here report the following:
> 
> Problem
> 
> The method SDP uses to allocate socket buffers may cause the
> node to hang under memory pressure.
> 
> Details
> 
> Each kernel level socket has an allocation flag to specify the
> memory allocation policy for socket buffers, the default is GFP_ATOMIC
> (or GFP_KERNEL for SDP).  If the caller creates a socket with the
> policy set to GFP_NOFS or GFP_NOIO this should be the allocation
> policy used by the SDP layer.
> 
> The problem we are seeing is that if a node is under load, and
> a memory allocation fails (say in sock_sendmsg()), the kernel will
> use the allocation policy to decide how to proceed with the allocation.
> If GFP_KERNEL is specified, then the kernel may attempt to free pages
> through the iSCSI block device that is making the socket call, which
> would result in a deadlock.  Use of GFP_NOIO should prevent the kernel
> from using the IO backend to free memory resources.
> 
> here is a sample stack trace from Alt-Sysrq during one of these
> lockups,
> 
>> tx_worker     D ffffff0014d14000     0 10195      1         10196 10194
>> (L-TLB)
>> 00000100707e98d8 0000000000000046 0000000000000004 0000000000000212
>> 0000000000000212 ffffffffa018ccae 0000000000000246 0000000000000246
>> 000001007873c7f0 0000000000000320
>> Call Trace:<ffffffffa018ccae>{:ib_mthca:mthca_poll_cq+2258}
>> <ffffffff8030ad5c>{schedule_timeout+224}
>> <ffffffff802a9db7>{lock_sock+152}
>> <ffffffff80135756>{autoremove_wake_function+0}
>> <ffffffffa0538b13>{:ib_sdp:sdp_poll_cq+58}
>> <ffffffff80135756>{autoremove_wake_function+0}
>> <ffffffff802a9dfd>{release_sock+16}
>> <ffffffffa0534b18>{:ib_sdp:sdp_sendmsg+33}
>> <ffffffff802a730f>{sock_sendmsg+271}
>> <ffffffffa05386ad>{:ib_sdp:sdp_post_sends+619}
>> <ffffffff802a9dfd>{release_sock+16}
>> <ffffffffa05353a5>{:ib_sdp:sdp_sendmsg+2222}
>> <ffffffff80135756>{autoremove_wake_function+0}
>> <ffffffffa057708f>{:rs_iscsi:iscsi_sock_msg+1265}
>> <ffffffffa057708b>{:rs_iscsi:iscsi_sock_msg+1261}
>> <ffffffff80132159>{recalc_task_prio+337}
>> <ffffffffa055bfdb>{:rs_iscsi:scsi_command_i+5283}
>> <ffffffff8030a2c9>{thread_return+0}
>> <ffffffff8030a321>{thread_return+88}
>> <ffffffff8013fdf7>{del_timer+107}
>> <ffffffff8013feb4>{del_singleshot_timer_sync+9}
>> <ffffffff8030adf3>{schedule_timeout+375}
>> <ffffffffa056829e>{:rs_iscsi:tx_worker_proc_i+6819}
>> <ffffffff80110f47>{child_rip+8}
>> <ffffffffa05667fb>{:rs_iscsi:tx_worker_proc_i+0}
>> <ffffffff80110f3f>{child_rip+0}
>>
>>
> 
> We still don't know the scope of changes to be made, but we think,
> at minimum that some of the memory allocation in SDP should be changed,
> for example.
> 
> diff -Naur old/drivers/infiniband/ulp/sdp/sdp_bcopy.c
> new/drivers/infiniband/ulp/sdp/sdp_bcopy.c
> --- old/drivers/infiniband/ulp/sdp/sdp_bcopy.c    2007-06-21
> 10:38:47.000000000 -0400
> +++ new/drivers/infiniband/ulp/sdp/sdp_bcopy.c    2007-08-31
> 12:25:58.000000000 -0400
> @@ -224,13 +224,27 @@
> 
>      /* Now, allocate and repost recv */
>      /* TODO: allocate from cache */
> +
> +#if (PROPOSED_SDP_FIX == 1)
> +    skb = sk_stream_alloc_skb(&ssk->isk.sk, SDP_HEAD_SIZE,
> +        (ssk->isk.sk.sk_allocation == 0) ? (GFP_KERNEL) :
> +        ssk->isk.sk.sk_allocation);
> +#else
>      skb = sk_stream_alloc_skb(&ssk->isk.sk, SDP_HEAD_SIZE,
>                    GFP_KERNEL);
> +#endif
>      /* FIXME */
>      BUG_ON(!skb);
>      h = (struct sdp_bsdh *)skb->head;
>      for (i = 0; i < ssk->recv_frags; ++i) {
> +#if (PROPOSED_SDP_FIX == 1)
> +        page = alloc_pages((ssk->isk.sk.sk_allocation == 0)
> +            ? (GFP_HIGHUSER) :
> +            (ssk->isk.sk.sk_allocation | (__GFP_HIGHMEM)),
> +            0);
> +#else
>          page = alloc_pages(GFP_HIGHUSER, 0);
> +#endif
>          BUG_ON(!page);
>          frag = &skb_shinfo(skb)->frags[i];
>          frag->page                = page;
> @@ -406,10 +420,18 @@
>          ssk->tx_head - ssk->tx_tail < SDP_TX_SIZE) {
>          struct sdp_chrecvbuf *resp_size;
>          ssk->recv_request = 0;
> +#if (PROPOSED_SDP_FIX == 1)
> +        skb = sk_stream_alloc_skb(&ssk->isk.sk,
> +            sizeof(struct sdp_bsdh) +
> +            sizeof(*resp_size),
> +            (ssk->isk.sk.sk_allocation == 0) ? (GFP_KERNEL) :
> +            ssk->isk.sk.sk_allocation);
> +#else
>          skb = sk_stream_alloc_skb(&ssk->isk.sk,
>                        sizeof(struct sdp_bsdh) +
>                        sizeof(*resp_size),
>                        GFP_KERNEL);
> +#endif
>          /* FIXME */
>          BUG_ON(!skb);
>          resp_size = (struct sdp_chrecvbuf *)skb_put(skb, sizeof *resp_size);
> @@ -431,10 +453,18 @@
>          ssk->tx_head > ssk->sent_request_head + SDP_RESIZE_WAIT &&
>          ssk->tx_head - ssk->tx_tail < SDP_TX_SIZE) {
>          struct sdp_chrecvbuf *req_size;
> +#if (PROPOSED_SDP_FIX == 1)
> +        skb = sk_stream_alloc_skb(&ssk->isk.sk,
> +            sizeof(struct sdp_bsdh) +
> +            sizeof(*req_size),
> +            (ssk->isk.sk.sk_allocation == 0) ? (GFP_KERNEL) :
> +            ssk->isk.sk.sk_allocation);
> +#else
>          skb = sk_stream_alloc_skb(&ssk->isk.sk,
>                        sizeof(struct sdp_bsdh) +
>                        sizeof(*req_size),
>                        GFP_KERNEL);
> +#endif
>          /* FIXME */
>          BUG_ON(!skb);
>          ssk->sent_request = SDP_MAX_SEND_SKB_FRAGS * PAGE_SIZE;
> @@ -463,9 +493,16 @@
>              (TCPF_FIN_WAIT1 | TCPF_LAST_ACK)) &&
>          !ssk->isk.sk.sk_send_head &&
>          ssk->bufs) {
> +#if (PROPOSED_SDP_FIX == 1)
> +        skb = sk_stream_alloc_skb(&ssk->isk.sk,
> +            sizeof(struct sdp_bsdh),
> +            (ssk->isk.sk.sk_allocation == 0) ? (GFP_KERNEL) :
> +            ssk->isk.sk.sk_allocation);
> +#else
>          skb = sk_stream_alloc_skb(&ssk->isk.sk,
>                        sizeof(struct sdp_bsdh),
>                        GFP_KERNEL);
> +#endif
>          /* FIXME */
>          BUG_ON(!skb);
>          sdp_post_send(ssk, skb, SDP_MID_DISCONN);
> diff -Naur old/drivers/infiniband/ulp/sdp/sdp.h
> new/drivers/infiniband/ulp/sdp/sdp.h
> --- old/drivers/infiniband/ulp/sdp/sdp.h    2007-06-21 10:38:47.000000000 -0400
> +++ new/drivers/infiniband/ulp/sdp/sdp.h    2007-08-31 12:25:45.000000000 -0400
> @@ -7,6 +7,8 @@
>  #include <net/tcp.h> /* For urgent data flags */
>  #include <rdma/ib_verbs.h>
> 
> +#define PROPOSED_SDP_FIX 1
> +
>  #define sdp_printk(level, sk, format, arg...)                \
>      printk(level "sdp_sock(%d:%d): " format,             \
>             (sk) ? inet_sk(sk)->num : -1,                 \
> 
> 
> 
> 
> ---------------------
> Best Regards
> K Phillips
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
> 
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

_______________________________________________
general mailing list
general at lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general




More information about the general mailing list