[ofa-general] SDP memory allocation policy problem?

Ken Phillips phillips.ken at gmail.com
Tue Sep 18 12:47:42 PDT 2007


Greetings,

Teammates here report the following:

Problem

The method SDP uses to allocate socket buffers may cause the
node to hang under memory pressure.

Details

Each kernel level socket has an allocation flag to specify the
memory allocation policy for socket buffers, the default is GFP_ATOMIC
(or GFP_KERNEL for SDP).  If the caller creates a socket with the
policy set to GFP_NOFS or GFP_NOIO this should be the allocation
policy used by the SDP layer.

The problem we are seeing is that if a node is under load, and
a memory allocation fails (say in sock_sendmsg()), the kernel will
use the allocation policy to decide how to proceed with the allocation.
If GFP_KERNEL is specified, then the kernel may attempt to free pages
through the iSCSI block device that is making the socket call, which
would result in a deadlock.  Use of GFP_NOIO should prevent the kernel
from using the IO backend to free memory resources.

here is a sample stack trace from Alt-Sysrq during one of these
lockups,

> tx_worker     D ffffff0014d14000     0 10195      1         10196 10194
> (L-TLB)
> 00000100707e98d8 0000000000000046 0000000000000004 0000000000000212
> 0000000000000212 ffffffffa018ccae 0000000000000246 0000000000000246
> 000001007873c7f0 0000000000000320
> Call Trace:<ffffffffa018ccae>{:ib_mthca:mthca_poll_cq+2258}
> <ffffffff8030ad5c>{schedule_timeout+224}
> <ffffffff802a9db7>{lock_sock+152}
> <ffffffff80135756>{autoremove_wake_function+0}
> <ffffffffa0538b13>{:ib_sdp:sdp_poll_cq+58}
> <ffffffff80135756>{autoremove_wake_function+0}
> <ffffffff802a9dfd>{release_sock+16}
> <ffffffffa0534b18>{:ib_sdp:sdp_sendmsg+33}
> <ffffffff802a730f>{sock_sendmsg+271}
> <ffffffffa05386ad>{:ib_sdp:sdp_post_sends+619}
> <ffffffff802a9dfd>{release_sock+16}
> <ffffffffa05353a5>{:ib_sdp:sdp_sendmsg+2222}
> <ffffffff80135756>{autoremove_wake_function+0}
> <ffffffffa057708f>{:rs_iscsi:iscsi_sock_msg+1265}
> <ffffffffa057708b>{:rs_iscsi:iscsi_sock_msg+1261}
> <ffffffff80132159>{recalc_task_prio+337}
> <ffffffffa055bfdb>{:rs_iscsi:scsi_command_i+5283}
> <ffffffff8030a2c9>{thread_return+0}
> <ffffffff8030a321>{thread_return+88}
> <ffffffff8013fdf7>{del_timer+107}
> <ffffffff8013feb4>{del_singleshot_timer_sync+9}
> <ffffffff8030adf3>{schedule_timeout+375}
> <ffffffffa056829e>{:rs_iscsi:tx_worker_proc_i+6819}
> <ffffffff80110f47>{child_rip+8}
> <ffffffffa05667fb>{:rs_iscsi:tx_worker_proc_i+0}
> <ffffffff80110f3f>{child_rip+0}
>
>

We still don't know the scope of changes to be made, but we think,
at minimum that some of the memory allocation in SDP should be changed,
for example.

diff -Naur old/drivers/infiniband/ulp/sdp/sdp_bcopy.c
new/drivers/infiniband/ulp/sdp/sdp_bcopy.c
--- old/drivers/infiniband/ulp/sdp/sdp_bcopy.c    2007-06-21
10:38:47.000000000 -0400
+++ new/drivers/infiniband/ulp/sdp/sdp_bcopy.c    2007-08-31
12:25:58.000000000 -0400
@@ -224,13 +224,27 @@

     /* Now, allocate and repost recv */
     /* TODO: allocate from cache */
+
+#if (PROPOSED_SDP_FIX == 1)
+    skb = sk_stream_alloc_skb(&ssk->isk.sk, SDP_HEAD_SIZE,
+        (ssk->isk.sk.sk_allocation == 0) ? (GFP_KERNEL) :
+        ssk->isk.sk.sk_allocation);
+#else
     skb = sk_stream_alloc_skb(&ssk->isk.sk, SDP_HEAD_SIZE,
                   GFP_KERNEL);
+#endif
     /* FIXME */
     BUG_ON(!skb);
     h = (struct sdp_bsdh *)skb->head;
     for (i = 0; i < ssk->recv_frags; ++i) {
+#if (PROPOSED_SDP_FIX == 1)
+        page = alloc_pages((ssk->isk.sk.sk_allocation == 0)
+            ? (GFP_HIGHUSER) :
+            (ssk->isk.sk.sk_allocation | (__GFP_HIGHMEM)),
+            0);
+#else
         page = alloc_pages(GFP_HIGHUSER, 0);
+#endif
         BUG_ON(!page);
         frag = &skb_shinfo(skb)->frags[i];
         frag->page                = page;
@@ -406,10 +420,18 @@
         ssk->tx_head - ssk->tx_tail < SDP_TX_SIZE) {
         struct sdp_chrecvbuf *resp_size;
         ssk->recv_request = 0;
+#if (PROPOSED_SDP_FIX == 1)
+        skb = sk_stream_alloc_skb(&ssk->isk.sk,
+            sizeof(struct sdp_bsdh) +
+            sizeof(*resp_size),
+            (ssk->isk.sk.sk_allocation == 0) ? (GFP_KERNEL) :
+            ssk->isk.sk.sk_allocation);
+#else
         skb = sk_stream_alloc_skb(&ssk->isk.sk,
                       sizeof(struct sdp_bsdh) +
                       sizeof(*resp_size),
                       GFP_KERNEL);
+#endif
         /* FIXME */
         BUG_ON(!skb);
         resp_size = (struct sdp_chrecvbuf *)skb_put(skb, sizeof *resp_size);
@@ -431,10 +453,18 @@
         ssk->tx_head > ssk->sent_request_head + SDP_RESIZE_WAIT &&
         ssk->tx_head - ssk->tx_tail < SDP_TX_SIZE) {
         struct sdp_chrecvbuf *req_size;
+#if (PROPOSED_SDP_FIX == 1)
+        skb = sk_stream_alloc_skb(&ssk->isk.sk,
+            sizeof(struct sdp_bsdh) +
+            sizeof(*req_size),
+            (ssk->isk.sk.sk_allocation == 0) ? (GFP_KERNEL) :
+            ssk->isk.sk.sk_allocation);
+#else
         skb = sk_stream_alloc_skb(&ssk->isk.sk,
                       sizeof(struct sdp_bsdh) +
                       sizeof(*req_size),
                       GFP_KERNEL);
+#endif
         /* FIXME */
         BUG_ON(!skb);
         ssk->sent_request = SDP_MAX_SEND_SKB_FRAGS * PAGE_SIZE;
@@ -463,9 +493,16 @@
             (TCPF_FIN_WAIT1 | TCPF_LAST_ACK)) &&
         !ssk->isk.sk.sk_send_head &&
         ssk->bufs) {
+#if (PROPOSED_SDP_FIX == 1)
+        skb = sk_stream_alloc_skb(&ssk->isk.sk,
+            sizeof(struct sdp_bsdh),
+            (ssk->isk.sk.sk_allocation == 0) ? (GFP_KERNEL) :
+            ssk->isk.sk.sk_allocation);
+#else
         skb = sk_stream_alloc_skb(&ssk->isk.sk,
                       sizeof(struct sdp_bsdh),
                       GFP_KERNEL);
+#endif
         /* FIXME */
         BUG_ON(!skb);
         sdp_post_send(ssk, skb, SDP_MID_DISCONN);
diff -Naur old/drivers/infiniband/ulp/sdp/sdp.h
new/drivers/infiniband/ulp/sdp/sdp.h
--- old/drivers/infiniband/ulp/sdp/sdp.h    2007-06-21 10:38:47.000000000 -0400
+++ new/drivers/infiniband/ulp/sdp/sdp.h    2007-08-31 12:25:45.000000000 -0400
@@ -7,6 +7,8 @@
 #include <net/tcp.h> /* For urgent data flags */
 #include <rdma/ib_verbs.h>

+#define PROPOSED_SDP_FIX 1
+
 #define sdp_printk(level, sk, format, arg...)                \
     printk(level "sdp_sock(%d:%d): " format,             \
            (sk) ? inet_sk(sk)->num : -1,                 \




---------------------
Best Regards
K Phillips



More information about the general mailing list