[ofa-general] hang at module removal with local sa patches applied

Michael S. Tsirkin mst at dev.mellanox.co.il
Mon Jun 18 04:48:43 PDT 2007


Hi!
I tried applying the latest local SA patches to 2.6.22-rc5, and also applied the
patch at the bottom to disable the SA cache by default. After this, "openib stop"
hangs forever.

You can see the exact patches I applied here:
http://git.openfabrics.org/git/?p=~mst/ofed_kernel.git;a=tree;f=kernel_patches/attic;hb=ofed_kernel

Here's the sysrq trace of the threads that look IB-related.


[14897.168101] mthca_catas   S 0000000000000001     0  8330      2 (L-TLB)
[14897.168104]  ffff8100764bded0 0000000000000046 0000000000000000 0000000000000000
[14897.168107]  ffff81007ebea950 0000000000000006 ffff81007ebea920 ffff81007ff1f4a0
[14897.168111]  00000d83a434d314 00000000000004b6 ffff81007ebeaad0 0000000000000046
[14897.168113] Call Trace:
[14897.168116]  [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168119]  [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168122]  [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168125]  [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168127]  [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168130]  [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168133]  [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168134]
[14897.168136] ib_mad1       S 0000000000000003     0  8333      2 (L-TLB)
[14897.168139]  ffff81007ce53ed0 0000000000000046 0000000000000000 ffff81007fcdc400
[14897.168142]  000000007ebf4990 000000000000000a ffff81007ebf4960 ffff81007fe0b520
[14897.168146]  00000d853dc5974d 00000000000012c8 ffff81007ebf4b10 ffff81007fe0b520
[14897.168149] Call Trace:
[14897.168152]  [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168154]  [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168157]  [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168160]  [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168162]  [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168165]  [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168168]  [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168169]
[14897.168171] ib_mad2       S 0000000000000000     0  8334      2 (L-TLB)
[14897.168174]  ffff81007ce51ed0 0000000000000046 0000000000000000 ffff81007edcdc00
[14897.168177]  000000007e86f710 000000000000000a ffff81007e86f6e0 ffffffff8070d4c0
[14897.168181]  00000d853dc7ba8f 00000000000012aa ffff81007e86f890 ffffffff8070d4c0
[14897.168184] Call Trace:
[14897.168187]  [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168189]  [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168192]  [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168195]  [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168198]  [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168201]  [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168203]  [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168205]
[14897.168206] ib_mcast      S 0000000000000000     0  8359      2 (L-TLB)
[14897.168210]  ffff81007d3a3ed0 0000000000000046 0000000000000000 0000000000000000
[14897.168213]  0000ffff1b4012ff 000000000000000a ffff81007e8830c0 ffffffff8070d4c0
[14897.168216]  00000d84fe84fafa 0000000000001105 ffff81007e883270 0000000000010000
[14897.168219] Call Trace:
[14897.168222]  [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168225]  [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168228]  [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168230]  [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168233]  [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168236]  [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168239]  [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168240]
[14897.168242] ib_inform     S ffff81007e4d1740     0  8360      2 (L-TLB)
[14897.168245]  ffff81007d0d1ed0 0000000000000046 0000000024000000 0000000000000000
[14897.168248]  ffff810076c60130 0000000000000006 ffff810076c60100 ffff81007d1c7560
[14897.168252]  00000d83ee2e6167 000000000000035a ffff810076c602b0 0000000000000046
[14897.168254] Call Trace:
[14897.168257]  [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168260]  [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168263]  [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168266]  [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168268]  [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168271]  [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168274]  [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168275]
[14897.168277] local_sa      D 0000000000000001     0  8361      2 (L-TLB)
[14897.168280]  ffff81007d0d3c10 0000000000000046 0000000000000000 800000ce00000000
[14897.168283]  84000b0000000000 000000000000000a ffff81007e8f3420 ffff81007ff1f4a0
[14897.168287]  00000d8431895ed4 0000000000000d33 ffff81007e8f35d0 800000ce00000000
[14897.168290] Call Trace:
[14897.168294]  [<ffffffff80582e4a>] __mutex_lock_slowpath+0x69/0xaa
[14897.168303]  [<ffffffff8806369a>] :ib_sa:port_work_handler+0x0/0x34
[14897.168306]  [<ffffffff80582c87>] mutex_lock+0xe/0x10
[14897.168311]  [<ffffffff880636b6>] :ib_sa:port_work_handler+0x1c/0x34
[14897.168314]  [<ffffffff80241669>] run_workqueue+0x85/0x10f
[14897.168317]  [<ffffffff80241851>] flush_cpu_workqueue+0x28/0x7b
[14897.168320]  [<ffffffff80241ad0>] flush_workqueue+0x43/0x5d
[14897.168326]  [<ffffffff88063250>] :ib_sa:cleanup_port+0x25/0x7b
[14897.168331]  [<ffffffff88063307>] :ib_sa:process_updates+0x61/0x336
[14897.168335]  [<ffffffff8058212b>] thread_return+0x0/0xea
[14897.168341]  [<ffffffff88063656>] :ib_sa:add_update+0x7a/0x83
[14897.168347]  [<ffffffff8806369a>] :ib_sa:port_work_handler+0x0/0x34
[14897.168352]  [<ffffffff88063695>] :ib_sa:refresh_port_db+0x36/0x3b
[14897.168358]  [<ffffffff880636be>] :ib_sa:port_work_handler+0x24/0x34
[14897.168361]  [<ffffffff80241669>] run_workqueue+0x85/0x10f
[14897.168363]  [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168366]  [<ffffffff80242070>] worker_thread+0xdc/0xe7
[14897.168368]  [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168371]  [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168374]  [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168377]  [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168379]  [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168381]
[14897.168382] openibd       S 0000000000000002     0  8598   6178 (NOTLB)
[14897.168386]  ffff81007fadbeb8 0000000000000082 0000000000000000 ffff81007d4b2678
[14897.168389]  00000000005a5640 0000000000000001 ffff81007f7e60c0 ffff81007ff574e0
[14897.168392]  00000d84e88f6e97 0000000000007060 ffff81007f7e6270 ffff81007c309600
[14897.168396] Call Trace:
[14897.168399]  [<ffffffff80235807>] do_wait+0xa0a/0xb1f
[14897.168402]  [<ffffffff8022d6ce>] default_wake_function+0x0/0xf
[14897.168405]  [<ffffffff80235944>] sys_wait4+0x28/0x2a
[14897.168408]  [<ffffffff80209c8e>] system_call+0x7e/0x83
[14897.168410]
[14897.168411] modprobe      D 0000000000000000     0  8640   8598 (NOTLB)
[14897.168415]  ffff81007c90bd78 0000000000000086 0000000000000000 ffffffff807186a0
[14897.168418]  ffff81007c90be68 0000000000000007 ffff81007730edc0 ffffffff8070d4c0
[14897.168422]  00000d852f6be2aa 0000000000000b50 ffff81007730ef70 0000000000000001
[14897.168424] Call Trace:
[14897.168428]  [<ffffffff805822f9>] wait_for_completion+0x82/0xc1
[14897.168431]  [<ffffffff8022d6ce>] default_wake_function+0x0/0xf
[14897.168434]  [<ffffffff80241898>] flush_cpu_workqueue+0x6f/0x7b
[14897.168436]  [<ffffffff802419d6>] wq_barrier_func+0x0/0xf
[14897.168439]  [<ffffffff80241ad0>] flush_workqueue+0x43/0x5d
[14897.168445]  [<ffffffff8806388b>] :ib_sa:sa_db_remove_dev+0x3d/0x9c
[14897.168448]  [<ffffffff8022d6ce>] default_wake_function+0x0/0xf
[14897.168458]  [<ffffffff8801069e>] :ib_core:ib_unregister_client+0x37/0xf0
[14897.168465]  [<ffffffff880637f4>] :ib_sa:sa_db_cleanup+0x10/0x2a
[14897.168470]  [<ffffffff8806459d>] :ib_sa:ib_sa_cleanup+0x9/0x2d
[14897.168474]  [<ffffffff8025110e>] sys_delete_module+0x1b5/0x1e6
[14897.168477]  [<ffffffff80209c8e>] system_call+0x7e/0x83
[14897.168479]


---

Disable SA cache by default.

Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>

---

Index: connectx/drivers/infiniband/core/local_sa.c
===================================================================
--- connectx.orig/drivers/infiniband/core/local_sa.c	2007-05-31 09:32:50.000000000 +0300
+++ connectx/drivers/infiniband/core/local_sa.c	2007-05-31 09:33:55.000000000 +0300
@@ -55,7 +55,7 @@ enum {
 };
 
 static int set_paths_per_dest(const char *val, struct kernel_param *kp);
-static unsigned long paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
+static unsigned long paths_per_dest = 0;
 module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong,
 		  &paths_per_dest, 0644);
 MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "

-- 
MST



More information about the general mailing list