[ofa-general] hang at module removal with local sa patches applied
Michael S. Tsirkin
mst at dev.mellanox.co.il
Mon Jun 18 04:48:43 PDT 2007
Hi!
I tried applying the latest local SA patches to 2.6.22-rc5, and applied the
patch at the bottom to disable the SA cache by default. After this, "openib stop"
hangs forever.
You can see the exact patches I applied here:
http://git.openfabrics.org/git/?p=~mst/ofed_kernel.git;a=tree;f=kernel_patches/attic;hb=ofed_kernel
Here's the sysrq trace of the threads that look IB-related.
[14897.168101] mthca_catas S 0000000000000001 0 8330 2 (L-TLB)
[14897.168104] ffff8100764bded0 0000000000000046 0000000000000000 0000000000000000
[14897.168107] ffff81007ebea950 0000000000000006 ffff81007ebea920 ffff81007ff1f4a0
[14897.168111] 00000d83a434d314 00000000000004b6 ffff81007ebeaad0 0000000000000046
[14897.168113] Call Trace:
[14897.168116] [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168119] [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168122] [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168125] [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168127] [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168130] [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168133] [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168134]
[14897.168136] ib_mad1 S 0000000000000003 0 8333 2 (L-TLB)
[14897.168139] ffff81007ce53ed0 0000000000000046 0000000000000000 ffff81007fcdc400
[14897.168142] 000000007ebf4990 000000000000000a ffff81007ebf4960 ffff81007fe0b520
[14897.168146] 00000d853dc5974d 00000000000012c8 ffff81007ebf4b10 ffff81007fe0b520
[14897.168149] Call Trace:
[14897.168152] [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168154] [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168157] [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168160] [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168162] [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168165] [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168168] [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168169]
[14897.168171] ib_mad2 S 0000000000000000 0 8334 2 (L-TLB)
[14897.168174] ffff81007ce51ed0 0000000000000046 0000000000000000 ffff81007edcdc00
[14897.168177] 000000007e86f710 000000000000000a ffff81007e86f6e0 ffffffff8070d4c0
[14897.168181] 00000d853dc7ba8f 00000000000012aa ffff81007e86f890 ffffffff8070d4c0
[14897.168184] Call Trace:
[14897.168187] [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168189] [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168192] [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168195] [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168198] [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168201] [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168203] [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168205]
[14897.168206] ib_mcast S 0000000000000000 0 8359 2 (L-TLB)
[14897.168210] ffff81007d3a3ed0 0000000000000046 0000000000000000 0000000000000000
[14897.168213] 0000ffff1b4012ff 000000000000000a ffff81007e8830c0 ffffffff8070d4c0
[14897.168216] 00000d84fe84fafa 0000000000001105 ffff81007e883270 0000000000010000
[14897.168219] Call Trace:
[14897.168222] [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168225] [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168228] [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168230] [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168233] [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168236] [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168239] [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168240]
[14897.168242] ib_inform S ffff81007e4d1740 0 8360 2 (L-TLB)
[14897.168245] ffff81007d0d1ed0 0000000000000046 0000000024000000 0000000000000000
[14897.168248] ffff810076c60130 0000000000000006 ffff810076c60100 ffff81007d1c7560
[14897.168252] 00000d83ee2e6167 000000000000035a ffff810076c602b0 0000000000000046
[14897.168254] Call Trace:
[14897.168257] [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168260] [<ffffffff80242036>] worker_thread+0xa2/0xe7
[14897.168263] [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168266] [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168268] [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168271] [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168274] [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168275]
[14897.168277] local_sa D 0000000000000001 0 8361 2 (L-TLB)
[14897.168280] ffff81007d0d3c10 0000000000000046 0000000000000000 800000ce00000000
[14897.168283] 84000b0000000000 000000000000000a ffff81007e8f3420 ffff81007ff1f4a0
[14897.168287] 00000d8431895ed4 0000000000000d33 ffff81007e8f35d0 800000ce00000000
[14897.168290] Call Trace:
[14897.168294] [<ffffffff80582e4a>] __mutex_lock_slowpath+0x69/0xaa
[14897.168303] [<ffffffff8806369a>] :ib_sa:port_work_handler+0x0/0x34
[14897.168306] [<ffffffff80582c87>] mutex_lock+0xe/0x10
[14897.168311] [<ffffffff880636b6>] :ib_sa:port_work_handler+0x1c/0x34
[14897.168314] [<ffffffff80241669>] run_workqueue+0x85/0x10f
[14897.168317] [<ffffffff80241851>] flush_cpu_workqueue+0x28/0x7b
[14897.168320] [<ffffffff80241ad0>] flush_workqueue+0x43/0x5d
[14897.168326] [<ffffffff88063250>] :ib_sa:cleanup_port+0x25/0x7b
[14897.168331] [<ffffffff88063307>] :ib_sa:process_updates+0x61/0x336
[14897.168335] [<ffffffff8058212b>] thread_return+0x0/0xea
[14897.168341] [<ffffffff88063656>] :ib_sa:add_update+0x7a/0x83
[14897.168347] [<ffffffff8806369a>] :ib_sa:port_work_handler+0x0/0x34
[14897.168352] [<ffffffff88063695>] :ib_sa:refresh_port_db+0x36/0x3b
[14897.168358] [<ffffffff880636be>] :ib_sa:port_work_handler+0x24/0x34
[14897.168361] [<ffffffff80241669>] run_workqueue+0x85/0x10f
[14897.168363] [<ffffffff80241f94>] worker_thread+0x0/0xe7
[14897.168366] [<ffffffff80242070>] worker_thread+0xdc/0xe7
[14897.168368] [<ffffffff802450af>] autoremove_wake_function+0x0/0x38
[14897.168371] [<ffffffff80244f8b>] kthread+0x49/0x76
[14897.168374] [<ffffffff8020aaa8>] child_rip+0xa/0x12
[14897.168377] [<ffffffff80244f42>] kthread+0x0/0x76
[14897.168379] [<ffffffff8020aa9e>] child_rip+0x0/0x12
[14897.168381]
[14897.168382] openibd S 0000000000000002 0 8598 6178 (NOTLB)
[14897.168386] ffff81007fadbeb8 0000000000000082 0000000000000000 ffff81007d4b2678
[14897.168389] 00000000005a5640 0000000000000001 ffff81007f7e60c0 ffff81007ff574e0
[14897.168392] 00000d84e88f6e97 0000000000007060 ffff81007f7e6270 ffff81007c309600
[14897.168396] Call Trace:
[14897.168399] [<ffffffff80235807>] do_wait+0xa0a/0xb1f
[14897.168402] [<ffffffff8022d6ce>] default_wake_function+0x0/0xf
[14897.168405] [<ffffffff80235944>] sys_wait4+0x28/0x2a
[14897.168408] [<ffffffff80209c8e>] system_call+0x7e/0x83
[14897.168410]
[14897.168411] modprobe D 0000000000000000 0 8640 8598 (NOTLB)
[14897.168415] ffff81007c90bd78 0000000000000086 0000000000000000 ffffffff807186a0
[14897.168418] ffff81007c90be68 0000000000000007 ffff81007730edc0 ffffffff8070d4c0
[14897.168422] 00000d852f6be2aa 0000000000000b50 ffff81007730ef70 0000000000000001
[14897.168424] Call Trace:
[14897.168428] [<ffffffff805822f9>] wait_for_completion+0x82/0xc1
[14897.168431] [<ffffffff8022d6ce>] default_wake_function+0x0/0xf
[14897.168434] [<ffffffff80241898>] flush_cpu_workqueue+0x6f/0x7b
[14897.168436] [<ffffffff802419d6>] wq_barrier_func+0x0/0xf
[14897.168439] [<ffffffff80241ad0>] flush_workqueue+0x43/0x5d
[14897.168445] [<ffffffff8806388b>] :ib_sa:sa_db_remove_dev+0x3d/0x9c
[14897.168448] [<ffffffff8022d6ce>] default_wake_function+0x0/0xf
[14897.168458] [<ffffffff8801069e>] :ib_core:ib_unregister_client+0x37/0xf0
[14897.168465] [<ffffffff880637f4>] :ib_sa:sa_db_cleanup+0x10/0x2a
[14897.168470] [<ffffffff8806459d>] :ib_sa:ib_sa_cleanup+0x9/0x2d
[14897.168474] [<ffffffff8025110e>] sys_delete_module+0x1b5/0x1e6
[14897.168477] [<ffffffff80209c8e>] system_call+0x7e/0x83
[14897.168479]
---
Disable SA cache by default.
Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>
---
Index: connectx/drivers/infiniband/core/local_sa.c
===================================================================
--- connectx.orig/drivers/infiniband/core/local_sa.c 2007-05-31 09:32:50.000000000 +0300
+++ connectx/drivers/infiniband/core/local_sa.c 2007-05-31 09:33:55.000000000 +0300
@@ -55,7 +55,7 @@ enum {
};
static int set_paths_per_dest(const char *val, struct kernel_param *kp);
-static unsigned long paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
+static unsigned long paths_per_dest = 0;
module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong,
&paths_per_dest, 0644);
MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "
--
MST
More information about the general
mailing list