[ofa-general] (no subject)

Mike Heinz michael.heinz at qlogic.com
Wed Jan 14 10:10:47 PST 2009


We've repeatedly run into a problem where mstvpd can hang on certain HCA models, and on HCAs that have failed. This is an issue for us, because mstvpd is one of the tools we use to automatically capture information about a system that's experiencing problems.

I previously opened PR 1440 on the problem, but it doesn't appear to have been investigated.

For this reason, I'm proposing the attached patch. Basically, it adds a configurable timeout and terminates the attempt to read the VPD area if it fails to retrieve data before the timeout expires. The default is 30 seconds. It uses a stupid busy-loop to check for the timeout because that's what the existing code does.

A few other changes were made to support this: command-line options are now processed with getopt(), and the usage() output was extended.

--- vpd.c.orig    2009-01-08 16:56:12.000000000 -0500
+++ vpd.c   2009-01-08 17:44:01.000000000 -0500
@@ -44,6 +44,13 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <libgen.h>
+#include <sys/times.h>
+
+/* pread is non-blocking, so we loop until we find data.  Unfortunately,
+ * we can loop forever if the HCA is crashed or if the wrong device is
+ * specified as an argument. So, we set time outs.
+ */
+static clock_t ticks_per_sec, start_t, curr_t, timeout_t = 30;

 struct vpd_cap {
      unsigned char id;
@@ -168,7 +175,13 @@
      if (ret != sizeof addr_flag)
            return ret;

+     start_t = times(NULL);
      while((addr_flag[1] & VPD_FLAG) != VPD_FLAG_READ_READY) {
+           curr_t = times(NULL);
+           if ((curr_t - start_t) / ticks_per_sec > timeout_t) {
+                 return -EIO;
+           }
+
            ret = pread(device, addr_flag, sizeof addr_flag,
                       vpd_cap_offset + VPD_ADDR_OFFSET);
            if (ret != sizeof addr_flag)
@@ -437,24 +450,34 @@
            rc = 1;
            goto usage;
      }
-     if (argc == 3) {
-           if (!strcmp("-m", argv[1])) {
-                 argv++;
-                 argc--;
-                 m = 1;
-           } else if (!strcmp("-n", argv[1])) {
-                 argv++;
-                 argc--;
-                 n = 1;
-           } else {
-                 rc = 2;
-                 goto usage;
+
+     ticks_per_sec = sysconf(_SC_CLK_TCK);
+
+     do
+     {
+           i=getopt(argc, argv, "mnt:");
+           if (i<0) {
+                 break;
            }
-     }

-     name = argv[1];
-     argv++;
-     argc--;
+           switch (i) {
+                 case 'm':
+                       m=1;
+                       break;
+                 case 'n':
+                       n=1;
+                       break;
+                 case 't':
+                       timeout_t = strtol(optarg, NULL, 0);
+                       break;
+                 default:
+                       goto usage;
+           }
+     } while (1 == 1);
+
+     name = argv[optind];
+     argc -= optind;
+     argv += optind;

      if (!strcmp("-", name)) {
            if (fread(d, VPD_MAX_SIZE, 1, stdin) != 1)
@@ -486,6 +509,14 @@
      return 0;

 usage:
-     fprintf(stderr, "Usage: %s [-m|-n] <file|-> [-- keyword ...]\n", argv[0]);
+     fprintf(stderr, "Usage: %s [-m|-n] [-t ##] <file> [-- keyword ...]\n", argv[0]);
+     fprintf(stderr, "-m\tDump raw VPD data to stdout.\n");
+     fprintf(stderr, "-n\tDo not validate check sum.\n");
+     fprintf(stderr, "-t ##\tTime out after ## seconds. (Default is 30.)\n\n");
+     fprintf(stderr, "file\tThe PCI id number of the HCA (for example, \"2:00.0\"),\n");
+     fprintf(stderr, "\tthe device name (such as \"mlx4_0\")\n");
+     fprintf(stderr, "\tthe absolute path to the device (\"/sys/class/infiniband/mlx4_0/device\")\n");
+     fprintf(stderr, "\tor '-' to read VPD data from the standard input.\n\n");
+     fprintf(stderr, "keyword(s): Only display the requested information. (ID, PN, EC, SN, etc...)\n");
      return rc;
 }


--
Michael Heinz
Principal Engineer, Qlogic Corporation
King of Prussia, Pennsylvania
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20090114/3b2f84ff/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: mstvpd.patch
Type: application/octet-stream
Size: 2501 bytes
Desc: mstvpd.patch
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20090114/3b2f84ff/attachment.obj>


More information about the general mailing list