[ofw][patch][HW] fix bugs in low resources flow

Leonid Keller leonid at mellanox.co.il
Sun Mar 29 08:35:45 PDT 2009


Applied in 2064.


________________________________

	From: ofw-bounces at lists.openfabrics.org
[mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Leonid Keller
	Sent: Wednesday, March 25, 2009 4:17 PM
	To: ofw at lists.openfabrics.org
	Subject: [ofw][patch][HW] fix bugs in low resources flow
	
	
	This patch fixes several bugs that show up under low-resource conditions
	(found with the help of Verifier with error injection).
	 
	Index: hw/mlx4/kernel/bus/core/cache.c
	
===================================================================
	--- hw/mlx4/kernel/bus/core/cache.c (revision 2055)
	+++ hw/mlx4/kernel/bus/core/cache.c (working copy)
	@@ -366,48 +366,38 @@
	  int p;
	  int port_num;
	  
	+ shutter_init( &device->cache.x.work_thread );
	  rwlock_init(&device->cache.lock);
	+ INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
	+         device, ib_cache_event, NULL, NULL, 0);
	+ ib_register_event_handler(&device->cache.event_handler);
	+
	  port_num = end_port(device) - start_port(device) + 1;
	- 
	  if (port_num > 0 ) { 
	   // if port_num ==0   ==> there are no IB ports
	   device->cache.pkey_cache =
	    kmalloc(sizeof *device->cache.pkey_cache * port_num,
GFP_KERNEL);
	   device->cache.gid_cache =
	    kmalloc(sizeof *device->cache.gid_cache * port_num,
GFP_KERNEL);
	-
	   device->cache.lmc_cache = kmalloc(sizeof
*device->cache.lmc_cache *
	-        port_num, GFP_KERNEL);
	+   port_num, GFP_KERNEL);
	 
	   if (!device->cache.pkey_cache || !device->cache.gid_cache ||
	-      !device->cache.lmc_cache) {
	+   !device->cache.lmc_cache) {
	    printk(KERN_WARNING "Couldn't allocate cache "
	-          "for %s\n", device->name);
	+    "for %s\n", device->name);
	    goto err;
	   }
	  }
	 
	- shutter_init( &device->cache.x.work_thread );
	-
	  for (p = 0; p < port_num; ++p) {
	   device->cache.pkey_cache[p] = NULL;
	   device->cache.gid_cache [p] = NULL;
	   ib_cache_update(device, (u8)(p + start_port(device)));
	  }
	 
	- INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
	-         device, ib_cache_event, NULL, NULL, 0);
	- if (ib_register_event_handler(&device->cache.event_handler))
	-  goto err_cache;
	-
	  return;
	 
	-err_cache:
	- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
	-  kfree(device->cache.pkey_cache[p]);
	-  kfree(device->cache.gid_cache[p]);
	- }
	-
	 err:
	  kfree(device->cache.pkey_cache);
	  kfree(device->cache.gid_cache);
	@@ -422,6 +412,7 @@
	 {
	  int p;
	 
	+ ASSERT(device->cache.event_handler.device);
	  ib_unregister_event_handler(&device->cache.event_handler);
	  // instead of Linux flush_scheduled_work(): wait for them to
quit
	  shutter_shut( &device->cache.x.work_thread );
	Index: hw/mlx4/kernel/bus/core/device.c
	
===================================================================
	--- hw/mlx4/kernel/bus/core/device.c (revision 2055)
	+++ hw/mlx4/kernel/bus/core/device.c (working copy)
	@@ -302,17 +302,23 @@
	   goto out;
	  }
	 
	- list_add_tail(&device->core_list, &device_list);
	-
	- device->reg_state = IB_DEV_REGISTERED;
	-
	  {
	   struct ib_client *client;
	 
	-  list_for_each_entry(client, &client_list, list, struct
ib_client)
	-   if (client->add && !add_client_context(device, client))
	+  list_for_each_entry(client, &client_list, list, struct
ib_client) {
	+   if ( add_client_context(device, client) ) {
	+    printk(KERN_WARNING "add_client_context failed for device
%s\n",
	+        device->name);
	+    ret = -EFAULT;
	+    goto out;
	+   }
	+   if (client->add)
	     client->add(device);
	+  }
	  }
	+    
	+ list_add_tail(&device->core_list, &device_list);
	+ device->reg_state = IB_DEV_REGISTERED;
	 
	  out:
	  mutex_unlock(&device_mutex);
	@@ -381,17 +387,25 @@
	 int ib_register_client(struct ib_client *client)
	 {
	  struct ib_device *device;
	+ int ret = 0;
	 
	  mutex_lock(&device_mutex);
	 
	- list_add_tail(&client->list, &client_list);
	- list_for_each_entry(device, &device_list, core_list, struct
ib_device)
	-  if (client->add && !add_client_context(device, client))
	+ list_for_each_entry(device, &device_list, core_list, struct
ib_device) {
	+  if ( add_client_context(device, client) ) {
	+   printk(KERN_WARNING "add_client_context failed for device
%s\n",
	+       device->name);
	+   ret = -EFAULT;
	+   goto out;
	+  }
	+  if (client->add)
	    client->add(device);
	-
	+ }
	+    
	+    list_add_tail(&client->list, &client_list);
	+out:
	  mutex_unlock(&device_mutex);
	-
	- return 0;
	+ return ret;
	 }
	 EXPORT_SYMBOL(ib_register_client);
	 
	Index: hw/mlx4/kernel/bus/drv/drv.c
	
===================================================================
	--- hw/mlx4/kernel/bus/drv/drv.c (revision 2055)
	+++ hw/mlx4/kernel/bus/drv/drv.c (working copy)
	@@ -323,10 +323,12 @@
	 
	  p_fdo->bus_ib_ifc.pdev = &p_fdo->pci_dev;
	  p_fdo->bus_ib_ifc.p_ibdev = p_fdo->pci_dev.ib_dev;
	- p_fdo->bus_ib_ifc.pmlx4_dev =
to_mdev(p_fdo->pci_dev.ib_dev)->dev;
	- p_fdo->bus_ib_ifc.is_livefish =
mlx4_is_livefish(p_fdo->pci_dev.dev);
	- if ( p_fdo->bus_ib_ifc.pmlx4_dev->flags & MLX4_FLAG_MSI_X )
	-  p_fdo->bus_ib_ifc.n_msi_vectors =
p_fdo->pci_dev.n_msi_vectors - 2;
	+    p_fdo->bus_ib_ifc.is_livefish =
mlx4_is_livefish(p_fdo->pci_dev.dev);    
	+    if ( p_fdo->bus_ib_ifc.is_livefish == 0 ) {
	+        p_fdo->bus_ib_ifc.pmlx4_dev =
to_mdev(p_fdo->pci_dev.ib_dev)->dev;    
	+     if ( p_fdo->bus_ib_ifc.pmlx4_dev->flags & MLX4_FLAG_MSI_X
)
	+      p_fdo->bus_ib_ifc.n_msi_vectors =
p_fdo->pci_dev.n_msi_vectors - 2;
	+    }
	 
	  p_fdo->card_started = TRUE;
	 
	@@ -572,13 +574,13 @@
	      pdev->int_info = *desc;
	     if (desc->Flags & CM_RESOURCE_INTERRUPT_MESSAGE) {
	      pdev->n_msi_vectors_alloc =
(u8)(pdev->n_msi_vectors_alloc+desc_raw->u.MessageInterrupt.Raw.MessageC
ount);
	-     MLX4_PRINT(TRACE_LEVEL_WARNING, MLX4_DBG_DRV,
	+     MLX4_PRINT(TRACE_LEVEL_VERBOSE, MLX4_DBG_DRV,
	       ("EvtPrepareHardware: Desc %d: MsiInterrupt: Share %d,
Flags %#x, Level %d, Vector %#x, Affinity %#x\n", 
	       i, desc->ShareDisposition, desc->Flags,
	       desc->u.MessageInterrupt.Translated.Level, 
	       desc->u.MessageInterrupt.Translated.Vector, 
	       (u32)desc->u.MessageInterrupt.Translated.Affinity ));
	-     MLX4_PRINT(TRACE_LEVEL_WARNING, MLX4_DBG_DRV,
	+     MLX4_PRINT(TRACE_LEVEL_VERBOSE, MLX4_DBG_DRV,
	       ("EvtPrepareHardware: Desc %d: RawMsiInterrupt: Share %d,
Flags %#x, MessageCount %#hx, Vector %#x, Affinity %#x\n", 
	       i, desc_raw->ShareDisposition, desc_raw->Flags,
	       desc_raw->u.MessageInterrupt.Raw.MessageCount, 
	@@ -586,7 +588,7 @@
	       (u32)desc_raw->u.MessageInterrupt.Raw.Affinity ));
	     }
	     else { // line-based interrupt
	-     MLX4_PRINT(TRACE_LEVEL_WARNING, MLX4_DBG_DRV,
	+     MLX4_PRINT(TRACE_LEVEL_VERBOSE, MLX4_DBG_DRV,
	       ("EvtPrepareHardware: Desc %d: LineInterrupt: Share %d,
Flags %#x, Level %d, Vector %#x, Affinity %#x\n", 
	       i, desc->ShareDisposition, desc->Flags,
	       desc->u.Interrupt.Level, desc->u.Interrupt.Vector, 
	@@ -774,14 +776,14 @@
	 }
	 
	 NTSTATUS
	-EvtDeviceAdd(
	+EvtDriverDeviceAdd(
	  IN WDFDRIVER        Driver,
	  IN PWDFDEVICE_INIT  DeviceInit
	  )
	 /*++
	 Routine Description:
	 
	- EvtDeviceAdd is called by the framework in response to
AddDevice
	+ EvtDriverDeviceAdd is called by the framework in response to
AddDevice
	  call from the PnP manager. We create and initialize a device
object to
	  represent a new instance of mxe bus.
	 
	@@ -1191,7 +1193,7 @@
	  //
	 
	  WDF_DRIVER_CONFIG_INIT(
	-  &config, EvtDeviceAdd );
	+  &config, EvtDriverDeviceAdd );
	  config.EvtDriverUnload = EvtDriverUnload;
	 
	  //
	Index: hw/mlx4/kernel/bus/drv/drv.h
	
===================================================================
	--- hw/mlx4/kernel/bus/drv/drv.h (revision 2055)
	+++ hw/mlx4/kernel/bus/drv/drv.h (working copy)
	@@ -155,7 +155,7 @@
	  );
	  
	 NTSTATUS
	-EvtDeviceAdd(
	+EvtDriverDeviceAdd(
	  IN WDFDRIVER        Driver,
	  IN PWDFDEVICE_INIT  DeviceInit
	  );
	Index: hw/mlx4/kernel/bus/drv/pci.c
	
===================================================================
	--- hw/mlx4/kernel/bus/drv/pci.c (revision 2055)
	+++ hw/mlx4/kernel/bus/drv/pci.c (working copy)
	@@ -541,7 +541,7 @@
	    p_vector = ka;
	    /* print (allocated+2) vectors */
	    for (i=0; i<pdev->n_msi_vectors_alloc+2; i++) {
	-    MLX4_PRINT( TRACE_LEVEL_WARNING  ,MLX4_DBG_PNP  ,
	+    MLX4_PRINT( TRACE_LEVEL_VERBOSE  ,MLX4_DBG_PNP  ,
	      ("MSI-X Vectors: Id %d, Masked %d, Addr %#I64x, Data
%#x\n",
	      i, MSIX_VECTOR_MASKED(p_vector[i].Flags),
	      p_vector[i].Addr, p_vector[i].Data ));
	@@ -587,7 +587,7 @@
	 )
	 {
	  u32       sem;
	- NTSTATUS     status = STATUS_SUCCESS;
	+ NTSTATUS     status = STATUS_SUCCESS, status1;
	  PBUS_INTERFACE_STANDARD  p_ifc = &pdev->bus_pci_ifc;
	  PCI_COMMON_CONFIG*   p_cfg = &pdev->pci_cfg_space;
	  struct msix_saved_info   msix_info;
	@@ -703,19 +703,19 @@
	   }
	  }
	 
	+ status = STATUS_SUCCESS;
	+
	+err:
	  /* restore MSI-X info after reset */
	- status = __pci_restore_msix_info( pdev, &msix_info );
	- if (!NT_SUCCESS(status))
	-  goto err;
	+ status1 = __pci_restore_msix_info( pdev, &msix_info );
	+ status = (!status) ? status1 : status; /* return the only or
the first error */
	+ if( NT_SUCCESS( status ) ) {
	+  MLX4_PRINT( TRACE_LEVEL_WARNING ,MLX4_DBG_PNP , ("HCA has
been reset ! \n"));
	+ }
	 
	- /* check, whether MSI-X capabilities were restore */
	+ /* check, whether MSI-X capabilities have been restored */
	  pci_get_msi_info( pdev, p_cfg, &pdev->uplink_info );
	 
	- MLX4_PRINT( TRACE_LEVEL_WARNING ,MLX4_DBG_PNP , ("HCA has been
reset ! \n"));
	-
	- status = STATUS_SUCCESS;
	-
	-err:
	  if (pdev->msix_info.valid) 
	   pci_free_msix_info_resources(&pdev->msix_info);
	  MLX4_EXIT( MLX4_DBG_PNP );
	Index: hw/mlx4/kernel/bus/ib/main.c
	
===================================================================
	--- hw/mlx4/kernel/bus/ib/main.c (revision 2055)
	+++ hw/mlx4/kernel/bus/ib/main.c (working copy)
	@@ -611,6 +611,7 @@
	  mlx4_pd_free(dev, ibdev->priv_pdn);
	 
	 err_dealloc:
	+ ibdev->ib_dev.reg_state = IB_DEV_UNINITIALIZED;
	  ib_dealloc_device(&ibdev->ib_dev);
	 
	  return NULL;
	Index: hw/mlx4/kernel/bus/net/catas.c
	
===================================================================
	--- hw/mlx4/kernel/bus/net/catas.c (revision 2055)
	+++ hw/mlx4/kernel/bus/net/catas.c (working copy)
	@@ -370,6 +370,7 @@
	   // to allow for end of operations that are in progress
	   reset_work = IoAllocateWorkItem( dev->pdev->p_self_do );
	   if (!reset_work) {
	+            spin_unlock_irqrestore(&ibdev->event_handler_lock,
flags);
	    mlx4_err(dev, "mlx4_reset_request IoAllocateWorkItem failed,
reset will not be propagated\n");
	    err = -EFAULT;
	    goto err_workitem;
	Index: hw/mlx4/kernel/bus/net/cmd.c
	
===================================================================
	--- hw/mlx4/kernel/bus/net/cmd.c (revision 2055)
	+++ hw/mlx4/kernel/bus/net/cmd.c (working copy)
	@@ -337,9 +337,15 @@
	     mlx4_dispatch_reset_event(dev->pdev->ib_dev,
IB_EVENT_RESET_DRIVER);
	    }
	   }
	+  else {
	+   err = -EFAULT;
	+   mlx4_err(dev, "mlx4_cmd_wait: Unexpected end of waiting for
a comand \n");
	+   ASSERT(0);
	+  }
	  }
	-
	- err = context->result;
	+ else
	+  err = context->result;
	+ 
	  if (err)
	   goto out;
	 
	Index: hw/mlx4/kernel/bus/net/intf.c
	
===================================================================
	--- hw/mlx4/kernel/bus/net/intf.c (revision 2055)
	+++ hw/mlx4/kernel/bus/net/intf.c (working copy)
	@@ -43,13 +43,13 @@
	 static LIST_HEAD(dev_list);
	 static DEFINE_MUTEX(intf_mutex);
	 
	-static void mlx4_add_device(struct mlx4_interface *intf, struct
mlx4_priv *priv)
	+static int mlx4_add_device(struct mlx4_interface *intf, struct
mlx4_priv *priv)
	 {
	  struct mlx4_device_context *dev_ctx;
	 
	  dev_ctx = kmalloc(sizeof *dev_ctx, GFP_KERNEL);
	  if (!dev_ctx)
	-  return;
	+  return -EFAULT;
	 
	  dev_ctx->intf    = intf;
	  dev_ctx->context = intf->add(&priv->dev);
	@@ -59,8 +59,11 @@
	   spin_lock_irq(&priv->ctx_lock);
	   list_add_tail(&dev_ctx->list, &priv->ctx_list);
	   spin_unlock_irq(&priv->ctx_lock);
	- } else
	+ } else {
	   kfree(dev_ctx);
	+  return -EFAULT;
	+ }
	+ return 0;
	 }
	 
	 static void mlx4_remove_device(struct mlx4_interface *intf,
struct mlx4_priv *priv)
	@@ -82,19 +85,25 @@
	 int mlx4_register_interface(struct mlx4_interface *intf)
	 {
	  struct mlx4_priv *priv;
	+ int err = 0;
	 
	  if (!intf->add || !intf->remove)
	   return -EINVAL;
	 
	  mutex_lock(&intf_mutex);
	 
	- list_add_tail(&intf->list, &intf_list);
	- list_for_each_entry(priv, &dev_list, dev_list, struct
mlx4_priv)
	-  mlx4_add_device(intf, priv);
	+ list_for_each_entry(priv, &dev_list, dev_list, struct
mlx4_priv) {
	+  if (mlx4_add_device(intf, priv)) {
	+   err = -EFAULT;
	+   goto end;
	+  }
	+ }
	 
	+    list_add_tail(&intf->list, &intf_list);
	+
	+end:
	  mutex_unlock(&intf_mutex);
	-
	- return 0;
	+ return err;
	 }
	 EXPORT_SYMBOL_GPL(mlx4_register_interface);
	 
	@@ -137,12 +146,18 @@
	 
	  mutex_lock(&intf_mutex);
	 
	+ list_for_each_entry(intf, &intf_list, list, struct
mlx4_interface) {
	+  if (mlx4_add_device(intf, priv)) {
	+   err = -EFAULT;
	+   goto end;
	+  }
	+ }
	+    
	  list_add_tail(&priv->dev_list, &dev_list);
	- list_for_each_entry(intf, &intf_list, list, struct
mlx4_interface)
	-  mlx4_add_device(intf, priv);
	-
	+    
	+end:
	  mutex_unlock(&intf_mutex);
	- if (!mlx4_is_livefish(dev))
	+ if (!err && !mlx4_is_livefish(dev))
	   err = mlx4_start_catas_poll(dev);
	 
	  return err;
	Index: hw/mlx4/kernel/bus/net/main.c
	
===================================================================
	--- hw/mlx4/kernel/bus/net/main.c (revision 2055)
	+++ hw/mlx4/kernel/bus/net/main.c (working copy)
	@@ -956,9 +956,11 @@
	     ("mlx4_register_device for livefish failed, return with
error.\n"));
	    pdev->dev = NULL;
	    kfree(priv);
	+  } 
	+  else {
	+      MLX4_PRINT(TRACE_LEVEL_ERROR ,MLX4_DBG_LOW ,
	+       ("MLX4_BUS started in \"livefish\" mode !!!.\n"));
	   }
	-  MLX4_PRINT(TRACE_LEVEL_ERROR ,MLX4_DBG_LOW ,
	-   ("MLX4_BUS started in \"livefish\" mode !!!.\n"));
	   goto end;
	  }
	 
	@@ -1064,8 +1066,8 @@
	   mlx4_close_hca(dev);
	   mlx4_cmd_cleanup(dev);
	 
	-  if (reset)
	-   mlx4_reset(dev);
	+  if (reset && mlx4_reset(dev))
	+   mlx4_err(dev, "Failed to reset HCA\n");
	   mlx4_dbg(dev, "MLX4_BUS: NET device (dev_id=%d) is REMOVED !
\n", (int)pdev->dev_id);
	   pdev->dev = NULL;
	 done:
	Index: hw/mthca/kernel/mt_cache.c
	
===================================================================
	--- hw/mthca/kernel/mt_cache.c (revision 2055)
	+++ hw/mthca/kernel/mt_cache.c (working copy)
	@@ -341,6 +341,9 @@
	  u8 p;
	 
	  rwlock_init(&device->cache.lock);
	+ INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
	+         device, ib_cache_event);
	+ ib_register_event_handler(&device->cache.event_handler);
	 
	  device->cache.pkey_cache =
	   kmalloc(sizeof *device->cache.pkey_cache *
	@@ -361,19 +364,8 @@
	   ib_cache_update(device, p + start_port(device));
	  }
	 
	- INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
	-         device, ib_cache_event);
	- if (ib_register_event_handler(&device->cache.event_handler))
	-  goto err_cache;
	-
	  return;
	 
	-err_cache:
	- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
	-  kfree(device->cache.pkey_cache[p]);
	-  kfree(device->cache.gid_cache[p]);
	- }
	-
	 err:
	  kfree(device->cache.pkey_cache);
	  kfree(device->cache.gid_cache);
	

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20090329/a3e0ec5a/attachment.html>


More information about the ofw mailing list