[ofa-general] [PATCH] mlx4: add device reset to Internal Error handling mechanism

Jack Morgenstein jackm at dev.mellanox.co.il
Mon Jul 9 00:12:52 PDT 2007


Add device reset to mlx4 Internal Error handling. Also, detect errors
via polling the device error buffer (rather than via interrupt), because
this provides better coverage.

This patch also disables the detection of Internal Errors via a device
interrupt, because we wish to avoid the complexity of supporting
two independent detection mechanisms.

Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>

diff --git a/drivers/net/mlx4/catas.c b/drivers/net/mlx4/catas.c
index 1bb088a..94bc784 100644
--- a/drivers/net/mlx4/catas.c
+++ b/drivers/net/mlx4/catas.c
@@ -30,15 +30,32 @@
  * SOFTWARE.
  */
 
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
 #include "mlx4.h"
 
+enum {
+	MLX4_CATAS_POLL_INTERVAL	= 5 * HZ,
+};
+
+static DEFINE_SPINLOCK(catas_lock);
+
+static LIST_HEAD(catas_list);
+static struct workqueue_struct *catas_wq;
+static struct work_struct catas_work;
+
+static int ierr_reset_disable;
+module_param_named(ierr_reset_disable, ierr_reset_disable, int, 0644);
+MODULE_PARM_DESC(ierr_reset_disable, "disable reset on Internal Error event if nonzero");
+
 void mlx4_handle_catas_err(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	int i;
 
-	mlx4_err(dev, "Catastrophic error detected:\n");
+	mlx4_err(dev, "Internal error detected:\n");
 	for (i = 0; i < priv->fw.catas_size; ++i)
 		mlx4_err(dev, "  buf[%02x]: %08x\n",
 			 i, swab32(readl(priv->catas_err.map + i)));
@@ -46,25 +63,118 @@ void mlx4_handle_catas_err(struct mlx4_dev *dev)
 	mlx4_dispatch_event(dev, MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR, 0, 0);
 }
 
-void mlx4_map_catas_buf(struct mlx4_dev *dev)
+static void catas_reset(struct work_struct *work)
+{
+	struct mlx4_priv *priv, *tmppriv;
+	struct mlx4_dev *dev;
+
+	LIST_HEAD(tlist);
+	int ret;
+
+	spin_lock_irq(&catas_lock);
+	list_splice_init(&catas_list, &tlist);
+	spin_unlock_irq(&catas_lock);
+
+	list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) {
+		ret = mlx4_restart_one(priv->dev.pdev);
+		dev = &priv->dev;
+		if (ret)
+			mlx4_err(dev, "Reset failed (%d)\n", ret);
+		else
+			mlx4_dbg(dev, "Reset succeeded\n");
+	}
+}
+
+static void handle_catas(struct mlx4_dev *dev)
+{
+	unsigned long flags;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_handle_catas_err(dev);
+
+	if (ierr_reset_disable)
+		return;
+
+	spin_lock_irqsave(&catas_lock, flags);
+	list_add(&priv->catas_err.list, &catas_list);
+	queue_work(catas_wq, &catas_work);
+	spin_unlock_irqrestore(&catas_lock, flags);
+}
+
+static void poll_catas(unsigned long dev_ptr)
+{
+	struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned long flags;
+
+	if (readl(priv->catas_err.map)) {
+		handle_catas(&priv->dev);
+		return;
+	}
+
+	spin_lock_irqsave(&catas_lock, flags);
+	if (!priv->catas_err.stop)
+		mod_timer(&priv->catas_err.timer,
+			  jiffies + MLX4_CATAS_POLL_INTERVAL);
+	spin_unlock_irqrestore(&catas_lock, flags);
+
+	return;
+}
+
+void mlx4_start_catas_poll(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	unsigned long addr;
 
+	init_timer(&priv->catas_err.timer);
+	priv->catas_err.stop = 0;
+	priv->catas_err.map  = NULL;
+
 	addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) +
 		priv->fw.catas_offset;
 
 	priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
 	if (!priv->catas_err.map)
-		mlx4_warn(dev, "Failed to map catastrophic error buffer at 0x%lx\n",
+		mlx4_warn(dev, "Failed to map Internal Error buffer at 0x%lx\n",
 			  addr);
 
+	priv->catas_err.timer.data     = (unsigned long) dev;
+	priv->catas_err.timer.function = poll_catas;
+	priv->catas_err.timer.expires  = jiffies + MLX4_CATAS_POLL_INTERVAL;
+	INIT_LIST_HEAD(&priv->catas_err.list);
+	add_timer(&priv->catas_err.timer);
 }
 
-void mlx4_unmap_catas_buf(struct mlx4_dev *dev)
+void mlx4_stop_catas_poll(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
+	spin_lock_irq(&catas_lock);
+	priv->catas_err.stop = 1;
+	spin_unlock_irq(&catas_lock);
+
+	del_timer_sync(&priv->catas_err.timer);
+
 	if (priv->catas_err.map)
 		iounmap(priv->catas_err.map);
+
+	spin_lock_irq(&catas_lock);
+	list_del(&priv->catas_err.list);
+	spin_unlock_irq(&catas_lock);
+}
+
+int __init mlx4_catas_init(void)
+{
+	INIT_WORK(&catas_work, catas_reset);
+
+	catas_wq = create_singlethread_workqueue("mlx4_err");
+	if (!catas_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_catas_cleanup(void)
+{
+	destroy_workqueue(catas_wq);
 }
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index 27a82ce..a9841c6 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -283,7 +283,9 @@ static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr)
 
 static irqreturn_t mlx4_catas_interrupt(int irq, void *dev_ptr)
 {
-	mlx4_handle_catas_err(dev_ptr);
+	/* disable handling catas errors via interrupt. */
+	/* We now handle them via polling.              */
+	/* mlx4_handle_catas_err(dev_ptr);              */
 
 	/* MSI-X vectors always belong to us */
 	return IRQ_HANDLED;
diff --git a/drivers/net/mlx4/intf.c b/drivers/net/mlx4/intf.c
index 9ae951b..be5d9e9 100644
--- a/drivers/net/mlx4/intf.c
+++ b/drivers/net/mlx4/intf.c
@@ -142,6 +142,7 @@ int mlx4_register_device(struct mlx4_dev *dev)
 		mlx4_add_device(intf, priv);
 
 	mutex_unlock(&intf_mutex);
+	mlx4_start_catas_poll(dev);
 
 	return 0;
 }
@@ -151,6 +152,7 @@ void mlx4_unregister_device(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_interface *intf;
 
+	mlx4_stop_catas_poll(dev);
 	mutex_lock(&intf_mutex);
 
 	list_for_each_entry(intf, &intf_list, list)
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 41eafeb..297fe41 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -582,8 +582,6 @@ static int __devinit mlx4_setup_hca(struct mlx4_dev *dev)
 		goto err_pd_table_free;
 	}
 
-	mlx4_map_catas_buf(dev);
-
 	err = mlx4_init_eq_table(dev);
 	if (err) {
 		mlx4_err(dev, "Failed to initialize "
@@ -659,7 +657,6 @@ err_eq_table_free:
 	mlx4_cleanup_eq_table(dev);
 
 err_catas_buf:
-	mlx4_unmap_catas_buf(dev);
 	mlx4_cleanup_mr_table(dev);
 
 err_pd_table_free:
@@ -835,9 +832,6 @@ err_cleanup:
 	mlx4_cleanup_cq_table(dev);
 	mlx4_cmd_use_polling(dev);
 	mlx4_cleanup_eq_table(dev);
-
-	mlx4_unmap_catas_buf(dev);
-
 	mlx4_cleanup_mr_table(dev);
 	mlx4_cleanup_pd_table(dev);
 	mlx4_cleanup_uar_table(dev);
@@ -884,9 +878,6 @@ static void __devexit mlx4_remove_one(struct pci_dev *pdev)
 		mlx4_cleanup_cq_table(dev);
 		mlx4_cmd_use_polling(dev);
 		mlx4_cleanup_eq_table(dev);
-
-		mlx4_unmap_catas_buf(dev);
-
 		mlx4_cleanup_mr_table(dev);
 		mlx4_cleanup_pd_table(dev);
 
@@ -907,6 +898,12 @@ static void __devexit mlx4_remove_one(struct pci_dev *pdev)
 	}
 }
 
+int mlx4_restart_one(struct pci_dev *pdev)
+{
+	mlx4_remove_one(pdev);
+	return mlx4_init_one(pdev, NULL);
+}
+
 static struct pci_device_id mlx4_pci_table[] = {
 	{ PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */
 	{ PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */
@@ -927,6 +924,10 @@ static int __init mlx4_init(void)
 {
 	int ret;
 
+	ret = mlx4_catas_init();
+	if (ret)
+		return ret;
+
 	ret = pci_register_driver(&mlx4_driver);
 	return ret < 0 ? ret : 0;
 }
@@ -934,6 +935,7 @@ static int __init mlx4_init(void)
 static void __exit mlx4_cleanup(void)
 {
 	pci_unregister_driver(&mlx4_driver);
+	mlx4_catas_cleanup();
 }
 
 module_init(mlx4_init);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 3d3b6d2..d4e9111 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -247,7 +247,9 @@ struct mlx4_mcg_table {
 
 struct mlx4_catas_err {
 	u32 __iomem	       *map;
-	int			size;
+	u32			stop;
+	struct timer_list	timer;
+	struct list_head	list;
 };
 
 struct mlx4_priv {
@@ -310,9 +312,11 @@ void mlx4_cleanup_qp_table(struct mlx4_dev *dev);
 void mlx4_cleanup_srq_table(struct mlx4_dev *dev);
 void mlx4_cleanup_mcg_table(struct mlx4_dev *dev);
 
-void mlx4_map_catas_buf(struct mlx4_dev *dev);
-void mlx4_unmap_catas_buf(struct mlx4_dev *dev);
-
+void mlx4_start_catas_poll(struct mlx4_dev *dev);
+void mlx4_stop_catas_poll(struct mlx4_dev *dev);
+int mlx4_catas_init(void);
+void mlx4_catas_cleanup(void);
+int mlx4_restart_one(struct pci_dev *pdev);
 int mlx4_register_device(struct mlx4_dev *dev);
 void mlx4_unregister_device(struct mlx4_dev *dev);
 void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_event type,



More information about the general mailing list