[openib-general] [PATCH 9 of 20] ipath - core driver, part 2 of 4

Bryan O'Sullivan bos at pathscale.com
Wed Dec 28 16:31:28 PST 2005


Signed-off-by: Bryan O'Sullivan <bos at pathscale.com>

diff -r ddd21709e12c -r dad2e87e21f4 drivers/infiniband/hw/ipath/ipath_driver.c
--- a/drivers/infiniband/hw/ipath/ipath_driver.c	Wed Dec 28 14:19:42 2005 -0800
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c	Wed Dec 28 14:19:42 2005 -0800
@@ -1877,3 +1877,2004 @@
 
 	return ret;
 }
+
+/*
+ * Cancel a range of PIO buffers that may be armed but not yet
+ * triggered.  Used at init to ensure known buffer state, and at user
+ * process close, in case the process died while writing to a PIO buffer.
+ */
+
+static void ipath_disarm_piobufs(const ipath_type t, unsigned first,
+				 unsigned cnt)
+{
+	unsigned i, last = first + cnt;
+	uint64_t sendctrl;
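+	/*
+	 * each write disarms one buffer: the DISARM bit, plus the buffer
+	 * number in the DISARMPIOBUF field
+	 */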
+	for (i = first; i < last; i++) {
+		sendctrl = devdata[t].ipath_sendctrl | INFINIPATH_S_DISARM |
+		    (i << INFINIPATH_S_DISARMPIOBUF_SHIFT);
+		ipath_kput_kreg(t, kr_sendctrl, sendctrl);
+	}
+}
+
+static void ipath_clean_partkey(struct ipath_portdata * pd,
+				struct ipath_devdata * dd)
+{
+	int i, j, pchanged = 0;
+	uint64_t oldpkey;
+
+	/* for debugging only */
+	oldpkey =
+	    (uint64_t) dd->ipath_pkeys[0] | ((uint64_t) dd->
+					     ipath_pkeys[1] << 16)
+	    | ((uint64_t) dd->ipath_pkeys[2] << 32)
+	    | ((uint64_t) dd->ipath_pkeys[3] << 48);
+
+	for (i = 0; i < (sizeof(pd->port_pkeys) / sizeof(pd->port_pkeys[0]));
+	     i++) {
+		if (!pd->port_pkeys[i])
+			continue;
+		_IPATH_VDBG("look for key[%d] %hx in pkeys\n", i,
+			    pd->port_pkeys[i]);
+		for (j = 0;
+		     j < (sizeof(dd->ipath_pkeys) / sizeof(dd->ipath_pkeys[0]));
+		     j++) {
+			/* check for match independent of the global bit */
+			if ((dd->ipath_pkeys[j] & 0x7fff) ==
+			    (pd->port_pkeys[i] & 0x7fff)) {
+				if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
+					_IPATH_VDBG
+					    ("p%u clear key %x matches #%d\n",
+					     pd->port_port, pd->port_pkeys[i],
+					     j);
+					ipath_stats.sps_pkeys[j] =
+					    dd->ipath_pkeys[j] = 0;
+					pchanged++;
+				} else
+					_IPATH_VDBG
+					    ("p%u key %x matches #%d, but ref still %d\n",
+					     pd->port_port, pd->port_pkeys[i],
+					     j,
+					     atomic_read(&dd->
+							 ipath_pkeyrefs[j]));
+				break;
+			}
+		}
+		pd->port_pkeys[i] = 0;
+	}
+	if (pchanged) {
+		uint64_t pkey;
+		pkey =
+		    (uint64_t) dd->ipath_pkeys[0] | ((uint64_t) dd->
+						     ipath_pkeys[1] << 16)
+		    | ((uint64_t) dd->ipath_pkeys[2] << 32)
+		    | ((uint64_t) dd->ipath_pkeys[3] << 48);
+		_IPATH_VDBG("p%u old pkey reg %llx, new pkey reg %llx\n",
+			    pd->port_port, oldpkey, pkey);
+		ipath_kput_kreg(pd->port_unit, kr_partitionkey, pkey);
+	}
+}
+
+static unsigned int ipath_poll(struct file *fp, struct poll_table_struct *pt)
+{
+	int ret;
+	struct ipath_portdata *pd;
+
+	pd = port_fp(fp);
+	/* nothing for select/poll in this driver, at least for now */
+	ret = 0;
+
+	return ret;
+}
+
+/*
+ * Wait up to msecs milliseconds for an IB link state change to occur.
+ * For now, take the easy polling route.  Currently used only by the
+ * SMA ioctls.  Returns 0 if the state is reached, otherwise -ETIMEDOUT.
+ * state can have multiple bits set, for any of several transitions.
+ */
+
+int ipath_wait_linkstate(const ipath_type t, uint32_t state, int msecs)
+{
+	devdata[t].ipath_sma_state_wanted = state;
+	wait_event_interruptible_timeout(ipath_sma_state_wait,
+					 (devdata[t].ipath_flags & state),
+					 msecs_to_jiffies(msecs));
+	devdata[t].ipath_sma_state_wanted = 0;
+
+	if (!(devdata[t].ipath_flags & state))
+		_IPATH_DBG
+		    ("Didn't reach linkstate %s within %u ms (ibcc %llx %s)\n",
+		     /* test INIT ahead of DOWN, both can be set */
+		     (state & IPATH_LINKINIT) ? "INIT" :
+		     ((state & IPATH_LINKDOWN) ? "DOWN" :
+		      ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
+		     msecs, ipath_kget_kreg64(t, kr_ibcctrl),
+		     ipath_ibcstatus_str[ipath_kget_kreg64(t, kr_ibcstatus) &
+					 0xf]);
+	return (devdata[t].ipath_flags & state) ? 0 : -ETIMEDOUT;
+}
+
+/* unit number is already validated in ipath_ioctl() */
+static int ipath_kset_lid(uint32_t arg)
+{
+	unsigned unit = (arg >> 16) & 0xffff;
+
+	if (unit >= infinipath_max
+	    || !(devdata[unit].ipath_flags & IPATH_INITTED)) {
+		_IPATH_DBG("Invalid unit %u\n", unit);
+		return -ENODEV;
+	}
+
+	arg &= 0xffff;
+	_IPATH_SMADBG("Unit %u setting lid to 0x%x, was 0x%x\n", unit, arg,
+		      devdata[unit].ipath_lid);
+	ipath_set_sps_lid(unit, arg);
+	return 0;
+}
+
+static int ipath_kset_mlid(uint32_t arg)
+{
+	unsigned unit = (arg >> 16) & 0xffff;
+
+	if (unit >= infinipath_max
+	    || !(devdata[unit].ipath_flags & IPATH_INITTED)) {
+		_IPATH_DBG("Invalid unit %u\n", unit);
+		return -ENODEV;
+	}
+
+	arg &= 0xffff;
+	_IPATH_SMADBG("Unit %u setting mlid to 0x%x, was 0x%x\n", unit, arg,
+		      devdata[unit].ipath_mlid);
+	ipath_stats.sps_mlid[unit] = devdata[unit].ipath_mlid = arg;
+	if (devdata[unit].ipath_layer.l_intr)
+		devdata[unit].ipath_layer.l_intr(unit, IPATH_LAYER_INT_BCAST);
+	return 0;
+}
+
+/* unit number is in the incoming word, overwritten on return with data */
+
+static int ipath_get_devstatus(uint64_t __user *a)
+{
+	int ret;
+	uint64_t unit64;
+	uint32_t unit;
+	uint64_t devstatus;
+
+	if ((ret = copy_from_user(&unit64, a, sizeof unit64))) {
+		_IPATH_DBG("Failed to copy in unit: %d\n", ret);
+		return -EFAULT;
+	}
+	unit = unit64;
+	if (unit >= infinipath_max
+	    || !(devdata[unit].ipath_flags & IPATH_INITTED)) {
+		_IPATH_DBG("Invalid unit %u\n", unit);
+		return -ENODEV;
+	}
+
+	devstatus = *devdata[unit].ipath_statusp;
+
+	if ((ret = copy_to_user(a, &devstatus, sizeof devstatus))) {
+		_IPATH_DBG("Failed to copy out device status: %d\n", ret);
+		ret = -EFAULT;
+	}
+	return ret;
+}
+
+/* unit number is in the incoming word, overwritten on return with data */
+
+static int ipath_get_mlid(uint32_t __user *a)
+{
+	int ret;
+	uint32_t unit;
+	uint32_t mlid;
+
+	if ((ret = copy_from_user(&unit, a, sizeof unit))) {
+		_IPATH_DBG("Failed to copy in mlid: %d\n", ret);
+		return -EFAULT;
+	}
+	if (unit >= infinipath_max
+	    || !(devdata[unit].ipath_flags & IPATH_INITTED)) {
+		_IPATH_DBG("Invalid unit %u\n", unit);
+		return -ENODEV;
+	}
+
+	mlid = devdata[unit].ipath_mlid;
+
+	if ((ret = copy_to_user(a, &mlid, sizeof mlid))) {
+		_IPATH_DBG("Failed to copy out MLID: %d\n", ret);
+		ret = -EFAULT;
+	}
+	return ret;
+}
+
+static int ipath_kset_guid(struct ipath_setguid __user *a)
+{
+	struct ipath_setguid setguid;
+	int ret;
+
+	if ((ret = copy_from_user(&setguid, a, sizeof setguid))) {
+		_IPATH_DBG("Failed to copy in guid info: %d\n", ret);
+		return -EFAULT;
+	}
+	if (setguid.sunit >= infinipath_max ||
+	    !(devdata[setguid.sunit].ipath_flags & IPATH_INITTED)) {
+		_IPATH_DBG("Invalid unit %llu\n", setguid.sunit);
+		return -ENODEV;
+	}
+	if (setguid.sguid == 0ULL || setguid.sguid == -1LL) {
+		/*
+		 * use INFO, not DBG, because ipath_mux doesn't yet
+		 * complain about errors on this
+		 */
+
+		_IPATH_INFO("Ignoring attempt to set invalid GUID %llx\n",
+			    setguid.sguid);
+		return -EINVAL;
+	}
+	devdata[setguid.sunit].ipath_guid = setguid.sguid;
+	devdata[setguid.sunit].ipath_nguid = 1;
+	_IPATH_DBG("SMA set hardware GUID unit %llu to %llx (network order)\n",
+		   setguid.sunit, devdata[setguid.sunit].ipath_guid);
+	return 0;
+}
+
+/*
+ * receive an IB packet with QP 0 or 1.  For now, no timeout is implemented.
+ * We put the actual received count into the iov on return, and the unit we
+ * received from goes into the lower 16 bits of sps_flags.
+ * This receives from all/any of the active chips, and we currently do not
+ * allow specifying just one (we could, by filling in unit in the library
+ * before the syscall, and checking here).
+ */
+
+static int ipath_rcvsma_pkt(struct ipath_sendpkt __user *p)
+{
+	struct ipath_sendpkt rpkt;
+	int i, any, ret;
+	unsigned long flags;
+
+	if ((ret = copy_from_user(&rpkt, p, sizeof rpkt))) {
+		_IPATH_DBG("Failed to copy in pkt struct (%d)\n", ret);
+		return -EFAULT;
+	}
+	if (!ipath_sma_data_spare) {
+		_IPATH_DBG("can't do receive, sma not initialized\n");
+		return -ENETDOWN;
+	}
+
+	for (any = i = 0; i < infinipath_max; i++)
+		if (devdata[i].ipath_flags & IPATH_INITTED)
+			any++;
+	if (!any) {		/* no hardware, freeze, etc. */
+		_IPATH_SMADBG("Didn't find any initialized and usable chips\n");
+		return -ENODEV;
+	}
+
+	wait_event_interruptible(ipath_sma_wait,
+				 ipath_sma_data[ipath_sma_first].len);
+
+	spin_lock_irqsave(&ipath_sma_lock, flags);
+	if (ipath_sma_data[ipath_sma_first].len) {
+		int len;
+		uint32_t slen;
+		uint8_t *sdata;
+		struct _ipath_sma_rpkt *smpkt =
+		    &ipath_sma_data[ipath_sma_first];
+
+		/*
+		 * we swap out the buffer we are going to use with the
+		 * spare buffer and set spare to that buffer.  This code
+		 * is the only code that ever manipulates spare, other
+		 * than the initialization code.  This code should never
+		 * be entered by more than one process at a time, and
+		 * if it is, the user code doing so deserves what it gets;
+		 * it won't break anything in the driver by doing so.
+		 * We do it this way to avoid holding a lock across the
+		 * copy_to_user, which could fault, or delay a long time
+		 * while paging occurs; ditto for printks
+		 */
+
+		slen = smpkt->len;
+		sdata = smpkt->buf;
+		rpkt.sps_flags = smpkt->unit;
+		smpkt->buf = ipath_sma_data_spare;
+		ipath_sma_data_spare = sdata;
+		smpkt->len = 0;	/* it's available again */
+		if (++ipath_sma_first >= IPATH_NUM_SMAPKTS)
+			ipath_sma_first = 0;
+		spin_unlock_irqrestore(&ipath_sma_lock, flags);
+
+		len = min((uint32_t) rpkt.sps_iov[0].iov_len, slen);
+		ret = copy_to_user((void __user *) rpkt.sps_iov[0].iov_base,
+				   sdata, len);
+		_IPATH_VDBG("SMA packet (index=%d), len %d (actual %d) "
+			    "buf %p, ubuf %llx\n", ipath_sma_first, slen,
+			    len, sdata, rpkt.sps_iov[0].iov_base);
+		if (!ret) {
+			/* actual length read. */
+			rpkt.sps_iov[0].iov_len = len;
+			rpkt.sps_cnt = 1;	/* received one packet */
+			if ((ret = copy_to_user(p, &rpkt, sizeof rpkt))) {
+				_IPATH_DBG("Failed to copy out pkt struct "
+					   "(%d)\n", ret);
+				ret = -EFAULT;
+			}
+		} else {
+			_IPATH_DBG("copyout failed: %d\n", ret);
+			ret = -EFAULT;
+		}
+	} else {
+		/* usually means SMA process received a signal */
+		spin_unlock_irqrestore(&ipath_sma_lock, flags);
+		return -EAGAIN;
+	}
+
+	return ret;
+}
+
+/* unit number is in the first incoming word, overwritten on return with data */
+static int ipath_get_portinfo(uint32_t __user *a)
+{
+	int ret;
+	uint32_t unit, tmp, tmp2;
+	struct ipath_devdata *dd;
+	uint32_t portinfo[13];	/* just the data for Portinfo, in host order */
+
+	if ((ret = copy_from_user(&unit, a, sizeof unit))) {
+		_IPATH_DBG("Failed to copy in portinfo: %d\n", ret);
+		return -EFAULT;
+	}
+	if (unit >= infinipath_max
+	    || !(devdata[unit].ipath_flags & IPATH_INITTED)) {
+		_IPATH_DBG("Invalid unit %u\n", unit);
+		return -ENODEV;
+	}
+	dd = &devdata[unit];
+	/* so we only initialize non-zero fields. */
+	memset(portinfo, 0, sizeof portinfo);
+
+	/*
+	 * Notimpl yet M_Key (64)
+	 * Notimpl yet GID (64)
+	 */
+
+	portinfo[4] = (dd->ipath_lid << 16);
+
+	/*
+	 * Notimpl yet SMLID (should we store this in the driver, in
+	 * case SMA dies?)
+	 * CapabilityMask is 0, we don't support any of these
+	 * DiagCode is 0; we don't store any diag info for now
+	 * Notimpl yet M_KeyLeasePeriod (we don't support M_Key)
+	 */
+
+	/* LocalPortNum is whichever port number they ask for */
+	portinfo[7] = (unit << 24)
+	    /* LinkWidthEnabled */
+	    |(2 << 16)
+	    /* LinkWidthSupported (really 2, but that's not IB valid...) */
+	    |(3 << 8)
+	    /* LinkWidthActive */
+	    |(2 << 0);
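+	/*
+	 * map the low byte of ibcstatus to the IB PortState: 0x11 is INIT,
+	 * 0x21 ARMED, 0x31 ACTIVE; anything else is reported as down
+	 */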
+	tmp = dd->ipath_lastibcstat & 0xff;
+	tmp2 = 5;
+	if (tmp == 0x11)
+		tmp = 2;
+	else if (tmp == 0x21)
+		tmp = 3;
+	else if (tmp == 0x31)
+		tmp = 4;
+	else {
+		tmp = 0;	/* down */
+		tmp2 = tmp & 0xf;
+	}
+	portinfo[8] = (1 << 28)	/* LinkSpeedSupported */
+	    |(tmp << 24)	/* PortState */
+	    |(tmp2 << 20)	/* PortPhysicalState */
+	    |(2 << 16)
+
+	    /* LinkDownDefaultState */
+	    /* M_KeyProtectBits == 0 */
+	    /* NotImpl yet LMC == 0 (we can support all values) */
+	    |(1 << 4)		/* LinkSpeedActive */
+	    |(1 << 0);		/* LinkSpeedEnabled */
+	switch (dd->ipath_ibmtu) {
+	case 4096:
+		tmp = 5;
+		break;
+	case 2048:
+		tmp = 4;
+		break;
+	case 1024:
+		tmp = 3;
+		break;
+	case 512:
+		tmp = 2;
+		break;
+	case 256:
+		tmp = 1;
+		break;
+	default:		/* oops, something is wrong */
+		_IPATH_DBG
+		    ("Problem, ipath_ibmtu 0x%x not a valid IB MTU, treat as 2048\n",
+		     dd->ipath_ibmtu);
+		tmp = 4;
+		break;
+	}
+	portinfo[9] = (tmp << 28)
+	    /* NeighborMTU */
+	    /* Notimpl MasterSMSL */
+	    |(1 << 20)
+
+	    /* VLCap */
+	    /* Notimpl InitType (actually, an SMA decision) */
+	    /* VLHighLimit is 0 (only one VL) */
+	    ;			/* VLArbitrationHighCap is 0 (only one VL) */
+	portinfo[10] =		/* VLArbitrationLowCap is 0 (only one VL) */
+	    /* InitTypeReply is SMA decision */
+	    (5 << 16)		/* MTUCap 4096 */
+	    |(7 << 13)		/* VLStallCount */
+	    |(0x1f << 8)	/* HOQLife */
+	    |(1 << 4)		/* OperationalVLs 0 */
+
+	    /* PartitionEnforcementInbound */
+	    /* PartitionEnforcementOutbound not enforced */
+	    /* FilterRawinbound not enforced */
+	    ;			/* FilterRawOutbound not enforced */
+	/* M_KeyViolations are not counted by hardware, SMA can count */
+	tmp = ipath_kget_creg32(unit, cr_errpkey);
+	/* P_KeyViolations are counted by hardware. */
+	portinfo[11] = ((tmp & 0xffff) << 0);
+	portinfo[12] =
+	    /* Q_KeyViolations are not counted by hardware */
+	    (1 << 8)
+
+	    /* GUIDCap */
+	    /* SubnetTimeOut handled by SMA */
+	    /* RespTimeValue handled by SMA */
+	    ;
+	/* LocalPhyErrors are programmed to max */
+	portinfo[12] |= (0xf << 20)
+	    |(0xf << 16)	/* OverRunErrors are programmed to max */
+	    ;
+
+	if ((ret = copy_to_user(a, portinfo, sizeof portinfo))) {
+		_IPATH_DBG("Failed to copy out portinfo: %d\n", ret);
+		ret = -EFAULT;
+	}
+	return ret;
+}
+
+/* unit number is in the first incoming word, overwritten on return with data */
+static int ipath_get_nodeinfo(uint32_t __user *a)
+{
+	int ret;
+	uint32_t unit;
+	struct ipath_devdata *dd;
+	uint32_t nodeinfo[10];	/* just the data for Nodeinfo, in host order */
+
+	if ((ret = copy_from_user(&unit, a, sizeof unit))) {
+		_IPATH_DBG("Failed to copy in nodeinfo: %d\n", ret);
+		return -EFAULT;
+	}
+	if (unit >= infinipath_max
+	    || !(devdata[unit].ipath_flags & IPATH_INITTED)) {
+		/* VDBG because sma normally probes for all possible units */
+		_IPATH_VDBG("Invalid unit %u\n", unit);
+		return -ENODEV;
+	}
+	dd = &devdata[unit];
+
+	/* so we only initialize non-zero fields. */
+	memset(nodeinfo, 0, sizeof nodeinfo);
+
+	nodeinfo[0] =		/* BaseVersion is SMA */
+	    /* ClassVersion is SMA */
+	    (1 << 8)		/* NodeType  */
+	    |(1 << 0);		/* NumPorts */
+	nodeinfo[1] = (uint32_t) (dd->ipath_guid >> 32);
+	nodeinfo[2] = (uint32_t) (dd->ipath_guid & 0xffffffff);
+	nodeinfo[3] = nodeinfo[1];	/* PortGUID == SystemImageGUID for us */
+	nodeinfo[4] = nodeinfo[2];	/* PortGUID == SystemImageGUID for us */
+	nodeinfo[5] = nodeinfo[3];	/* PortGUID == NodeGUID for us */
+	nodeinfo[6] = nodeinfo[4];	/* PortGUID == NodeGUID for us */
+	nodeinfo[7] = (4 << 16)	/* we support 4 pkeys */
+	    |(dd->ipath_deviceid << 0);
+	/* our chip version as 16 bits major, 16 bits minor */
+	nodeinfo[8] = dd->ipath_minrev | (dd->ipath_majrev << 16);
+	nodeinfo[9] = (unit << 24) | (dd->ipath_vendorid << 0);
+
+	if ((ret = copy_to_user(a, nodeinfo, sizeof nodeinfo))) {
+		_IPATH_DBG("Failed to copy out nodeinfo: %d\n", ret);
+		ret = -EFAULT;
+	}
+	return ret;
+}
+
+static int ipath_sma_ioctl(struct file *fp, unsigned int cmd, unsigned long a)
+{
+	int ret = 0;
+	switch (cmd) {
+	case IPATH_SEND_SMA_PKT:	/* send SMA packet */
+		if (!(ret = ipath_send_smapkt((struct ipath_sendpkt __user *) a)))
+			/* another SMA packet sent */
+			ipath_stats.sps_sma_spkts++;
+		break;
+	case IPATH_RCV_SMA_PKT:	/* receive an SMA or MAD packet */
+		ret = ipath_rcvsma_pkt((struct ipath_sendpkt __user *) a);
+		break;
+	case IPATH_SET_LID:	/* set our lid, (SMA) */
+		ret = ipath_kset_lid((uint32_t) a);
+		break;
+	case IPATH_SET_MTU:	/* set the IB mtu (not maxpktlen) (SMA) */
+		ret = ipath_kset_mtu((uint32_t) a);
+		break;
+	case IPATH_SET_LINKSTATE:
+		/* walk through the linkstate states (SMA) */
+		ret = ipath_kset_linkstate((uint32_t) a);
+		break;
+	case IPATH_GET_PORTINFO:	/* get the SMA portinfo */
+		ret = ipath_get_portinfo((uint32_t __user *) a);
+		break;
+	case IPATH_GET_NODEINFO:	/* get the SMA nodeinfo */
+		ret = ipath_get_nodeinfo((uint32_t __user *) a);
+		break;
+	case IPATH_SET_GUID:
+		/*
+		 * set our guid, (SMA).  This is not normally
+		 * used, but provides a way to set the GUID when the i2c flash
+		 * has a problem, or for special testing.
+		 */
+		ret = ipath_kset_guid((struct ipath_setguid __user *) a);
+		break;
+	case IPATH_SET_MLID:	/* set multicast LID for ipath broadcast */
+		ret = ipath_kset_mlid((uint32_t) a);
+		break;
+	case IPATH_GET_MLID:	/* get multicast LID for ipath broadcast */
+		ret = ipath_get_mlid((uint32_t __user *) a);
+		break;
+	case IPATH_GET_DEVSTATUS:	/* get device status */
+		ret = ipath_get_devstatus((uint64_t __user *) a);
+		break;
+	default:
+		_IPATH_DBG("%x not a valid SMA ioctl for infinipath\n", cmd);
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static int ipath_get_unit_counters(struct infinipath_getunitcounters __user *a)
+{
+	struct infinipath_getunitcounters c;
+
+	if (copy_from_user(&c, a, sizeof c))
+		return -EFAULT;
+
+	if (c.unit >= infinipath_max ||
+	   !(devdata[c.unit].ipath_flags & IPATH_PRESENT))
+		return -ENODEV;
+
+	return ipath_get_counters(c.unit,
+				  (struct infinipath_counters __user *) c.data);
+}
+
+/*
+ * ioctls for the control device, which is useful when you don't want
+ * to open the main device and use up a port.
+ */
+
+static int ipath_ctrl_ioctl(struct file *fp, unsigned int cmd, unsigned long a)
+{
+	int ret = 0;
+
+	switch (cmd) {
+	case IPATH_GETSTATS:		/* return driver stats */
+		ret = ipath_get_stats((struct infinipath_stats __user *) a);
+		break;
+	case IPATH_GETUNITCOUNTERS:	/* return chip counters */
+		ret = ipath_get_unit_counters((struct infinipath_getunitcounters __user *) a);
+		break;
+	default:
+		_IPATH_DBG("%x not a valid CTRL ioctl for infinipath\n", cmd);
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+long ipath_ioctl(struct file *fp, unsigned int cmd, unsigned long a)
+{
+	int ret = 0;
+	struct ipath_portdata *pd;
+	ipath_type unit;
+	uint32_t tmp, i, nactive = 0;
+
+	if (cmd == IPATH_GETUNITS) {
+		/*
+		 * Return number of units supported.  This is called
+		 * here as this ioctl is needed via both the normal and
+		 * diags interface, and it does not need the device to
+		 * be opened.
+		 */
+		return ipath_get_units();
+	}
+
+	pd = port_fp(fp);
+	if (!pd) {
+		if (IPATH_SMA == (unsigned long)fp->private_data)
+			/* sma separate; no pd */
+			return (long)ipath_sma_ioctl(fp, cmd, a);
+#ifdef IPATH_DIAG
+		else if (IPATH_DIAG == (unsigned long)fp->private_data)
+			/* diags separate; no pd */
+			return (long)ipath_diags_ioctl(fp, cmd, a);
+#endif
+		else if (IPATH_CTRL == (unsigned long)fp->private_data)
+			/* ctrl separate; no pd */
+			return (long)ipath_ctrl_ioctl(fp, cmd, a);
+		else {
+			_IPATH_DBG("NULL pd from fp (%p), cmd=%x\n", fp, cmd);
+			return -ENODEV;	/* bad; shouldn't ever happen */
+		}
+	}
+
+	unit = pd->port_unit;
+
+	if ((devdata[unit].ipath_flags & IPATH_PRESENT)
+	    && (cmd == IPATH_GETCOUNTERS || cmd == IPATH_GETSTATS
+		|| cmd == IPATH_READ_EEPROM || cmd == IPATH_WRITE_EEPROM)) {
+		/* allowed to do these, as long as chip is accessible */
+	} else if (!(devdata[unit].ipath_flags & IPATH_INITTED)) {
+		_IPATH_DBG
+		    ("%s not initialized (flags=0x%x), failing ioctl #%u\n",
+		     ipath_get_unit_name(unit), devdata[unit].ipath_flags,
+		     _IOC_NR(cmd));
+		ret = -ENODEV;
+	} else
+	    if ((devdata[unit].
+		 ipath_flags & (IPATH_LINKDOWN | IPATH_LINKUNK))) {
+		_IPATH_DBG("%s link is down, failing ioctl #%u\n",
+			   ipath_get_unit_name(unit), _IOC_NR(cmd));
+		ret = -ENETDOWN;
+	}
+
+	if (ret)
+		return ret;
+
+	switch (cmd) {
+	case IPATH_USERINIT:
+		/* real application is starting on a port */
+		ret = ipath_do_user_init(pd, (struct ipath_user_info __user *) a);
+		break;
+	case IPATH_BASEINFO:
+		/* it's done the init, now return the info it needs */
+		ret = ipath_get_baseinfo(pd, (struct ipath_base_info __user *) a);
+		break;
+	case IPATH_GETPORT:
+		/*
+		 * just return the unit:port that we were assigned,
+		 * and the number of active chips.  This is used for
+		 * doing sched_setaffinity() before initialization.
+		 */
+		for (i = 0; i < infinipath_max; i++)
+			if ((devdata[i].ipath_flags & IPATH_PRESENT)
+			    && devdata[i].ipath_kregbase
+			    && devdata[i].ipath_lid
+			    && !(devdata[i].ipath_flags &
+				 (IPATH_LINKDOWN | IPATH_LINKUNK)))
+				nactive++;
+		tmp = (nactive << 24) | (unit << 16) | pd->port_port;
+		if (copy_to_user((void __user *) a, &tmp, sizeof(tmp)))
+			ret = -EFAULT;
+		break;
+	case IPATH_GETLID:
+		/* get LID for given unit # */
+		ret = ipath_layer_get_lid(a);
+		break;
+	case IPATH_UPDM_TID:	/* update expected TID entries */
+		ret = ipath_tid_update(pd, (struct _tidupd __user *) a);
+		break;
+	case IPATH_FREE_TID:	/* free expected TID entries */
+		ret = ipath_tid_free(pd, (struct _tidupd __user *) a);
+		break;
+	case IPATH_GETCOUNTERS:	/* return chip counters */
+		ret = ipath_get_counters(unit, (struct infinipath_counters __user *) a);
+		break;
+	case IPATH_GETSTATS:	/* return driver stats */
+		ret = ipath_get_stats((struct infinipath_stats __user *) a);
+		break;
+	case IPATH_GETUNITCOUNTERS:	/* return chip counters */
+		ret = ipath_get_unit_counters(
+			(struct infinipath_getunitcounters __user *) a);
+		break;
+	case IPATH_SET_PKEY:	/* set a partition key */
+		ret = ipath_set_partkey(pd, (uint16_t) a);
+		break;
+	case IPATH_RCVCTRL:	/* error handling to manage the rcvq */
+		ret = ipath_manage_rcvq(pd, (uint16_t) a);
+		break;
+	case IPATH_WRITE_EEPROM:
+		/* write the eeprom (for GUID) */
+		ret = ipath_wr_eeprom(pd,
+				      (struct ipath_eeprom_req __user *) a);
+		break;
+	case IPATH_READ_EEPROM:	/* read the eeprom (for GUID) */
+		ret = ipath_rd_eeprom(pd->port_unit,
+				      (struct ipath_eeprom_req __user *) a);
+		break;
+	case IPATH_WAIT:
+		/*
+		 * wait for a receive intr for this port, or PIO avail
+		 */
+		ret = ipath_wait_intr(pd, (uint32_t) a);
+		break;
+
+	default:
+		_IPATH_DBG("cmd %x (%c,%u) not a valid ioctl\n", cmd,
+			   _IOC_TYPE(cmd), _IOC_NR(cmd));
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static loff_t ipath_llseek(struct file *fp, loff_t off, int whence)
+{
+	loff_t ret;
+
+	/* range checking is done where offset is used, not here. */
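+	/* only SEEK_SET (0) and SEEK_CUR (1) are supported */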
+	down(&fp->f_dentry->d_inode->i_sem);
+	if (!whence)
+		ret = fp->f_pos = off;
+	else if (whence == 1) {
+		fp->f_pos += off;
+		ret = fp->f_pos;
+	} else
+		ret = -EINVAL;
+	up(&fp->f_dentry->d_inode->i_sem);
+	_IPATH_DBG("New offset %llx from seek %llx whence=%d\n", fp->f_pos, off,
+		   whence);
+
+	return ret;
+}
+
+/*
+ * We use this to have a shared buffer between the kernel and the user
+ * code for the rcvhdr queue, egr buffers, and the per-port user regs and pio
+ * buffers in the chip.  We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+
+static struct vm_operations_struct ipath_vmops = {
+	.nopage = ipath_nopage,
+};
+
+static int ipath_mmap(struct file *fp, struct vm_area_struct *vm)
+{
+	int setlen = 0, ret = -EINVAL;
+	struct ipath_portdata *pd;
+
+	if (fp->private_data && 255UL < (unsigned long)fp->private_data) {
+		pd = port_fp(fp);
+		{
+			/*
+			 * This is the ipath_do_user_init() code,
+			 * mapping the shared buffers into the user
+			 * process. The address referred to by vm_pgoff
+			 * is the virtual, not physical, address; we only
+			 * do one mmap for each space mapped.
+			 */
+			uint64_t pgaddr, ureg;
+
+			pgaddr = vm->vm_pgoff << PAGE_SHIFT;
+
+			/*
+			 * note that ureg does *NOT* have the kregvirt
+			 * as part of it, to be sure that for 32 bit
+			 * programs, we don't end up trying to map
+			 * a > 44 address.  Has to match ipath_get_baseinfo()
+			 * code that sets __spi_uregbase
+			 */
+
+			ureg = devdata[pd->port_unit].ipath_uregbase +
+			    devdata[pd->port_unit].ipath_palign * pd->port_port;
+
+			_IPATH_MMDBG
+			    ("ushare: pgaddr %llx vm_start=%lx, vmlen %lx\n",
+			     pgaddr, vm->vm_start, vm->vm_end - vm->vm_start);
+
+			if (pgaddr == ureg) {
+				/* it's the real hardware, so io_remap works */
+				unsigned long phys;
+				if ((vm->vm_end - vm->vm_start) > PAGE_SIZE) {
+					_IPATH_INFO
+					    ("FAIL mmap userreg: reqlen %lx > PAGE\n",
+					     vm->vm_end - vm->vm_start);
+					ret = -EFAULT;
+				} else {
+					phys =
+					    devdata[pd->port_unit].
+					    ipath_physaddr + ureg;
+					vm->vm_page_prot =
+					    pgprot_noncached(vm->vm_page_prot);
+
+					vm->vm_flags |=
+					    VM_DONTCOPY | VM_DONTEXPAND | VM_IO
+					    | VM_SHM | VM_LOCKED;
+					ret =
+			io_remap_pfn_range(vm, vm->vm_start, phys >> PAGE_SHIFT,
+					   vm->vm_end - vm->vm_start,
+					   vm->vm_page_prot);
+				}
+			} else if (pgaddr == pd->port_piobufs) {
+				/*
+				 * We use io_remap, so there is not a
+				 * nopage handler for this case!
+				 * when we map the PIO buffers, we want
+				 * to map them as writeonly, no read possible.
+				 */
+
+				unsigned long phys;
+				if ((vm->vm_end - vm->vm_start) >
+				    (devdata[pd->port_unit].ipath_pbufsport *
+				     devdata[pd->port_unit].ipath_palign)) {
+					_IPATH_INFO
+					    ("FAIL mmap piobufs: reqlen %lx > PAGE\n",
+					     vm->vm_end - vm->vm_start);
+					ret = -EFAULT;
+				} else {
+					phys =
+					    devdata[pd->port_unit].
+					    ipath_physaddr + pd->port_piobufs;
+					/*
+					 * Do *NOT* mark this as
+					 * non-cached (PWT bit), or we
+					 * don't get the write combining
+					 * behavior we want on the
+					 * PIO buffers!
+					 * vm->vm_page_prot = pgprot_noncached(vm->vm_page_prot);
+					 */
+
+#if defined (pgprot_writecombine) && defined(_PAGE_MA_WC)
+					/* Enable WC */
+					vm->vm_page_prot =
+					    pgprot_writecombine(vm->
+								vm_page_prot);
+#endif
+
+					if (vm->vm_flags & VM_READ) {
+						_IPATH_INFO
+						    ("Can't map piobufs as readable (flags=%lx)\n",
+						     vm->vm_flags);
+						ret = -EPERM;
+					} else {
+						/*
+						 * don't allow them to
+						 * later change to readable
+						 * with mprotect
+						 */
+
+						vm->vm_flags &= ~VM_MAYWRITE;
+
+						vm->vm_flags |=
+						    VM_DONTCOPY | VM_DONTEXPAND
+						    | VM_IO | VM_SHM |
+						    VM_LOCKED;
+						ret =
+			io_remap_pfn_range(vm, vm->vm_start, phys >> PAGE_SHIFT,
+					   vm->vm_end - vm->vm_start,
+					   vm->vm_page_prot);
+					}
+				}
+			} else if (pgaddr == (uint64_t) pd->port_rcvegr_phys) {
+				if (!pd->port_rcvegrbuf_virt)
+					return -EFAULT;
+				/*
+				 * page_alloc'ed egr memory, not
+				 * physically contiguous
+				 * *BUT* to work around the 32 bit mmap64
+				 * only handling 44 bits, we have remapped
+				 * the first page to kernel virtual, so
+				 * we have to do the conversion here to
+				 * get back to the original virtual
+				 * address (not contig pages) so we have
+				 * to mark this for special handling.
+				 */
+
+				/*
+				 * not egrbufs * egrsize since they are
+				 * no longer virtually contiguous.
+				 */
+				setlen = pd->port_rcvegrbuf_chunks * PAGE_SIZE *
+				    (1 << pd->port_rcvegrbuf_order);
+				if ((vm->vm_end - vm->vm_start) > setlen) {
+					_IPATH_INFO
+					    ("FAIL on egr bufs: reqlen %lx > actual %x\n",
+					     vm->vm_end - vm->vm_start, setlen);
+					ret = -EFAULT;
+				} else {
+					vm->vm_ops = &ipath_vmops;
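+					/*
+					 * tag the pd pointer with 3 in the
+					 * low bits so ipath_nopage() knows
+					 * this is the eager buffer case;
+					 * the pointer is aligned, so the
+					 * low bits are otherwise zero
+					 */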
+					vm->vm_private_data =
+					    (void *)(3 | (uint64_t) pd);
+					if (vm->vm_flags & VM_WRITE) {
+						_IPATH_INFO
+						    ("Can't map eager buffers as writable (flags=%lx)\n",
+						     vm->vm_flags);
+						ret = -EPERM;
+					} else {
+						/*
+						 * don't allow them to
+						 * later change to writeable
+						 * with mprotect
+						 */
+
+						vm->vm_flags &= ~VM_MAYWRITE;
+						_IPATH_MMDBG
+						    ("egrbufs, set private to %p, not %llx\n",
+						     vm->vm_private_data,
+						     pgaddr);
+						ret = 0;
+					}
+				}
+			} else if (pgaddr == (uint64_t) pd->port_rcvhdrq_phys) {
+				/*
+				 * kmalloc'ed memory, physically
+				 * contiguous; this is from
+				 * spi_rcvhdr_base; we allow user to
+				 * map read-write so they can write
+				 * hdrq entries to allow protocol code
+				 * to directly poll whether a hdrq entry
+				 * has been written.
+				 */
+				setlen = ALIGN(devdata[pd->port_unit].
+					       ipath_rcvhdrcnt *
+					       devdata[pd->port_unit].
+					       ipath_rcvhdrentsize *
+					       sizeof(uint32_t), PAGE_SIZE);
+				if ((vm->vm_end - vm->vm_start) > setlen) {
+					_IPATH_INFO
+					    ("FAIL on rcvhdrq: reqlen %lx > actual %x\n",
+					     vm->vm_end - vm->vm_start, setlen);
+					ret = -EFAULT;
+				} else {
+					vm->vm_ops = &ipath_vmops;
+					vm->vm_private_data =
+					    (void *)(pgaddr | 1);
+					ret = 0;
+				}
+			}
+			/*
+			 * when we map the PIO bufferavail registers,
+			 * we want to map them as readonly, no write
+			 * possible.
+			 */
+			else if (pgaddr == devdata[pd->port_unit].ipath_pioavailregs_phys) {
+				/*
+				 * kmalloc'ed memory, physically
+				 * contiguous, one page only, readonly
+				 */
+				setlen = PAGE_SIZE;
+				if ((vm->vm_end - vm->vm_start) > setlen) {
+					_IPATH_INFO
+					    ("FAIL on pioavailregs_dma: reqlen %lx > actual %x\n",
+					     vm->vm_end - vm->vm_start, setlen);
+					ret = -EFAULT;
+				} else if (vm->vm_flags & VM_WRITE) {
+					_IPATH_INFO
+					    ("Can't map pioavailregs as writable (flags=%lx)\n",
+					     vm->vm_flags);
+					ret = -EPERM;
+				} else {
+					/*
+					 * don't allow them to later
+					 * change with mprotect
+					 */
+					vm->vm_flags &= ~VM_MAYWRITE;
+					vm->vm_ops = &ipath_vmops;
+					vm->vm_private_data =
+					    (void *)(pgaddr | 2);
+					ret = 0;
+				}
+			}
+			if (!ret && setlen) {
+				/* keep page(s) from being swapped, etc. */
+				vm->vm_flags |=
+				    VM_DONTEXPAND | VM_DONTCOPY | VM_RESERVED |
+				    VM_IO | VM_SHM;
+			} else {
+				/* failure, or io_remap case */
+				vm->vm_private_data = NULL;
+				if (ret)
+					_IPATH_INFO
+					    ("Failure %d, setlen %d, on addr %lx, off %lx\n",
+					     ret, setlen, vm->vm_start,
+					     vm->vm_pgoff);
+			}
+		}
+	} else			/* something very wrong */
+		_IPATH_INFO("fp_private wasn't set, no mmaping\n");
+
+	return ret;
+}
+
+/* page fault handler.  For each page that is first faulted in from the
+ * mmap'ed shared address buffer, this routine is called.
+ * It's always for a single page.
+ * We use the low bits of the private_data field to tell us which case
+ * we are dealing with.
+ */
+
+static struct page *ipath_nopage(struct vm_area_struct *vma, unsigned long addr,
+				 int *type)
+{
+	unsigned long avirt;	/* the original [kv]malloc virtual address */
+	unsigned long paddr;	/* physical address */
+	unsigned long off;	/* calculated page offset */
+	uint32_t which, chunk;
+	void *vaddr = NULL;
+	struct ipath_portdata *pd;
+	struct page *vpage = NOPAGE_SIGBUS;
+
+	if (!(avirt = (unsigned long)vma->vm_private_data)) {
+		_IPATH_DBG("NULL private_data, vm_pgoff %lx\n", vma->vm_pgoff);
+		which = 0;	/* quiet incorrect gcc warning */
+		goto done;
+	}
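+	/*
+	 * the low 2 bits of private_data encode which mapping this is, as
+	 * set up in ipath_mmap(): 1 = rcvhdrq, 2 = PIO bufferavail regs
+	 * (high bits are the physical address), 3 = eager buffers (high
+	 * bits are the portdata pointer)
+	 */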
+	which = avirt & 3;
+	avirt &= ~3ULL;
+
+	if (addr > vma->vm_end) {
+		_IPATH_DBG("trying to fault in addr %lx past end\n", addr);
+		goto done;
+	}
+
+	/*
+	 * most of our memory is vmalloc'ed, but rcvhdr Q is physically
+	 * contiguous, either from kmalloc or alloc_pages()
+	 * pgoff is virtual.
+	 */
+	switch (which) {
+	case 1:		/* rcvhdrq_phys */
+		/* should always be 0 */
+		off = vma->vm_pgoff - (avirt >> PAGE_SHIFT);
+		paddr = addr - vma->vm_start + (off << PAGE_SHIFT) + avirt;
+		_IPATH_MMDBG("hdrq %lx (u=%lx)\n", paddr, addr);
+		vpage = pfn_to_page(paddr >> PAGE_SHIFT);
+		break;
+	case 2:		/* PIO buffer avail regs */
+		/* should always be 0 */
+		off = vma->vm_pgoff - (avirt >> PAGE_SHIFT);
+		paddr = (addr - vma->vm_start + (off << PAGE_SHIFT) + avirt);
+		_IPATH_MMDBG("pioav %lx\n", paddr);
+		vpage = pfn_to_page(paddr >> PAGE_SHIFT);
+		break;
+	case 3:
+		/*
+		 * rcvegrbufs; page_alloc()'ed like rcvhdrq, but we
+		 * have to pick out which page_alloc()'ed chunk it is.
+		 */
+		pd = (struct ipath_portdata *) avirt;
+		/* this should always be 0 */
+		off =
+		    vma->vm_pgoff -
+		    ((unsigned long)pd->port_rcvegr_phys >> PAGE_SHIFT);
+		off = (addr - vma->vm_start + (off << PAGE_SHIFT));
+
+		chunk = off / (PAGE_SIZE * (1 << pd->port_rcvegrbuf_order));
+		if (chunk > pd->port_rcvegrbuf_chunks)
+			_IPATH_DBG("Bad egrbuf chunk %u (max %u); off = %lx\n",
+				   chunk, pd->port_rcvegrbuf_chunks, off);
+		vaddr = pd->port_rcvegrbuf_virt[chunk] +
+		    off % (PAGE_SIZE * (1 << pd->port_rcvegrbuf_order));
+		paddr = virt_to_phys(vaddr);
+		vpage = pfn_to_page(paddr >> PAGE_SHIFT);
+		_IPATH_MMDBG("egrb %p,%lx\n", vaddr, paddr);
+		break;
+	default:
+		_IPATH_DBG
+		    ("trying to fault in mmap addr %lx (avirt %lx) that isn't known (case %u)\n",
+		     addr, avirt, which);
+	}
+
+done:
+	if (vpage != NOPAGE_SIGBUS && vpage != NOPAGE_OOM) {
+		if (which == 2)
+			/*
+			 * media/video/video-buf.c doesn't do get_page() for
+			 * buffer from alloc_page().  Hmmm.
+			 *
+			 * keep it from being swapped, complaints if
+			 * process exits before we [vf]free it, etc,
+			 * and keep shared page counts correct, etc.
+			 */
+			get_page(vpage);
+		mark_page_accessed(vpage);
+		if (type)
+			*type = VM_FAULT_MINOR;
+	} else
+		_IPATH_DBG("faultin of addr %lx vaddr %p avirt %lx failed\n",
+			   addr, vaddr, avirt);
+
+	return vpage;
+}
+
+/* this is separate to allow for better optimization of ipath_intr() */
+
+static void ipath_bad_intr(const ipath_type t, uint32_t * unexpectp)
+{
+	struct ipath_devdata *dd = &devdata[t];
+
+	/*
+	 * interrupts sometimes happen during driver init and unload;
+	 * we don't want to process any at that point
+	 */
+
+	/* this is just a bandaid, not a fix, if something goes badly wrong */
+	if (++*unexpectp > 100) {
+		if (*unexpectp > 105) {
+			/*
+			 * ok, we must be taking somebody else's interrupts,
+			 * due to a messed up mptable and/or PIRQ table, so
+			 * unregister the interrupt.  We've seen this
+			 * during linuxbios development work, and it
+			 * may happen in the future again.
+			 */
+			if (dd->pcidev && dd->pcidev->irq) {
+				_IPATH_UNIT_ERROR(t,
+						  "Now %u unexpected interrupts, unregistering interrupt handler\n",
+						  *unexpectp);
+				_IPATH_DBG("free_irq of irq %x\n",
+					   dd->pcidev->irq);
+				free_irq(dd->pcidev->irq, dd);
+				dd->pcidev->irq = 0;
+			}
+		}
+		if (ipath_kget_kreg32(t, kr_intmask)) {
+			_IPATH_UNIT_ERROR(t,
+					  "%u unexpected interrupts, disabling interrupts completely\n",
+					  *unexpectp);
+			/* disable all interrupts, something is very wrong */
+			ipath_kput_kreg(t, kr_intmask, 0ULL);
+		}
+	} else if (*unexpectp > 1)
+		_IPATH_DBG
+		    ("Interrupt when not ready, should not happen, ignoring\n");
+}
+
+/* separate routine, for better optimization of ipath_intr() */
+
+static void ipath_bad_regread(const ipath_type t)
+{
+	static int allbits;
+	struct ipath_devdata *dd = &devdata[t];
+
+	/*
+	 * We print the message and disable interrupts, in hope of
+	 * having a better chance of debugging the problem.
+	 */
+	_IPATH_UNIT_ERROR(t,
+			  "Read of interrupt status failed (all bits set)\n");
+	if (allbits++) {
+		/* disable all interrupts, something is very wrong */
+		ipath_kput_kreg(t, kr_intmask, 0ULL);
+		if (allbits == 2) {
+			_IPATH_UNIT_ERROR(t,
+					  "Still bad interrupt status, unregistering interrupt\n");
+			free_irq(dd->pcidev->irq, dd);
+			dd->pcidev->irq = 0;
+		} else if (allbits > 2) {
+			if ((allbits % 10000) == 0)
+				printk(".");
+		} else
+			_IPATH_UNIT_ERROR(t,
+					  "Disabling interrupts, multiple errors\n");
+	}
+}
+
+static irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs)
+{
+	struct ipath_devdata *dd = data;
+	const ipath_type t = IPATH_UNIT(dd);
+	uint32_t istat = ipath_kget_kreg32(t, kr_intstatus);
+	uint64_t estat = 0;
+	static unsigned unexpected = 0;
+
+	if (unlikely(!istat)) {
+		ipath_stats.sps_nullintr++;
+		/* not our interrupt, or already handled */
+		return IRQ_NONE;
+	}
+	if (unlikely(istat == -1)) {
+		ipath_bad_regread(t);
+		/* don't know if it was our interrupt or not */
+		return IRQ_NONE;
+	}
+
+	ipath_stats.sps_ints++;
+
+	/*
+	 * this needs to be flags&initted, not statusp, so we keep
+	 * taking interrupts even after link goes down, etc.
+	 * Also, we *must* clear the interrupt at some point, or we won't
+	 * take it again, which can be real bad for errors, etc...
+	 */
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		ipath_bad_intr(t, &unexpected);
+		return IRQ_NONE;
+	}
+	if (unexpected)
+		unexpected = 0;
+
+	if (istat & ~infinipath_i_bitsextant)
+		_IPATH_UNIT_ERROR(t,
+				  "interrupt with unknown interrupts %x set\n",
+				  istat & (uint32_t) ~ infinipath_i_bitsextant);
+
+	if (istat & INFINIPATH_I_ERROR) {
+		ipath_stats.sps_errints++;
+		estat = ipath_kget_kreg64(t, kr_errorstatus);
+		if (!estat)
+			_IPATH_INFO
+			    ("error interrupt (%x), but no error bits set!\n",
+			     istat);
+		else if (estat == -1LL)
+			/*
+			 * should we try clearing all, or hope next read
+			 * works?
+			 */
+			_IPATH_UNIT_ERROR(t,
+					  "Read of error status failed (all bits set); ignoring\n");
+		else
+			ipath_handle_errors(t, estat);
+	}
+
+	if (istat & INFINIPATH_I_GPIO) {
+		/* Clear GPIO status bit 2 */
+		ipath_kput_kreg(t, kr_gpio_clear, (uint64_t)(1 << 2));
+
+		/*
+		 * Packets are available in the port 0 receive queue.
+		 * Eventually this needs to be generalized to check
+		 * IPATH_GPIO_INTR, and the specific GPIO bit, when
+		 * GPIO interrupts start being used for other things.
+		 * We skip that now to improve performance.
+		 */
+		ipath_kreceive(t);
+	}
+
+	/*
+	 * clear the ones we will deal with on this round
+	 * We clear it early, mostly for receive interrupts, so we
+	 * know the chip will have seen this by the time we process
+	 * the queue, and will re-interrupt if necessary.  The processor
+	 * itself won't take the interrupt again until we return.
+	 */
+	ipath_kput_kreg(t, kr_intclear, istat);
+
+	if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
+		atomic_clear_mask(INFINIPATH_S_PIOINTBUFAVAIL,
+				  &dd->ipath_sendctrl);
+		ipath_kput_kreg(t, kr_sendctrl, dd->ipath_sendctrl);
+
+		if (dd->ipath_portpiowait) {
+			uint32_t i;
+			/*
+			 * start from port 1, since for now port 0  is
+			 * never using wait_event for PIO
+			 */
+			for (i = 1;
+			     dd->ipath_portpiowait && i < dd->ipath_cfgports;
+			     i++) {
+				if (dd->ipath_pd[i]
+				    && dd->ipath_portpiowait & (1U << i)) {
+					atomic_clear_mask(1U << i,
+							  &dd->
+							  ipath_portpiowait);
+					if (dd->ipath_pd[i]->
+					    port_flag & IPATH_PORT_WAITING_PIO)
+					{
+						dd->ipath_pd[i]->port_flag &=
+						    ~IPATH_PORT_WAITING_PIO;
+						wake_up_interruptible(&dd->
+								      ipath_pd
+								      [i]->
+								      port_wait);
+					}
+				}
+			}
+		}
+
+		if (dd->ipath_layer.l_intr) {
+			if (dd->ipath_layer.l_intr(t,
+				IPATH_LAYER_INT_SEND_CONTINUE)) {
+				atomic_set_mask(INFINIPATH_S_PIOINTBUFAVAIL,
+						&dd->ipath_sendctrl);
+				ipath_kput_kreg(t, kr_sendctrl,
+						dd->ipath_sendctrl);
+			}
+		}
+
+		if (dd->verbs_layer.l_piobufavail) {
+			if (!dd->verbs_layer.l_piobufavail(t)) {
+				atomic_set_mask(INFINIPATH_S_PIOINTBUFAVAIL,
+						&dd->ipath_sendctrl);
+				ipath_kput_kreg(t, kr_sendctrl,
+						dd->ipath_sendctrl);
+			}
+		}
+	}
+
+	/*
+	 * we check for both transition from empty to non-empty, and urgent
+	 * packets (those with the interrupt bit set in the header)
+	 */
+
+	if (istat & ((infinipath_i_rcvavail_mask << INFINIPATH_I_RCVAVAIL_SHIFT)
+		     | (infinipath_i_rcvurg_mask << INFINIPATH_I_RCVURG_SHIFT))) {
+		uint64_t portr;
+		int i;
+		uint32_t rcvdint = 0;
+
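+		/* build one bitmask of the ports with avail or urgent interrupts */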
+		portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) &
+			 infinipath_i_rcvavail_mask)
+		    | ((istat >> INFINIPATH_I_RCVURG_SHIFT) &
+		       infinipath_i_rcvurg_mask);
+		for (i = 0; i < dd->ipath_cfgports; i++) {
+			if (portr & (1 << i) && dd->ipath_pd[i]) {
+				if (i == 0)
+					ipath_kreceive(t);
+				else if (dd->ipath_pd[i]->
+					 port_flag & IPATH_PORT_WAITING_RCV) {
+					atomic_clear_mask
+					    (IPATH_PORT_WAITING_RCV,
+					     &dd->ipath_pd[i]->port_flag);
+					wake_up_interruptible(&dd->ipath_pd[i]->
+							      port_wait);
+					rcvdint |= 1U << i;
+				}
+			}
+		}
+		if (rcvdint) {
+			/*
+			 * only want to take one interrupt, so turn off
+			 * the rcv interrupt for all the ports that we
+			 * did the wakeup on (but never for kernel port)
+			 */
+			atomic_clear_mask(rcvdint <<
+					  INFINIPATH_R_INTRAVAIL_SHIFT,
+					  &dd->ipath_rcvctrl);
+			ipath_kput_kreg(t, kr_rcvctrl, dd->ipath_rcvctrl);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void ipath_decode_err(char *buf, size_t blen, uint64_t err)
+{
+	*buf = '\0';
+	if (err & INFINIPATH_E_RHDRLEN)
+		strlcat(buf, "rhdrlen ", blen);
+	if (err & INFINIPATH_E_RBADTID)
+		strlcat(buf, "rbadtid ", blen);
+	if (err & INFINIPATH_E_RBADVERSION)
+		strlcat(buf, "rbadversion ", blen);
+	if (err & INFINIPATH_E_RHDR)
+		strlcat(buf, "rhdr ", blen);
+	if (err & INFINIPATH_E_RLONGPKTLEN)
+		strlcat(buf, "rlongpktlen ", blen);
+	if (err & INFINIPATH_E_RSHORTPKTLEN)
+		strlcat(buf, "rshortpktlen ", blen);
+	if (err & INFINIPATH_E_RMAXPKTLEN)
+		strlcat(buf, "rmaxpktlen ", blen);
+	if (err & INFINIPATH_E_RMINPKTLEN)
+		strlcat(buf, "rminpktlen ", blen);
+	if (err & INFINIPATH_E_RFORMATERR)
+		strlcat(buf, "rformaterr ", blen);
+	if (err & INFINIPATH_E_RUNSUPVL)
+		strlcat(buf, "runsupvl ", blen);
+	if (err & INFINIPATH_E_RUNEXPCHAR)
+		strlcat(buf, "runexpchar ", blen);
+	if (err & INFINIPATH_E_RIBFLOW)
+		strlcat(buf, "ribflow ", blen);
+	if (err & INFINIPATH_E_REBP)
+		strlcat(buf, "EBP ", blen);
+	if (err & INFINIPATH_E_SUNDERRUN)
+		strlcat(buf, "sunderrun ", blen);
+	if (err & INFINIPATH_E_SPIOARMLAUNCH)
+		strlcat(buf, "spioarmlaunch ", blen);
+	if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
+		strlcat(buf, "sunexperrpktnum ", blen);
+	if (err & INFINIPATH_E_SDROPPEDDATAPKT)
+		strlcat(buf, "sdroppeddatapkt ", blen);
+	if (err & INFINIPATH_E_SDROPPEDSMPPKT)
+		strlcat(buf, "sdroppedsmppkt ", blen);
+	if (err & INFINIPATH_E_SMAXPKTLEN)
+		strlcat(buf, "smaxpktlen ", blen);
+	if (err & INFINIPATH_E_SMINPKTLEN)
+		strlcat(buf, "sminpktlen ", blen);
+	if (err & INFINIPATH_E_SUNSUPVL)
+		strlcat(buf, "sunsupVL ", blen);
+	if (err & INFINIPATH_E_SPKTLEN)
+		strlcat(buf, "spktlen ", blen);
+	if (err & INFINIPATH_E_INVALIDADDR)
+		strlcat(buf, "invalidaddr ", blen);
+	if (err & INFINIPATH_E_RICRC)
+		strlcat(buf, "CRC ", blen);
+	if (err & INFINIPATH_E_RVCRC)
+		strlcat(buf, "VCRC ", blen);
+	if (err & INFINIPATH_E_RRCVEGRFULL)
+		strlcat(buf, "rcvegrfull ", blen);
+	if (err & INFINIPATH_E_RRCVHDRFULL)
+		strlcat(buf, "rcvhdrfull ", blen);
+	if (err & INFINIPATH_E_IBSTATUSCHANGED)
+		strlcat(buf, "ibcstatuschg ", blen);
+	if (err & INFINIPATH_E_RIBLOSTLINK)
+		strlcat(buf, "riblostlink ", blen);
+	if (err & INFINIPATH_E_HARDWARE)
+		strlcat(buf, "hardware ", blen);
+	if (err & INFINIPATH_E_RESET)
+		strlcat(buf, "reset ", blen);
+}
+
+/* decode RHF errors; only used one place now, may want more later */
+static void get_rhf_errstring(uint32_t err, char *msg, size_t len)
+{
+	/* start with an empty string, so we don't need to check what's first */
+	*msg = '\0';
+
+	if (err & INFINIPATH_RHF_H_ICRCERR)
+		strlcat(msg, "icrcerr ", len);
+	if (err & INFINIPATH_RHF_H_VCRCERR)
+		strlcat(msg, "vcrcerr ", len);
+	if (err & INFINIPATH_RHF_H_PARITYERR)
+		strlcat(msg, "parityerr ", len);
+	if (err & INFINIPATH_RHF_H_LENERR)
+		strlcat(msg, "lenerr ", len);
+	if (err & INFINIPATH_RHF_H_MTUERR)
+		strlcat(msg, "mtuerr ", len);
+	if (err & INFINIPATH_RHF_H_IHDRERR)
+		/* infinipath hdr checksum error */
+		strlcat(msg, "ipathhdrerr ", len);
+	if (err & INFINIPATH_RHF_H_TIDERR)
+		strlcat(msg, "tiderr ", len);
+	if (err & INFINIPATH_RHF_H_MKERR)
+		/* bad port, offset, etc. */
+		strlcat(msg, "invalid ipathhdr ", len);
+	if (err & INFINIPATH_RHF_H_IBERR)
+		strlcat(msg, "iberr ", len);
+	if (err & INFINIPATH_RHF_L_SWA)
+		strlcat(msg, "swA ", len);
+	if (err & INFINIPATH_RHF_L_SWB)
+		strlcat(msg, "swB ", len);
+}
+
+static void ipath_handle_errors(const ipath_type t, uint64_t errs)
+{
+	char msg[512];
+	uint32_t piobcnt;
+	uint64_t sbuf[4], ignore_this_time = 0;
+	int i;
+	int chkerrpkts = 0, noprint = 0;
+	cycles_t nc;
+	static cycles_t nextmsg_time;
+	static unsigned nmsgs, supp_msgs;
+	struct ipath_devdata *dd = &devdata[t];
+
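+/*
+ * E_SUM_PKTERRS are the receive errors attributable to a bad packet;
+ * E_SUM_ERRS are send errors, which may leave a PIO buffer armed
+ */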
+#define E_SUM_PKTERRS (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID \
+     | INFINIPATH_E_RBADVERSION \
+     | INFINIPATH_E_RHDR | INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN \
+     | INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN \
+     | INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | INFINIPATH_E_RUNEXPCHAR \
+     | INFINIPATH_E_REBP)
+
+#define E_SUM_ERRS ( INFINIPATH_E_SPIOARMLAUNCH \
+    | INFINIPATH_E_SUNEXPERRPKTNUM | INFINIPATH_E_SDROPPEDDATAPKT \
+    | INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SMAXPKTLEN \
+    | INFINIPATH_E_SUNSUPVL | INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN \
+    | INFINIPATH_E_INVALIDADDR)
+
+	/*
+	 * throttle back "fast" messages to no more than 10 per 5 seconds
+	 * (assuming a 1.4-2GHz clock).  This isn't perfect, but it's a
+	 * reasonable heuristic.  If we get more than 10, impose a 5x
+	 * longer delay.
+	 */
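+	/* 10000000000 cycles is ~5 sec at 2GHz; 50000000000 below is the 5x delay */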
+	nc = get_cycles();
+	if (nmsgs > 10) {
+		if (nc < nextmsg_time) {
+			noprint = 1;
+			if (!supp_msgs++)
+				nextmsg_time = nc + 50000000000ULL;
+		} else if (supp_msgs) {
+			/*
+			 * Print the message unless it's ibc status
+			 * change only, which happens so often we never
+			 * want to count it.
+			 */
+			if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
+				ipath_decode_err(msg, sizeof msg,
+						 dd->
+						 ipath_lasterror &
+						 ~INFINIPATH_E_IBSTATUSCHANGED);
+				if (dd->
+				    ipath_lasterror & ~(INFINIPATH_E_RRCVEGRFULL
+							|
+							INFINIPATH_E_RRCVHDRFULL))
+					_IPATH_UNIT_ERROR(t,
+							  "Suppressed %u messages for fast-repeating errors (%s) (%llx)\n",
+							  supp_msgs, msg,
+							  dd->ipath_lasterror);
+				else {
+					/*
+					 * rcvegrfull and rcvhdrqfull are
+					 * "normal", for some types of
+					 * processes (mostly benchmarks)
+					 * that send huge numbers of
+					 * messages, while not processing
+					 * them.  So only complain about
+					 * these at debug level.
+					 */
+					_IPATH_DBG
+					    ("Suppressed %u messages for %s\n",
+					     supp_msgs, msg);
+				}
+			}
+			supp_msgs = 0;
+			nmsgs = 0;
+		}
+	} else if (!nmsgs++ || nc > nextmsg_time)	/* start timer */
+		nextmsg_time = nc + 10000000000ULL;
+
+	/*
+	 * don't report errors that are masked (includes those always
+	 * ignored)
+	 */
+	errs &= ~dd->ipath_maskederrs;
+
+	/* do these first, they are most important */
+	if (errs & INFINIPATH_E_HARDWARE) {
+		/* reuse same msg buf */
+		ipath_handle_hwerrors(t, msg, sizeof msg);
+	}
+
+	if (!noprint && (errs & ~infinipath_e_bitsextant))
+		_IPATH_UNIT_ERROR(t,
+				  "error interrupt with unknown errors %llx set\n",
+				  errs & ~infinipath_e_bitsextant);
+
+	if (errs & E_SUM_ERRS) {
+		/* if possible that sendbuffererror could be valid */
+		piobcnt = dd->ipath_piobcnt;
+		/* read these before writing errorclear */
+		sbuf[0] = ipath_kget_kreg64(t, kr_sendbuffererror);
+		sbuf[1] = ipath_kget_kreg64(t, kr_sendbuffererror + 1);
+		if (piobcnt > 128) {
+			sbuf[2] = ipath_kget_kreg64(t, kr_sendbuffererror + 2);
+			sbuf[3] = ipath_kget_kreg64(t, kr_sendbuffererror + 3);
+		}
+
+		if (sbuf[0] || sbuf[1]
+		    || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+			_IPATH_PDBG("SendbufErrs %llx %llx ", sbuf[0], sbuf[1]);
+			if (infinipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+				printk("%llx %llx ", sbuf[2], sbuf[3]);
+			for (i = 0; i < piobcnt; i++) {
+				if (test_bit(i, sbuf)) {
+					uint32_t sendctrl;
+					if (infinipath_debug & __IPATH_PKTDBG)
+						printk("%u ", i);
+					sendctrl =
+					    dd->
+					    ipath_sendctrl | INFINIPATH_S_DISARM
+					    | (i <<
+					       INFINIPATH_S_DISARMPIOBUF_SHIFT);
+					ipath_kput_kreg(t, kr_sendctrl,
+							sendctrl);
+				}
+			}
+			if (infinipath_debug & __IPATH_PKTDBG)
+				printk("\n");
+		}
+		if ((errs &
+		     (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT
+		      | INFINIPATH_E_SMINPKTLEN))
+		    && !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+			/*
+			 * This can happen when SMA is trying to bring
+			 * the link up, but the IB link changes state
+			 * at the "wrong" time.  The IB logic then
+			 * complains that the packet isn't valid.
+			 * We don't want to confuse people, so we just
+			 * don't print them, except at debug
+			 */
+			_IPATH_DBG
+			    ("Ignoring pktsend errors %llx, because not yet active\n",
+			     errs);
+			ignore_this_time |=
+			    INFINIPATH_E_SDROPPEDDATAPKT |
+			    INFINIPATH_E_SDROPPEDSMPPKT |
+			    INFINIPATH_E_SMINPKTLEN;
+		}
+	}
+
+	if (supp_msgs == 250000) {
+		/*
+		 * It's not entirely reasonable to assume that the errors
+		 * set in the last clear period are all responsible for
+		 * the problem, but the alternative is to assume it's only
+		 * the ones on this particular interrupt, which also isn't great
+		 */
+		dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+		ipath_kput_kreg(t, kr_errormask, ~dd->ipath_maskederrs);
+		ipath_decode_err(msg, sizeof msg,
+				 (dd->ipath_maskederrs & ~dd->
+				  ipath_ignorederrs));
+
+		if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs)
+		    & ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL))
+			_IPATH_UNIT_ERROR(t,
+					  "Disabling error(s) %llx because occuring too frequently (%s)\n",
+					  (dd->ipath_maskederrs & ~dd->
+					   ipath_ignorederrs), msg);
+		else {
+			/*
+			 * rcvegrfull and rcvhdrqfull are "normal",
+			 * for some types of processes (mostly benchmarks)
+			 * that send huge numbers of messages, while not
+			 * processing them.  So only complain about
+			 * these at debug level.
+			 */
+			_IPATH_DBG
+			    ("Disabling frequent queue full errors (%s)\n",
+			     msg);
+		}
+
+		/*
+		 * re-enable the masked errors after around 3 minutes,
+		 * in ipath_get_faststats().  If we have a series of
+		 * fast repeating but different errors, the interval will keep
+		 * stretching out, but that's OK, as that's pretty catastrophic.
+		 */
+		dd->ipath_unmasktime = nc + 400000000000ULL;
+	}
+
+	ipath_kput_kreg(t, kr_errorclear, errs);
+	if (ignore_this_time)
+		errs &= ~ignore_this_time;
+	if (errs & ~dd->ipath_lasterror) {
+		errs &= ~dd->ipath_lasterror;
+		/* never suppress duplicate hwerrors or ibstatuschange */
+		dd->ipath_lasterror |= errs &
+		    ~(INFINIPATH_E_HARDWARE | INFINIPATH_E_IBSTATUSCHANGED);
+	}
+	if (!errs)
+		return;
+
+	if (!noprint)
+		/* the ones we mask off are handled specially below or above */
+		ipath_decode_err(msg, sizeof msg,
+				 errs & ~(INFINIPATH_E_IBSTATUSCHANGED |
+					  INFINIPATH_E_RRCVEGRFULL |
+					  INFINIPATH_E_RRCVHDRFULL |
+					  INFINIPATH_E_HARDWARE));
+	else
+		/* so we don't need if (!noprint) at strlcat's below */
+		*msg = 0;
+
+	if (errs & E_SUM_PKTERRS) {
+		ipath_stats.sps_pkterrs++;
+		chkerrpkts = 1;
+	}
+	if (errs & E_SUM_ERRS)
+		ipath_stats.sps_errs++;
+
+	if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
+		ipath_stats.sps_crcerrs++;
+		chkerrpkts = 1;
+	}
+
+	/*
+	 * We don't want to print these two as they happen, or we can make
+	 * the situation even worse, because it takes so long to print
+	 * messages to serial consoles.  Kernel ports get printed from
+	 * fast_stats, no more than every 5 seconds; user ports get printed
+	 * on close.
+	 */
+	if (errs & INFINIPATH_E_RRCVHDRFULL) {
+		int any;
+		uint32_t hd, tl;
+		ipath_stats.sps_hdrqfull++;
+		for (any = i = 0; i < dd->ipath_cfgports; i++) {
+			if (i == 0) {
+				hd = dd->ipath_port0head;
+				tl = *dd->ipath_hdrqtailptr;
+			} else if (dd->ipath_pd[i] &&
+				   dd->ipath_pd[i]->port_rcvhdrtail_kvaddr) {
+				/*
+				 * don't report same point multiple times,
+				 * except kernel
+				 */
+				tl = (uint32_t) *
+				    dd->ipath_pd[i]->port_rcvhdrtail_kvaddr;
+				if (tl == dd->ipath_lastrcvhdrqtails[i])
+					continue;
+				hd = ipath_kget_ureg32(t, ur_rcvhdrhead, i);
+			} else
+				continue;
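+			/*
+			 * the queue is full when the tail is one entry
+			 * behind the head, including the wrap case
+			 */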
+			if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
+				dd->ipath_lastrcvhdrqtails[i] = tl;
+				dd->ipath_pd[i]->port_hdrqfull++;
+				if (i == 0)
+					chkerrpkts = 1;
+			}
+		}
+	}
+	if (errs & INFINIPATH_E_RRCVEGRFULL) {
+		/*
+		 * since this is of less importance and not likely to
+		 * happen without also getting hdrfull, only count
+		 * occurrences; don't check each port (or even the kernel
+		 * vs user)
+		 */
+		ipath_stats.sps_etidfull++;
+		if (dd->ipath_port0head != *dd->ipath_hdrqtailptr)
+			chkerrpkts = 1;
+	}
+
+	/*
+	 * do this before IBSTATUSCHANGED, in case both bits set in a single
+	 * interrupt; we want the STATUSCHANGE to "win", so we do our
+	 * internal copy of state machine correctly
+	 */
+	if (errs & INFINIPATH_E_RIBLOSTLINK) {
+		/* force through block below */
+		errs |= INFINIPATH_E_IBSTATUSCHANGED;
+		ipath_stats.sps_iblink++;
+		dd->ipath_flags |= IPATH_LINKDOWN;
+		dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+				     | IPATH_LINKARMED | IPATH_LINKACTIVE);
+		if (!noprint)
+			_IPATH_DBG("Lost link, link now down (%s)\n",
+				   ipath_ibcstatus_str[ipath_kget_kreg64
+						       (t,
+							kr_ibcstatus) & 0xf]);
+	}
+
+	if ((errs & INFINIPATH_E_IBSTATUSCHANGED) && (!ipath_diags_enabled)) {
+		uint64_t val;
+		uint32_t ltstate;
+
+		val = ipath_kget_kreg64(t, kr_ibcstatus);
+		ltstate = val & 0xff;
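+		/* 0x11, 0x21 and 0x31 are the INIT, ARMED and ACTIVE link states */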
+		if (ltstate == 0x11 || ltstate == 0x21 || ltstate == 0x31)
+			_IPATH_DBG("Link state changed unit %u to 0x%x, last was 0x%llx\n",
+				t, ltstate, dd->ipath_lastibcstat);
+		else {
+			ltstate = dd->ipath_lastibcstat & 0xff;
+			if (ltstate == 0x11 || ltstate == 0x21 || ltstate == 0x31)
+				_IPATH_DBG("Link state unit %u changed to down state 0x%llx, last was 0x%llx\n",
+					t, val, dd->ipath_lastibcstat);
+			else
+				_IPATH_VDBG("Link state unit %u changed to 0x%llx from one of down states\n",
+					t, val);
+		}
+		ltstate = (val >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+		    INFINIPATH_IBCS_LINKTRAININGSTATE_MASK;
+
+		if (ltstate == 2 || ltstate == 3) {
+			uint32_t last_ltstate;
+
+			/*
+			 * ignore cycling back and forth between states 2
+			 * and 3 while waiting for the other end of the link
+			 * to come up, except that if it keeps happening, we
+			 * switch between linkinitcmd SLEEP and POLL.  While
+			 * we cycle back and forth between them, we aren't
+			 * seeing any other device: either no cable is
+			 * plugged in, the other device is powered off, or
+			 * the other device is a switch that hasn't yet
+			 * polled us.
+			 */
+			last_ltstate = (dd->ipath_lastibcstat >>
+					INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)
+			    & INFINIPATH_IBCS_LINKTRAININGSTATE_MASK;
+			if (last_ltstate == 2 || last_ltstate == 3) {
+				if (++dd->ipath_ibpollcnt > 4) {
+					uint64_t ibc;
+					dd->ipath_flags |=
+					    IPATH_LINK_SLEEPING | IPATH_NOCABLE;
+					*dd->ipath_statusp |=
+					    IPATH_STATUS_IB_NOCABLE;
+					_IPATH_VDBG
+					    ("linkinitcmd POLL, move to SLEEP\n");
+					ibc = dd->ipath_ibcctrl;
+					ibc |= INFINIPATH_IBCC_LINKINITCMD_SLEEP
+					    <<
+					    INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+					/*
+					 * don't put linkinitcmd in
+					 * ipath_ibcctrl, want that to
+					 * stay a NOP
+					 */
+					ipath_kput_kreg(t, kr_ibcctrl, ibc);
+					dd->ipath_ibpollcnt = 0;
+				}
+				goto skip_ibchange;
+			}
+		}
+		/* some state other than 2 or 3 */
+		dd->ipath_ibpollcnt = 0;
+		ipath_stats.sps_iblink++;
+		/*
+		 * Note:  We try to match the Mellanox HCA LED behavior
+		 * as best we can.  That changed around Oct 2003.
+		 * Green indicates link state (something is plugged in,
+		 * and we can train).  Amber indicates the link is
+		 * logically up (ACTIVE).  Mellanox further blinks the
+		 * amber LED to indicate data packet activity, but we
+		 * have no hardware support for that, so it would require
+		 * waking up every 10-20 msecs and checking the counters
+		 * on the chip, and then turning the LED off if
+		 * appropriate.  That's visible overhead, so not something
+		 * we will do.
+		 */
+		if (ltstate != 1 || ((dd->ipath_lastibcstat & 0x30) == 0x30 &&
+				     (val & 0x30) != 0x30)) {
+			dd->ipath_flags |= IPATH_LINKDOWN;
+			dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+					     | IPATH_LINKACTIVE |
+					     IPATH_LINKARMED);
+			*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+			if (!noprint) {
+				if ((dd->ipath_lastibcstat & 0x30) == 0x30)
+					/* if from up to down be more vocal */
+					_IPATH_DBG("Link unit %u is now down (%s)\n",
+						   t, ipath_ibcstatus_str
+						   [ltstate]);
+				else
+					_IPATH_VDBG("Link unit %u is down (%s)\n",
+						    t, ipath_ibcstatus_str
+						    [ltstate]);
+			}
+
+			if (val & 0x30) {
+				/* leave just green on, 0x11 and 0x21 */
+				dd->ipath_extctrl &=
+				    ~INFINIPATH_EXTC_LEDPRIPORTYELLOWON;
+				dd->ipath_extctrl |=
+				    INFINIPATH_EXTC_LEDPRIPORTGREENON;
+			} else	/* not up at all, so turn the leds off */
+				dd->ipath_extctrl &=
+				    ~(INFINIPATH_EXTC_LEDPRIPORTGREENON |
+				      INFINIPATH_EXTC_LEDPRIPORTYELLOWON);
+			ipath_kput_kreg(t, kr_extctrl,
+					(uint64_t) dd->ipath_extctrl);
+			if (ltstate == 1
+			    && (dd->
+				ipath_flags & (IPATH_LINK_TOARMED |
+					       IPATH_LINK_TOACTIVE))) {
+				ipath_set_ib_lstate(t,
+						    INFINIPATH_IBCC_LINKCMD_INIT);
+			}
+		} else if ((val & 0x31) == 0x31) {
+			if (!noprint)
+				_IPATH_DBG("Link unit %u is now in active state\n", t);
+			dd->ipath_flags |= IPATH_LINKACTIVE;
+			dd->ipath_flags &=
+			    ~(IPATH_LINKUNK | IPATH_LINKINIT | IPATH_LINKDOWN |
+			      IPATH_LINKARMED | IPATH_NOCABLE |
+			      IPATH_LINK_TOACTIVE | IPATH_LINK_SLEEPING);
+			*dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
+			*dd->ipath_statusp |=
+			    IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
+			/* set the externally visible LEDs to indicate state */
+			dd->ipath_extctrl |= INFINIPATH_EXTC_LEDPRIPORTGREENON
+			    | INFINIPATH_EXTC_LEDPRIPORTYELLOWON;
+			ipath_kput_kreg(t, kr_extctrl,
+					(uint64_t) dd->ipath_extctrl);
+
+			/*
+			 * since we are now active, set the linkinitcmd
+			 * to NOP (0); it was probably either POLL or SLEEP
+			 */
+			dd->ipath_ibcctrl &=
+			    ~(INFINIPATH_IBCC_LINKINITCMD_MASK <<
+			      INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+			ipath_kput_kreg(t, kr_ibcctrl, dd->ipath_ibcctrl);
+
+			if (devdata[t].ipath_layer.l_intr)
+				devdata[t].ipath_layer.l_intr(t,
+							      IPATH_LAYER_INT_IF_UP);
+		} else if ((val & 0x31) == 0x11) {
+			/*
+			 * set INIT and DOWN.  DOWN is checked by
+			 * most of the other code, but INIT is useful
+			 * to know in a few places.
+			 */
+			dd->ipath_flags |= IPATH_LINKINIT | IPATH_LINKDOWN;
+			dd->ipath_flags &=
+			    ~(IPATH_LINKUNK | IPATH_LINKACTIVE | IPATH_LINKARMED
+			      | IPATH_NOCABLE | IPATH_LINK_SLEEPING);
+			*dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE
+				| IPATH_STATUS_IB_READY);
+
+			/* set the externally visible LEDs to indicate state */
+			dd->ipath_extctrl &=
+			    ~INFINIPATH_EXTC_LEDPRIPORTYELLOWON;
+			dd->ipath_extctrl |= INFINIPATH_EXTC_LEDPRIPORTGREENON;
+			ipath_kput_kreg(t, kr_extctrl,
+					(uint64_t) dd->ipath_extctrl);
+			if (dd->
+			    ipath_flags & (IPATH_LINK_TOARMED |
+					   IPATH_LINK_TOACTIVE)) {
+				/*
+				 * if we got here while trying to bring
+				 * the link up, try again, but only once more!
+				 */
+				ipath_set_ib_lstate(t,
+						    INFINIPATH_IBCC_LINKCMD_ARMED);
+				dd->ipath_flags &=
+				    ~(IPATH_LINK_TOARMED | IPATH_LINK_TOACTIVE);
+			}
+		} else if ((val & 0x31) == 0x21) {
+			dd->ipath_flags |= IPATH_LINKARMED;
+			dd->ipath_flags &=
+			    ~(IPATH_LINKUNK | IPATH_LINKDOWN | IPATH_LINKINIT |
+			      IPATH_LINKACTIVE | IPATH_NOCABLE |
+			      IPATH_LINK_TOARMED | IPATH_LINK_SLEEPING);
+			*dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE
+				| IPATH_STATUS_IB_READY);
+			/*
+			 * set the externally visible LEDs to indicate
+			 * state (same as 0x11)
+			 */
+			dd->ipath_extctrl &=
+			    ~INFINIPATH_EXTC_LEDPRIPORTYELLOWON;
+			dd->ipath_extctrl |= INFINIPATH_EXTC_LEDPRIPORTGREENON;
+			ipath_kput_kreg(t, kr_extctrl,
+					(uint64_t) dd->ipath_extctrl);
+			if (dd->ipath_flags & IPATH_LINK_TOACTIVE) {
+				/*
+				 * if we got here while trying to bring
+				 * the link up, try again, but only once more!
+				 */
+				ipath_set_ib_lstate(t,
+						    INFINIPATH_IBCC_LINKCMD_ACTIVE);
+				dd->ipath_flags &= ~IPATH_LINK_TOACTIVE;
+			}
+		} else {
+			if (dd->
+			    ipath_flags & (IPATH_LINK_TOARMED |
+					   IPATH_LINK_TOACTIVE))
+				ipath_set_ib_lstate(t,
+						    INFINIPATH_IBCC_LINKCMD_INIT);
+			else if (!noprint)
+				_IPATH_DBG("IBstatuschange unit %u: %s\n",
+					  t, ipath_ibcstatus_str[ltstate]);
+		}
+		dd->ipath_lastibcstat = val;
+	}
+
+skip_ibchange:
+
+	if (errs & INFINIPATH_E_RESET) {
+		if (!noprint)
+			_IPATH_UNIT_ERROR(t,
+					  "Got reset, requires re-initialization (unload and reload driver)\n");
+		dd->ipath_flags &= ~IPATH_INITTED;	/* needs re-init */
+		/* mark as having had error */
+		*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
+	}
+
+	if (!noprint && *msg)
+		_IPATH_UNIT_ERROR(t, "%s error\n", msg);
+	if (dd->ipath_sma_state_wanted & dd->ipath_flags) {
+		_IPATH_VDBG("sma wanted state %x, iflags now %x, waking\n",
+			    dd->ipath_sma_state_wanted, dd->ipath_flags);
+		wake_up_interruptible(&ipath_sma_state_wait);
+	}
+
+	if (chkerrpkts)
+		/* process possible error packets in hdrq */
+		ipath_kreceive(t);
+}


