[openib-general] [PATCH 10 of 20] ipath - core driver, part 3 of 4
Bryan O'Sullivan
bos at pathscale.com
Wed Dec 28 16:31:29 PST 2005
Signed-off-by: Bryan O'Sullivan <bos at pathscale.com>
diff -r dad2e87e21f4 -r c37b118ef806 drivers/infiniband/hw/ipath/ipath_driver.c
--- a/drivers/infiniband/hw/ipath/ipath_driver.c Wed Dec 28 14:19:42 2005 -0800
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c Wed Dec 28 14:19:42 2005 -0800
@@ -3878,3 +3878,1533 @@
/* process possible error packets in hdrq */
ipath_kreceive(t);
}
+
+/*
+ * Return the data pointer of the port 0 skb at index bufnum on unit t, or
+ * NULL when the unit has no port 0 skb array.  The err argument is not used.
+ * Must only be called if ipath_pd[port] is known to be allocated.
+ */
+static inline void *ipath_get_egrbuf(const ipath_type t, uint32_t bufnum,
+				     int err)
+{
+	if (!devdata[t].ipath_port0_skbs)
+		return NULL;
+	return (void *)devdata[t].ipath_port0_skbs[bufnum]->data;
+
+#ifdef _USE_FOR_DEBUGGING_ONLY
+	/*
+	 * Not reached as written, since it follows the return above.  Kept
+	 * so that if we do ports other than 0 the lookup does not have to be
+	 * rewritten, since it's slightly complicated.
+	 */
+	if (port != 1) {
+		void *chunkbase;
+		/*
+		 * This calculation takes about 50 cycles.  Could do
+		 * what I did for protocol code, and have an array of
+		 * addresses, getting it down to just a few cycles per
+		 * lookup, at the cost of 16KB of memory.
+		 */
+		if (!devdata[t].ipath_pd[port]->port_rcvegrbuf_virt)
+			return NULL;
+		chunkbase = devdata[t].ipath_pd[port]->port_rcvegrbuf_virt
+			[bufnum /
+			 devdata[t].ipath_pd[port]->port_rcvegrbufs_perchunk];
+		return (void *)(chunkbase +
+				(bufnum %
+				 devdata[t].ipath_pd[port]->
+				 port_rcvegrbufs_perchunk)
+				* devdata[t].ipath_rcvegrbufsize);
+	}
+#endif
+}
+
+/*
+ * Receive an SMA packet: copy the header that follows the RHF in the
+ * rcvhdrq entry, plus the payload from the eager buffer, into the next
+ * slot of the ipath_sma_data ring.  Separate for better overall
+ * optimization.
+ *
+ * t    - unit the packet arrived on; recorded in the ring slot
+ * tlen - total packet length in bytes
+ * rc   - rcvhdrq entry; the packet header starts at rc[1]
+ * ebuf - eager buffer holding the payload (callers pass non-NULL)
+ *
+ * The payload length is clamped to [0, SMA_MAX_PKTSZ - IPATH_SMA_HDRSZ].
+ * A packet shorter than header + pad + CRC would otherwise wrap the
+ * subtraction and hand memcpy() an enormous length.
+ *
+ * This is reached from ipath_kreceive(), which runs from the interrupt
+ * handler, so the lock is taken with irqsave/irqrestore: spin_unlock_irq()
+ * would unconditionally re-enable interrupts inside the handler.
+ */
+static void ipath_rcv_sma(const ipath_type t, uint32_t tlen,
+			  uint64_t * rc, void *ebuf)
+{
+	int sindex, slen;
+	uint32_t elen, overhead;
+	unsigned long flags;
+	void *smbuf;
+	uint8_t pad, *bthbytes;
+
+	ipath_stats.sps_sma_rpkts++;	/* another SMA packet received */
+
+	bthbytes = (uint8_t *)((struct ips_message_header_typ *) &rc[1])->bth;
+
+	pad = (bthbytes[1] >> 4) & 3;
+	/* bytes of tlen that are not payload: header, pad and CRC32 */
+	overhead = IPATH_SMA_HDRSZ + pad + (uint32_t) sizeof(uint32_t);
+	elen = tlen > overhead ? tlen - overhead : 0;
+	if (elen > (uint32_t) (SMA_MAX_PKTSZ - IPATH_SMA_HDRSZ))
+		elen = SMA_MAX_PKTSZ - IPATH_SMA_HDRSZ;
+
+	spin_lock_irqsave(&ipath_sma_lock, flags);
+	sindex = ipath_sma_next;
+	smbuf = ipath_sma_data[sindex].buf;
+	ipath_sma_data[sindex].unit = t;
+	/* non-zero length means the slot still holds an unread packet */
+	slen = ipath_sma_data[sindex].len;
+	memcpy(smbuf, &rc[1], IPATH_SMA_HDRSZ);
+	memcpy(smbuf + IPATH_SMA_HDRSZ, ebuf, elen);
+	if (slen) {
+		/*
+		 * overwriting a yet unread old one (buffer wrap), have to
+		 * advance ipath_sma_first to next oldest
+		 */
+
+		/* count OK packets that we drop */
+		ipath_stats.sps_krdrops++;
+		if (++ipath_sma_first >= IPATH_NUM_SMAPKTS)
+			ipath_sma_first = 0;
+	}
+	ipath_sma_data[sindex].len = elen + IPATH_SMA_HDRSZ;
+	if (++ipath_sma_next >= IPATH_NUM_SMAPKTS)
+		ipath_sma_next = 0;
+	spin_unlock_irqrestore(&ipath_sma_lock, flags);
+}
+
+/*
+ * Receive a packet for the layered (ethernet) driver.
+ * Separate routine for better overall optimization.
+ *
+ * t     - unit the packet arrived on
+ * etail - index of the eager buffer (port 0 skb) holding the payload
+ * tlen  - total packet length in bytes
+ * hdr   - packet header taken from the rcvhdrq entry
+ *
+ * For OPCODE_ENCAP the skb at etail is handed to the layered driver's
+ * l_rcv callback and a freshly allocated skb takes its place in both the
+ * port 0 skb array and the chip's eager TID entry.  For OPCODE_LID_ARP the
+ * header alone is passed to l_rcv_lid, if one is registered.
+ *
+ * The payload length is checked before anything is allocated or swapped:
+ * a packet shorter than header + pad + CRC would wrap the subtraction and
+ * hand skb_put() an enormous length.  Such a packet is ignored and its
+ * eager buffer is left in place, just as when no replacement skb can be
+ * allocated.
+ */
+static void ipath_rcv_layer(const ipath_type t, uint32_t etail,
+			    uint32_t tlen, struct ether_header_typ * hdr)
+{
+	uint32_t elen, overhead;
+	uint8_t pad, *bthbytes;
+	struct sk_buff *skb;
+	struct sk_buff *nskb;
+	struct ipath_devdata *dd = &devdata[t];
+	struct ipath_portdata *pd;
+	unsigned long pa, pent;
+	uint64_t __iomem *egrbase;
+	uint64_t lenvalid;	/* in words */
+
+	if (dd->ipath_port0_skbs && hdr->sub_opcode == OPCODE_ENCAP) {
+		bthbytes = (uint8_t *) hdr->bth;
+		pad = (bthbytes[1] >> 4) & 3;
+		/* header + pad + CRC32 */
+		overhead = sizeof(*hdr) + pad + sizeof(uint32_t);
+		if (tlen < overhead) {
+			_IPATH_DBG("Runt encapsulated packet, len %x; ignored\n",
+				   tlen);
+			return;
+		}
+		elen = tlen - overhead;
+
+		/*
+		 * Allocate a new sk_buff to replace the one we give
+		 * to the network stack.
+		 */
+		if (!(nskb = dev_alloc_skb(dd->ipath_ibmaxlen + 4))) {
+			/* count OK packets that we drop */
+			ipath_stats.sps_krdrops++;
+			return;
+		}
+
+		skb_reserve(nskb, 4);
+
+		skb = dd->ipath_port0_skbs[etail];
+		dd->ipath_port0_skbs[etail] = nskb;
+		skb_put(skb, elen);
+
+		/* build the eager TID entry for the replacement buffer */
+		pd = dd->ipath_pd[0];
+		lenvalid = (dd->ipath_ibmaxlen - pd->port_egrskip) >> 2;
+		lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+		lenvalid |= INFINIPATH_RT_VALID;
+		pa = virt_to_phys(nskb->data);
+		pa += pd->port_egrskip;
+		pent = (pa & INFINIPATH_RT_ADDR_MASK) | lenvalid;
+		/* This is simplified for port 0 */
+		egrbase = (uint64_t __iomem *)
+			((char __iomem *)(dd->ipath_kregbase) +
+			 dd->ipath_rcvegrbase);
+		ipath_kput_memq(t, &egrbase[etail], pent);
+
+		dd->ipath_layer.l_rcv(t, hdr, skb);
+
+		/* another ether packet received */
+		ipath_stats.sps_ether_rpkts++;
+	} else if (hdr->sub_opcode == OPCODE_LID_ARP) {
+		if (dd->ipath_layer.l_rcv_lid)
+			dd->ipath_layer.l_rcv_lid(t, hdr);
+	}
+
+}
+
+/*
+ * Drain the port 0 receive header queue of unit t and dispatch each entry
+ * by receive type: non-KD (plain IB) packets go to the userland SMA ring or
+ * to the verbs layer, eager packets for the kernel QP go to the layered
+ * (ethernet) driver, and everything else is only logged.
+ * Called from interrupt handler for errors or receive interrupt.
+ *
+ * Bit 0 of ipath_rcv_pending serializes callers: when it is already set
+ * this call returns at once and leaves the queue to the current holder.
+ */
+void ipath_kreceive(const ipath_type t)
+{
+ uint64_t *rc;
+ void *ebuf;
+ struct ipath_devdata *dd = &devdata[t];
+ const uint32_t rsize = dd->ipath_rcvhdrentsize; /* words */
+ const uint32_t maxcnt = dd->ipath_rcvhdrcnt * rsize; /* in words */
+ uint32_t etail = -1, l, hdrqtail, sma_this_time = 0;
+ struct ips_message_header_typ *hdr;
+ uint32_t eflags, i, etype, tlen, pkttot=0;
+ static uint64_t totcalls; /* stats, may eventually remove */
+ char emsg[128];
+
+ if (!dd->ipath_hdrqtailptr) {
+ _IPATH_UNIT_ERROR(t,
+ "hdrqtailptr not set, can't do receives\n");
+ return;
+ }
+
+ if (test_and_set_bit(0, &dd->ipath_rcv_pending)) {
+ /* There is already a thread processing this queue. */
+ return;
+ }
+
+ /* nothing new since the last pass: just release the pending bit */
+ if (dd->ipath_port0head == *dd->ipath_hdrqtailptr)
+ goto done;
+
+gotmore:
+ /*
+ * read only once at start. If in flood situation, this helps
+ * performance slightly. If more arrive while we are processing,
+ * we'll come back here and do them
+ */
+ hdrqtail = *dd->ipath_hdrqtailptr;
+
+ /* l walks the queue in words; i counts the entries handled this pass */
+ for (i = 0, l = dd->ipath_port0head; l != hdrqtail; i++) {
+ uint32_t qp;
+ uint8_t *bthbytes;
+
+
+ /* rc[0] is the receive header flags word; the header follows */
+ rc = (uint64_t *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2));
+ hdr = (struct ips_message_header_typ *) & rc[1];
+ /*
+ * could make a network order version of IPATH_KD_QP, and
+ * do the obvious shift before masking to speed this up.
+ */
+ qp = ntohl(hdr->bth[1]) & 0xffffff;
+ bthbytes = (uint8_t *) hdr->bth;
+
+ eflags = ips_get_hdr_err_flags((uint32_t*)rc);
+ etype = ips_get_rcv_type((uint32_t*)rc);
+ tlen = ips_get_length_in_bytes((uint32_t*)rc); /* total length */
+ ebuf = NULL;
+ if (etype != RCVHQ_RCV_TYPE_EXPECTED) {
+ /*
+ * it turns out that the chips uses an eager buffer for
+ * all non-expected packets, whether it "needs"
+ * one or not. So always get the index, but
+ * don't set ebuf (so we try to copy data)
+ * unless the length requires it.
+ */
+ etail = ips_get_index((uint32_t*)rc);
+ if (tlen > sizeof(*hdr)
+ || etype == RCVHQ_RCV_TYPE_NON_KD) {
+ ebuf = ipath_get_egrbuf(t, etail, 0);
+ }
+ }
+
+ /*
+ * both tiderr and ipathhdrerr are set for all plain IB
+ * packets; only ipathhdrerr should be set.
+ */
+
+ /*
+ * NOTE(review): the message below says "protocol version" but the
+ * value printed is etype, not the version field -- confirm intent.
+ */
+ if (etype != RCVHQ_RCV_TYPE_NON_KD
+ && etype != RCVHQ_RCV_TYPE_ERROR
+ && ips_get_ipath_ver(hdr->iph.ver_port_tid_offset) !=
+ IPS_PROTO_VERSION) {
+ _IPATH_PDBG("Bad InfiniPath protocol version %x\n",
+ etype);
+ }
+
+ /* any error flag other than TIDERR/IHDRERR: log it and move on */
+ if (eflags &
+ ~(INFINIPATH_RHF_H_TIDERR | INFINIPATH_RHF_H_IHDRERR)) {
+ get_rhf_errstring(eflags, emsg, sizeof emsg);
+ _IPATH_PDBG
+ ("RHFerrs %x hdrqtail=%x typ=%u tlen=%x opcode=%x egridx=%x: %s\n",
+ eflags, l, etype, tlen, bthbytes[0],
+ ips_get_index((uint32_t*)rc), emsg);
+ } else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+ /*
+ * If there is a userland SMA and this is a MAD packet,
+ * then pass it to the userland SMA.
+ */
+ if (ipath_sma_alive && qp <= 1) {
+ /*
+ * count OK packets that we drop because
+ * SMA isn't yet running, or because we
+ * are in an sma flood (no point in
+ * constantly acquiring the spin lock, and
+ * overwriting previous packets).
+ * Eventually things will recover.
+ * Similarly if the sma consumer is
+ * so far behind that we would overwrite
+ * (yes, it's outside the lock)
+ */
+ if (!ipath_sma_data_spare ||
+ ipath_sma_data[ipath_sma_next].len ||
+ ++sma_this_time > IPATH_NUM_SMAPKTS) {
+ ipath_stats.sps_krdrops++;
+ } else if (ebuf) {
+ ipath_rcv_sma(t, tlen, rc, ebuf);
+ }
+ } else if (dd->verbs_layer.l_rcv) {
+ dd->verbs_layer.l_rcv(t, rc + 1, ebuf, tlen);
+ } else {
+ _IPATH_VDBG("received IB packet, not SMA (QP=%x)\n",
+ qp);
+ }
+ } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
+ /* only the kernel QP with the layered driver's opcode is taken */
+ if (qp == IPATH_KD_QP && bthbytes[0] ==
+ dd->ipath_layer.l_rcv_opcode && ebuf)
+ ipath_rcv_layer(t, etail, tlen,
+ (struct ether_header_typ *)hdr);
+ else
+ _IPATH_PDBG
+ ("typ %x, opcode %x (eager, qp=%x), len %x; ignored\n",
+ etype, bthbytes[0], qp, tlen);
+ } else if (etype == RCVHQ_RCV_TYPE_EXPECTED) {
+ _IPATH_DBG("Bug: Expected TID, opcode %x; ignored\n",
+ hdr->bth[0] & 0xff);
+ } else if (eflags &
+ (INFINIPATH_RHF_H_TIDERR | INFINIPATH_RHF_H_IHDRERR))
+ {
+ /*
+ * This is a type 3 packet, only the LRH is in
+ * the rcvhdrq, the rest of the header is in
+ * the eager buffer.
+ */
+ uint8_t opcode;
+ if (ebuf) {
+ bthbytes = (uint8_t *) ebuf;
+ opcode = *bthbytes;
+ } else
+ opcode = 0;
+ get_rhf_errstring(eflags, emsg, sizeof emsg);
+ _IPATH_DBG
+ ("Err %x (%s), opcode %x, egrbuf %x, len %x\n",
+ eflags, emsg, opcode, etail, tlen);
+ } else {
+ /*
+ * error packet, type of error unknown.
+ * Probably type 3, but we don't know, so don't
+ * even try to print the opcode, etc.
+ */
+ _IPATH_DBG
+ ("Error Pkt, but no eflags! egrbuf %x, len %x\n"
+ "hdrq@%lx;hdrq+%x rhf: %llx; hdr %llx %llx %llx %llx %llx\n",
+ etail, tlen, (unsigned long)rc, l, rc[0], rc[1],
+ rc[2], rc[3], rc[4], rc[5]);
+ }
+ /* step to the next entry, wrapping at the end of the queue */
+ l += rsize;
+ if (l >= maxcnt)
+ l = 0;
+ /*
+ * update for each packet, to help prevent overflows if we have
+ * lots of packets.
+ */
+ (void)ipath_kput_ureg(t, ur_rcvhdrhead, l, 0);
+ if (etype != RCVHQ_RCV_TYPE_EXPECTED)
+ (void)ipath_kput_ureg(t, ur_rcvegrindexhead, etail, 0);
+ }
+
+ pkttot += i;
+
+ dd->ipath_port0head = l;
+
+ if (hdrqtail != *dd->ipath_hdrqtailptr)
+ goto gotmore; /* more arrived while we handled first batch */
+
+ /* per-call statistics; totcalls counts only calls that got this far */
+ if (pkttot > ipath_stats.sps_maxpkts_call)
+ ipath_stats.sps_maxpkts_call = pkttot;
+ ipath_stats.sps_port0pkts += pkttot;
+ ipath_stats.sps_avgpkts_call = ipath_stats.sps_port0pkts / ++totcalls;
+
+ if (sma_this_time) /* only once at end, not each time */
+ wake_up_interruptible(&ipath_sma_wait);
+
+done:
+ /* let the next caller in */
+ clear_bit(0, &dd->ipath_rcv_pending);
+ smp_mb__after_clear_bit();
+}
+
+/*
+ * Refresh the shadow copy of the PIO availability register map from the
+ * copy the chip DMAs into memory.  Called whenever our local copy indicates
+ * we have run out of send buffers.
+ * NOTE: This can be called from interrupt context by ipath_bufavail()
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(const ipath_type t)
+{
+	struct ipath_devdata *dd = &devdata[t];
+	unsigned long flags;
+	int i;
+	const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
+
+	/*
+	 * If the generation (check) bits have changed, then we update the
+	 * busy bit for the corresponding PIO buffer.  This algorithm will
+	 * modify positions to the value they already have in some cases
+	 * (i.e., no change), but it's faster than changing only the bits
+	 * that have changed.
+	 *
+	 * We would like to do this atomically, to avoid spinlocks in the
+	 * critical send path, but that's not really possible, given the
+	 * type of changes, and that this routine could be called on
+	 * multiple cpu's simultaneously, so we lock in this routine only,
+	 * to avoid conflicting updates; all we change is the shadow, and
+	 * it's a single 64 bit memory location, so by definition the update
+	 * is atomic in terms of what other cpu's can see in testing the
+	 * bits.  The spin_lock overhead isn't too bad, since it only
+	 * happens when all buffers are in use, so only cpu overhead,
+	 * not latency or bandwidth is affected.
+	 */
+#define _IPATH_ALL_CHECKBITS 0x5555555555555555ULL
+	if (!dd->ipath_pioavailregs_dma) {
+		_IPATH_DBG("Update shadow pioavail, but regs_dma NULL!\n");
+		return;
+	}
+	if (infinipath_debug & __IPATH_VERBDBG) {
+		/* only if packet debug and verbose */
+		_IPATH_PDBG("Refill avail, dma0=%llx shad0=%llx, "
+			    "d1=%llx s1=%llx, d2=%llx s2=%llx, d3=%llx s3=%llx\n",
+			    dd->ipath_pioavailregs_dma[0],
+			    dd->ipath_pioavailshadow[0],
+			    dd->ipath_pioavailregs_dma[1],
+			    dd->ipath_pioavailshadow[1],
+			    dd->ipath_pioavailregs_dma[2],
+			    dd->ipath_pioavailshadow[2],
+			    dd->ipath_pioavailregs_dma[3],
+			    dd->ipath_pioavailshadow[3]);
+		if (piobregs > 4)
+			_IPATH_PDBG("2nd group, dma4=%llx shad4=%llx, "
+				    "d5=%llx s5=%llx, d6=%llx s6=%llx, d7=%llx s7=%llx\n",
+				    dd->ipath_pioavailregs_dma[4],
+				    dd->ipath_pioavailshadow[4],
+				    dd->ipath_pioavailregs_dma[5],
+				    dd->ipath_pioavailshadow[5],
+				    dd->ipath_pioavailregs_dma[6],
+				    dd->ipath_pioavailshadow[6],
+				    dd->ipath_pioavailregs_dma[7],
+				    dd->ipath_pioavailshadow[7]);
+	}
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (i = 0; i < piobregs; i++) {
+		uint64_t dmaval, samegen, busymask, updated;
+
+		/* Chip Errata: bug 6641; even and odd qwords>3 are swapped */
+		dmaval = dd->ipath_pioavailregs_dma[i > 3 ? i ^ 1 : i];
+		/* check-bit positions where shadow and DMA copy agree */
+		samegen = _IPATH_ALL_CHECKBITS &
+			~(dd->ipath_pioavailshadow[i] ^ dmaval);
+		busymask = samegen << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
+		if (!samegen)
+			continue;
+		if (!(busymask & dd->ipath_pioavailshadow[i]))
+			continue;
+		/* take the selected busy bits from the DMA copy */
+		updated = dd->ipath_pioavailshadow[i] & ~busymask;
+		updated |= dmaval & busymask;
+		dd->ipath_pioavailshadow[i] = updated;
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/*
+ * Per-port setup requested by a user process: check the user's software
+ * version, apply its receive header size and eager skip, lock the page that
+ * receives the rcvhdr queue tail writeback, compute the port's PIO buffer
+ * base, create the rcvhdrq and eager buffers, then enable the port.
+ *
+ * pd    - port being initialized
+ * uinfo - user-space ipath_user_info describing the request
+ *
+ * Returns 0 on success, -EFAULT if uinfo cannot be copied in, -ENODEV on a
+ * major version mismatch, -EINVAL for a missing/inaccessible tail address
+ * or a failed register read-back, or the error returned by
+ * ipath_setrcvhdrsize(), ipath_get_upages_nocopy(), ipath_create_rcvhdrq()
+ * or ipath_create_user_egr().
+ */
+static int ipath_do_user_init(struct ipath_portdata *pd,
+ struct ipath_user_info __user *uinfo)
+{
+ int ret = 0;
+ ipath_type t = pd->port_unit;
+ struct ipath_devdata *dd = &devdata[t];
+ struct ipath_user_info kinfo;
+
+ if (copy_from_user(&kinfo, uinfo, sizeof kinfo))
+ ret = -EFAULT;
+ else {
+ /* for now, if major version is different, bail */
+ if ((kinfo.spu_userversion >> 16) != IPATH_USER_SWMAJOR) {
+ _IPATH_INFO
+ ("User major version %d not same as driver major %d\n",
+ kinfo.spu_userversion >> 16, IPATH_USER_SWMAJOR);
+ ret = -ENODEV;
+ } else {
+ /* a minor version mismatch is only logged */
+ if ((kinfo.spu_userversion & 0xffff) !=
+ IPATH_USER_SWMINOR)
+ _IPATH_DBG
+ ("User minor version %d not same as driver minor %d\n",
+ kinfo.spu_userversion & 0xffff,
+ IPATH_USER_SWMINOR);
+ if (kinfo.spu_rcvhdrsize) {
+ if ((ret =
+ ipath_setrcvhdrsize(t,
+ kinfo.spu_rcvhdrsize)))
+ goto done;
+ } else if (!dd->ipath_rcvhdrsize) {
+ /*
+ * first user of field, kernel or user
+ * code, and using default
+ */
+ dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+ ipath_kput_kreg(pd->port_unit, kr_rcvhdrsize,
+ dd->ipath_rcvhdrsize);
+ _IPATH_VDBG
+ ("Use default protocol header size %u\n",
+ dd->ipath_rcvhdrsize);
+ }
+
+ /* eager skip is rounded down to a multiple of 4 bytes */
+ pd->port_egrskip = kinfo.spu_egrskip;
+ if (pd->port_egrskip) {
+ if (pd->port_egrskip & 3) {
+ _IPATH_DBG
+ ("eager skip 0x%x invalid, must be word multiple; using 0x%x\n",
+ pd->port_egrskip,
+ pd->port_egrskip & ~3);
+ pd->port_egrskip &= ~3;
+ }
+ _IPATH_DBG
+ ("user reserves 0x%x bytes at start of eager TIDs\n",
+ pd->port_egrskip);
+ }
+
+ /*
+ * for now we do nothing with rcvhdrcnt:
+ * kinfo.spu_rcvhdrcnt
+ */
+
+ /*
+ * set up for the rcvhdr Q tail register writeback
+ * to user memory
+ */
+ if (kinfo.spu_rcvhdraddr &&
+ access_ok(VERIFY_WRITE,
+ (uint64_t __user *) kinfo.spu_rcvhdraddr,
+ sizeof(uint64_t))) {
+ uint64_t physaddr, uaddr, off, atmp;
+ struct page *pagep;
+ /* split the user address into page base and offset */
+ off = offset_in_page(kinfo.spu_rcvhdraddr);
+ uaddr =
+ PAGE_MASK & (unsigned long)kinfo.
+ spu_rcvhdraddr;
+ if ((ret = ipath_get_upages_nocopy(uaddr, &pagep))) {
+ _IPATH_INFO
+ ("Failed to lookup and lock address %llx for rcvhdrtail: errno %d\n",
+ kinfo.spu_rcvhdraddr, -ret);
+ goto done;
+ }
+ ipath_stats.sps_pagelocks++;
+ pd->port_rcvhdrtail_uaddr = uaddr;
+ pd->port_rcvhdrtail_pagep = pagep;
+ pd->port_rcvhdrtail_kvaddr =
+ page_address(pagep);
+ pd->port_rcvhdrtail_kvaddr += off;
+ physaddr = page_to_phys(pagep) + off;
+ _IPATH_VDBG
+ ("port %d user addr %llx hdrtailaddr, %llx physical (off=%llx)\n",
+ pd->port_port, kinfo.spu_rcvhdraddr,
+ physaddr, off);
+ /* program the tail address, then read it back to verify */
+ ipath_kput_kreg_port(t, kr_rcvhdrtailaddr,
+ pd->port_port, physaddr);
+ atmp =
+ ipath_kget_kreg64_port(t, kr_rcvhdrtailaddr,
+ pd->port_port);
+ /*
+ * NOTE(review): on this failure the page stays recorded in pd
+ * and is not released here; presumably freed when the port is
+ * closed -- confirm.
+ */
+ if (physaddr != atmp) {
+ _IPATH_UNIT_ERROR(t,
+ "Catastrophic software error, RcvHdrTailAddr%u written as %llx, read back as %llx\n",
+ pd->port_port,
+ physaddr, atmp);
+ ret = -EINVAL;
+ goto done;
+ }
+ } else {
+ _IPATH_DBG
+ ("Port %d rcvhdrtail addr %llx not valid\n",
+ pd->port_port, kinfo.spu_rcvhdraddr);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * for right now, kernel piobufs are at end,
+ * so port 1 is at 0
+ */
+ pd->port_piobufs = dd->ipath_piobufbase +
+ dd->ipath_pbufsport * (pd->port_port -
+ 1) * dd->ipath_palign;
+ _IPATH_VDBG("Set base of piobufs for port %u to 0x%x\n",
+ pd->port_port, pd->port_piobufs);
+
+ /*
+ * Now allocate the rcvhdr Q and eager TIDs;
+ * skip the TID array for time being.
+ * If pd->port_port > chip-supported, we need
+ * to do extra stuff here to handle by handling
+ * overflow through port 0, someday
+ */
+ if (!(ret = ipath_create_rcvhdrq(pd)))
+ ret = ipath_create_user_egr(pd);
+ if (!ret) { /* enable receives now */
+ uint64_t head;
+ uint32_t head32;
+ /* atomically set enable bit for this port */
+ atomic_set_mask(1U <<
+ (INFINIPATH_R_PORTENABLE_SHIFT +
+ pd->port_port),
+ &dd->ipath_rcvctrl);
+
+ /*
+ * set the head registers for this port
+ * to the current values of the tail
+ * pointers, since we don't know if they
+ * were updated on last use of the port.
+ */
+ head32 =
+ ipath_kget_ureg32(t, ur_rcvhdrtail,
+ pd->port_port);
+ head = (uint64_t) head32;
+ ipath_kput_ureg(t, ur_rcvhdrhead, head,
+ pd->port_port);
+ head32 =
+ ipath_kget_ureg32(t, ur_rcvegrindextail,
+ pd->port_port);
+ ipath_kput_ureg(t, ur_rcvegrindexhead, head32,
+ pd->port_port);
+ dd->ipath_lastegrheads[pd->port_port] = -1;
+ dd->ipath_lastrcvhdrqtails[pd->port_port] = -1;
+ _IPATH_VDBG
+ ("Wrote port%d head %llx, egrhead %x from tail regs\n",
+ pd->port_port, head, head32);
+ /* start at beginning after open */
+ pd->port_tidcursor = 0;
+ {
+ /*
+ * now enable the port; the tail
+ * registers will be written to
+ * memory by the chip as soon
+ * as it sees the write to
+ * kr_rcvctrl. The update only
+ * happens on transition from 0
+ * to 1, so clear it first, then
+ * set it as part of enabling
+ * the port. This will (very
+ * briefly) affect any other open
+ * ports, but it shouldn't be long
+ * enough to be an issue.
+ */
+ ipath_kput_kreg(t, kr_rcvctrl,
+ dd->
+ ipath_rcvctrl &
+ ~INFINIPATH_R_TAILUPD);
+ ipath_kput_kreg(t, kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ }
+ }
+ }
+ }
+
+done:
+ return ret;
+}
+
+/*
+ * Fill in an ipath_base_info describing the port's receive queue, eager
+ * buffers, PIO buffers and register offsets, and copy it to user space.
+ *
+ * pd    - port to describe
+ * ubase - user-space destination for the structure
+ *
+ * Returns 0 on success or -EFAULT if the copy to user space fails.
+ */
+static int ipath_get_baseinfo(struct ipath_portdata *pd,
+			      struct ipath_base_info __user *ubase)
+{
+	int ret = 0;
+	struct ipath_base_info kbase;
+	struct ipath_devdata *dd = &devdata[pd->port_unit];
+
+	/* be sure anything we don't set is 0ed */
+	memset(&kbase, 0, sizeof kbase);
+	kbase.spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
+	kbase.spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
+	kbase.spi_tidegrcnt = dd->ipath_rcvegrcnt;
+	kbase.spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
+	/* have to mmap whole thing */
+	kbase.spi_rcv_egrbuftotlen = pd->port_rcvegrbuf_chunks * PAGE_SIZE *
+		(1 << pd->port_rcvegrbuf_order);
+	kbase.spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
+	/*
+	 * A port with no eager chunks reports a chunk size of 0 rather
+	 * than dividing by zero.
+	 */
+	kbase.spi_rcv_egrchunksize = pd->port_rcvegrbuf_chunks ?
+		kbase.spi_rcv_egrbuftotlen / pd->port_rcvegrbuf_chunks : 0;
+	kbase.spi_tidcnt = dd->ipath_rcvtidcnt;
+	/*
+	 * for this use, may be ipath_cfgports summed over all chips that
+	 * are configured and present
+	 */
+	kbase.spi_nports = dd->ipath_cfgports;
+	kbase.spi_unit = pd->port_unit;	/* unit (chip/board) our port is on */
+	/* for now, only a single page */
+	kbase.spi_tid_maxsize = PAGE_SIZE;
+
+	/*
+	 * doing this per port, and based on the skip value, etc.
+	 * This has to be the actual buffer size, since the protocol
+	 * code treats it as an array.
+	 *
+	 * These have to be set to user addresses in the user code via mmap
+	 * These values are used on return to user code for the mmap target
+	 * addresses only.  For 32 bit, same 44 bit address problem, so use
+	 * the physical address, not virtual.  Before 2.6.11, using the
+	 * page_address() macro worked, but in 2.6.11, even that returns
+	 * the full 64 bit address (upper bits all 1's).
+	 * So far, using the physical addresses (or chip offsets, for
+	 * chip mapping) works, but no doubt some future kernel release
+	 * will change that, and we'll be on to yet another method of
+	 * dealing with this
+	 */
+	kbase.spi_rcvhdr_base = (uint64_t) pd->port_rcvhdrq_phys;
+	kbase.spi_rcv_egrbufs = (uint64_t) pd->port_rcvegr_phys;
+	kbase.spi_pioavailaddr = (uint64_t) dd->ipath_pioavailregs_phys;
+	/*
+	 * The status word sits at the same offset from the pioavail
+	 * registers physically as it does virtually.  Take the byte offset
+	 * first and add it as an integer, so the 64-bit address is never
+	 * squeezed through a pointer type.
+	 */
+	kbase.spi_status = (uint64_t) kbase.spi_pioavailaddr +
+		((char *) dd->ipath_statusp -
+		 (char *) dd->ipath_pioavailregs_dma);
+	kbase.spi_piobufbase = (uint64_t) pd->port_piobufs;
+	kbase.__spi_uregbase =
+		dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
+
+	kbase.spi_pioindex = dd->ipath_pbufsport * (pd->port_port - 1);
+	kbase.spi_piocnt = dd->ipath_pbufsport;
+	kbase.spi_pioalign = dd->ipath_palign;
+
+	kbase.spi_qpair = IPATH_KD_QP;
+	kbase.spi_piosize = dd->ipath_ibmaxlen;
+	kbase.spi_mtu = dd->ipath_ibmaxlen;	/* maxlen, not ibmtu */
+	kbase.spi_port = pd->port_port;
+	kbase.spi_sw_version = IPATH_KERN_SWVERSION;
+	kbase.spi_hw_version = dd->ipath_revision;
+
+	if (copy_to_user(ubase, &kbase, sizeof kbase))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+/*
+ * Report how many units the driver supports: infinipath_max as soon as any
+ * unit has IPATH_INITTED set in its flags, otherwise 0.
+ */
+static int ipath_get_units(void)
+{
+	int unit = 0;
+
+	while (unit < infinipath_max) {
+		if (devdata[unit].ipath_flags & IPATH_INITTED)
+			return infinipath_max;
+		unit++;
+	}
+	return 0;
+}
+
+/*
+ * Write data to the EEPROM on the board.
+ *
+ * pd  - port whose unit owns the EEPROM
+ * req - user-space request giving the source address, offset and length
+ *
+ * Only the first 128 bytes may be written.  The range check is done
+ * without adding offset and len together, so a large offset cannot wrap
+ * the sum back under the limit.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EFAULT if the
+ * request or the data cannot be copied in, -EINVAL for a NULL address or
+ * an out-of-range offset/length, -ENOMEM if the bounce buffer cannot be
+ * allocated, or -ENXIO if the EEPROM write fails.
+ */
+static int ipath_wr_eeprom(struct ipath_portdata* pd,
+			   struct ipath_eeprom_req __user *req)
+{
+	int ret = 0;
+	struct ipath_eeprom_req kreq;
+	void *buf;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;	/* not just any old user can write flash */
+	if (copy_from_user(&kreq, req, sizeof kreq))
+		return -EFAULT;
+	if (!kreq.addr || kreq.offset > 128 ||
+	    kreq.len > 128 - kreq.offset) {
+		_IPATH_DBG
+		    ("called with NULL addr %llx, or bad cnt %u or offset %u\n",
+		     kreq.addr, kreq.len, kreq.offset);
+		return -EINVAL;
+	}
+
+	if (!(buf = vmalloc(kreq.len))) {
+		_IPATH_UNIT_ERROR(pd->port_unit,
+				  "Couldn't allocate memory to write %u bytes from eeprom\n",
+				  kreq.len);
+		return -ENOMEM;
+	}
+	if (copy_from_user(buf, (void __user *) kreq.addr, kreq.len)) {
+		ret = -EFAULT;
+		goto done;
+	}
+	if (ipath_eeprom_write(pd->port_unit, kreq.offset, buf, kreq.len)) {
+		ret = -ENXIO;
+		_IPATH_UNIT_ERROR(pd->port_unit,
+				  "Failed write to eeprom %u bytes offset %u\n",
+				  kreq.len, kreq.offset);
+	}
+
+done:
+	vfree(buf);
+	return ret;
+}
+
+/*
+ * Read data from the EEPROM on the board.
+ *
+ * port_unit - unit whose EEPROM is read
+ * req       - user-space request giving the destination address, offset
+ *             and length
+ *
+ * Only the first 128 bytes may be read.  The range check is done without
+ * adding offset and len together, so a large offset cannot wrap the sum
+ * back under the limit.  Nothing is copied to user space when the EEPROM
+ * read fails, because the bounce buffer would still hold uninitialized
+ * kernel memory.
+ *
+ * Returns 0 on success, -EFAULT if the request cannot be copied in or the
+ * data cannot be copied out, -EINVAL for a NULL address or an out-of-range
+ * offset/length, -ENOMEM if the bounce buffer cannot be allocated, or
+ * -ENXIO if the EEPROM read fails.
+ */
+int ipath_rd_eeprom(const ipath_type port_unit,
+		    struct ipath_eeprom_req __user *req)
+{
+	int ret = 0;
+	struct ipath_eeprom_req kreq;
+	void *buf;
+
+	if (copy_from_user(&kreq, req, sizeof kreq))
+		return -EFAULT;
+	if (!kreq.addr || kreq.offset > 128 ||
+	    kreq.len > 128 - kreq.offset) {
+		_IPATH_DBG
+		    ("called with NULL addr %llx, or bad cnt %u or offset %u\n",
+		     kreq.addr, kreq.len, kreq.offset);
+		return -EINVAL;
+	}
+
+	if (!(buf = vmalloc(kreq.len))) {
+		_IPATH_UNIT_ERROR(port_unit,
+				  "Couldn't allocate memory to read %u bytes from eeprom\n",
+				  kreq.len);
+		return -ENOMEM;
+	}
+	if (ipath_eeprom_read(port_unit, kreq.offset, buf, kreq.len)) {
+		ret = -ENXIO;
+		_IPATH_UNIT_ERROR(port_unit,
+				  "Failed reading %u bytes offset %u from eeprom\n",
+				  kreq.len, kreq.offset);
+		goto done;
+	}
+	if (copy_to_user((void __user *) kreq.addr, buf, kreq.len))
+		ret = -EFAULT;
+
+done:
+	vfree(buf);
+	return ret;
+}
+
+/*
+ * wait for something to happen on a port. Currently this is
+ * PIO buffer available, or a packet being received. For now, at
+ * least, we wait no longer than 1/2 seconds on rcv, 1 tick on PIO, so
+ * we recover from any bugs (or, as we see in ips.c init and close, cases
+ * where other side isn't yet ready).
+ * NOTE: currently called only with PIO or RCV, never both, so path with both
+ * has not been tested
+ *
+ * pd   - port to wait on
+ * flag - IPATH_WAIT_RCV and/or IPATH_WAIT_PIO; for a receive wait the bits
+ *        above bit 15 carry the caller's rcvhdrq head
+ *
+ * Returns -EINVAL if neither wait bit is set, otherwise 0 whether the
+ * event arrived, the wait timed out, or no wait was needed.
+ */
+static int ipath_wait_intr(struct ipath_portdata * pd, uint32_t flag)
+{
+ struct ipath_devdata *dd = &devdata[pd->port_unit];
+ /* stupid compiler can't tell it's initialized */
+ uint32_t im = 0;
+ uint32_t head, tail, timeo = 0, wflag = 0;
+
+ if (!(flag & (IPATH_WAIT_RCV | IPATH_WAIT_PIO)))
+ return -EINVAL;
+ if (flag & IPATH_WAIT_RCV) {
+ head = flag >> 16;
+ /* enable the receive-available interrupt for this port */
+ im = (1U << pd->port_port) << INFINIPATH_R_INTRAVAIL_SHIFT;
+ atomic_set_mask(im, &dd->ipath_rcvctrl);
+ /*
+ * now, before blocking, make sure that head is still == tail,
+ * reading from the chip, so we can be sure the interrupt enable
+ * has made it to the chip. If not equal, disable
+ * interrupt again and return immediately. This avoids
+ * races, and the overhead of the chip read doesn't
+ * matter much at this point, since we are waiting for
+ * something anyway.
+ */
+ ipath_kput_kreg(pd->port_unit, kr_rcvctrl, dd->ipath_rcvctrl);
+ tail =
+ ipath_kget_ureg32(pd->port_unit, ur_rcvhdrtail,
+ pd->port_port);
+ if (tail == head) {
+ timeo = HZ / 2;
+ wflag = IPATH_PORT_WAITING_RCV;
+ } else {
+ atomic_clear_mask(im, &dd->ipath_rcvctrl);
+ ipath_kput_kreg(pd->port_unit, kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ }
+ }
+ if (flag & IPATH_WAIT_PIO) {
+ /*
+ * this one's a bit worse than the receive case, in that we
+ * can't really verify that at least one interrupt
+ * will happen...
+ * We do use a really short timeout, however
+ */
+ timeo = 1; /* if both, the short PIO timeout wins */
+ atomic_set_mask(1U << pd->port_port, &dd->ipath_portpiowait);
+ wflag |= IPATH_PORT_WAITING_PIO;
+ /*
+ * this has a possible race with the ipath stuff, so do
+ * it atomically
+ */
+ atomic_set_mask(INFINIPATH_S_PIOINTBUFAVAIL,
+ &dd->ipath_sendctrl);
+ ipath_kput_kreg(pd->port_unit, kr_sendctrl, dd->ipath_sendctrl);
+ }
+ if (wflag) {
+ /*
+ * Sleep until at least one of the bits set here has been cleared
+ * in port_flag, or the timeout expires.  The return value of the
+ * wait is not examined, so an interrupted wait is handled the
+ * same way as a timeout.
+ */
+ pd->port_flag |= wflag;
+ wait_event_interruptible_timeout(pd->port_wait,
+ (pd->port_flag & wflag) !=
+ wflag, timeo);
+ if (wflag & pd->port_flag & IPATH_PORT_WAITING_PIO) {
+ /* timed out, no PIO interrupts */
+ atomic_clear_mask(IPATH_PORT_WAITING_PIO,
+ &pd->port_flag);
+ pd->port_piowait_to++;
+ atomic_clear_mask(1U << pd->port_port,
+ &dd->ipath_portpiowait);
+ /*
+ * *don't* clear the pio interrupt enable;
+ * let that happen in the interrupt handler;
+ * else we have a race condition.
+ */
+ }
+ if (wflag & pd->port_flag & IPATH_PORT_WAITING_RCV) {
+ /* timed out, no packets received */
+ atomic_clear_mask(IPATH_PORT_WAITING_RCV,
+ &pd->port_flag);
+ pd->port_rcvwait_to++;
+ atomic_clear_mask(im, &dd->ipath_rcvctrl);
+ ipath_kput_kreg(pd->port_unit, kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ }
+ } else {
+ /* else it's already happened, don't do wait_event overhead */
+ if (flag & IPATH_WAIT_RCV)
+ pd->port_rcvnowait++;
+ if (flag & IPATH_WAIT_PIO)
+ pd->port_pionowait++;
+ }
+ return 0;
+}
+
+/*
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller. To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ *
+ * pd   - port the TIDs are allocated for
+ * tidu - user-space request: page count, first virtual address, and the
+ *        user addresses that receive the TID list and the TID bitmap
+ *
+ * Returns 0 on success; -ENOMEM if there is no page shadow array or not
+ * enough free TIDs; -EFAULT if the request cannot be copied in, asks for
+ * zero TIDs, names an inaccessible range, or the results cannot be copied
+ * out; or the error from ipath_get_upages() (other than -EBUSY, which is
+ * tolerated).
+ */
+static int ipath_tid_update(struct ipath_portdata * pd,
+ struct _tidupd __user *tidu)
+{
+ int ret = 0, ntids;
+ uint32_t tid, porttid, cnt, i, tidcnt;
+ struct _tidupd tu;
+ uint16_t *tidlist;
+ struct ipath_devdata *dd = &devdata[pd->port_unit];
+ uint64_t vaddr, physaddr, lenvalid;
+ uint64_t __iomem *tidbase;
+ /* NOTE(review): 512 bits; assumes ipath_rcvtidcnt <= 512 -- confirm */
+ uint64_t tidmap[8];
+ struct page **pagep = NULL;
+
+ tu.tidcnt = 0; /* for early errors */
+ if (!dd->ipath_pageshadow) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ if (copy_from_user(&tu, tidu, sizeof tu)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ if (!(cnt = tu.tidcnt)) {
+ _IPATH_DBG("After copyin, tidcnt 0, tidlist %llx\n",
+ tu.tidlist);
+ /* or should we treat as success? likely a bug */
+ ret = -EFAULT;
+ goto done;
+ }
+ tidcnt = dd->ipath_rcvtidcnt;
+ if (cnt >= tidcnt) { /* make sure it all fits in port_tid_pg_list */
+ _IPATH_INFO
+ ("Process tried to allocate %u TIDs, only trying max (%u)\n",
+ cnt, tidcnt);
+ cnt = tidcnt;
+ }
+ /* port_tid_pg_list holds cnt page pointers followed by the TID list */
+ pagep = (struct page **)pd->port_tid_pg_list;
+ tidlist = (uint16_t *) (&pagep[cnt]);
+
+ memset(tidmap, 0, sizeof(tidmap));
+ tid = pd->port_tidcursor;
+ /* before decrement; chip actual # */
+ porttid = pd->port_port * tidcnt;
+ /* budget of TID slots that may still be examined across all pages */
+ ntids = tidcnt;
+ tidbase = (uint64_t __iomem *)
+ (((char __iomem *) devdata[pd->port_unit].ipath_kregbase) +
+ devdata[pd->port_unit].ipath_rcvtidbase +
+ porttid * sizeof(*tidbase));
+
+ _IPATH_VDBG("Port%u %u tids, cursor %u, tidbase %p\n", pd->port_port,
+ cnt, tid, tidbase);
+
+ vaddr = tu.tidvaddr; /* virtual address of first page in transfer */
+ if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, cnt * PAGE_SIZE)) {
+ _IPATH_DBG("Fail vaddr %llx, %u pages, !access_ok\n",
+ vaddr, cnt);
+ ret = -EFAULT;
+ goto done;
+ }
+ if ((ret = ipath_get_upages((unsigned long)vaddr, cnt, pagep))) {
+ if (ret == -EBUSY) {
+ _IPATH_DBG
+ ("Failed to lock addr %p, %u pages (already locked)\n",
+ (void *)vaddr, cnt);
+ /*
+ * for now, continue, and see what happens
+ * but with the new implementation, this should
+ * never happen, unless perhaps the user has
+ * mpin'ed the pages themselves (something we
+ * need to test)
+ */
+ ret = 0;
+ } else {
+ _IPATH_INFO
+ ("Failed to lock addr %p, %u pages: errno %d\n",
+ (void *)vaddr, cnt, -ret);
+ goto done;
+ }
+ }
+ for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+ /* advance the cursor, with wrap, to the next unused shadow slot */
+ for (; ntids--; tid++) {
+ if (tid == tidcnt)
+ tid = 0;
+ if (!dd->ipath_pageshadow[porttid + tid])
+ break;
+ }
+ if (ntids < 0) {
+ /*
+ * oops, wrapped all the way through their TIDs,
+ * and didn't have enough free; see comments at
+ * start of routine
+ */
+ _IPATH_DBG
+ ("Not enough free TIDs for %u pages (index %d), failing\n",
+ cnt, i);
+ /* NOTE(review): i is unsigned, so this wraps when i is 0 */
+ i--; /* last tidlist[i] not filled in */
+ ret = -ENOMEM;
+ break;
+ }
+ tidlist[i] = tid;
+ _IPATH_VDBG("Updating idx %u to TID %u, vaddr %llx\n",
+ i, tid, vaddr);
+ /* for now we "know" system pages and TID pages are same size */
+ /* for ipath_free_tid */
+ dd->ipath_pageshadow[porttid + tid] = pagep[i];
+ __set_bit(tid, tidmap); /* don't need atomic or its overhead */
+ physaddr = page_to_phys(pagep[i]);
+ ipath_stats.sps_pagelocks++;
+ _IPATH_VDBG("TID %u, vaddr %llx, physaddr %llx pgp %p\n",
+ tid, vaddr, physaddr, pagep[i]);
+ /*
+ * in words (fixed, full page). could make less for very last
+ * page in transfer, but for now we won't worry about it.
+ */
+ lenvalid = PAGE_SIZE >> 2;
+ lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+ physaddr |= lenvalid | INFINIPATH_RT_VALID;
+ ipath_kput_memq(pd->port_unit, &tidbase[tid], physaddr);
+ /*
+ * don't check this tid in ipath_portshadow, since we
+ * just filled it in; start with the next one.
+ */
+ tid++;
+ }
+
+ if (ret) {
+ uint32_t limit;
+ uint64_t tidval;
+ /*
+ * chip errata bug 7358, try to work around it by
+ * marking invalid tids as having max length
+ */
+ tidval =
+ (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
+ INFINIPATH_RT_BUFSIZE_SHIFT;
+ cleanup:
+ /* jump here if copy out of updated info failed... */
+ _IPATH_DBG("After failure (ret=%d), undo %d of %d entries\n",
+ -ret, i, cnt);
+ /* same code that's in ipath_free_tid() */
+ if ((limit = sizeof(tidmap) * BITS_PER_BYTE) > tidcnt)
+ /* just in case size changes in future */
+ limit = tidcnt;
+ tid = find_first_bit((const unsigned long *)tidmap, limit);
+ /*
+ * chip errata bug 7358, try to work around it by
+ * marking invalid tids as having max length
+ * (set again here because the goto above enters at the cleanup
+ * label and skips the first assignment)
+ */
+ tidval =
+ (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
+ INFINIPATH_RT_BUFSIZE_SHIFT;
+ /* invalidate every TID this call set in tidmap */
+ for (; tid < limit; tid++) {
+ if (!test_bit(tid, tidmap))
+ continue;
+ if (dd->ipath_pageshadow[porttid + tid]) {
+ _IPATH_VDBG("Freeing TID %u\n", tid);
+ ipath_kput_memq(pd->port_unit, &tidbase[tid],
+ tidval);
+ dd->ipath_pageshadow[porttid + tid] = NULL;
+ ipath_stats.sps_pageunlocks++;
+ }
+ }
+ ipath_putpages(cnt, pagep);
+ } else {
+ /*
+ * copy the updated array, with ipath_tid's filled in,
+ * back to user. Since we did the copy in already, this
+ * "should never fail"
+ * If it does, we have to clean up...
+ */
+ int r;
+ if ((r = copy_to_user((void __user *) tu.tidlist, tidlist,
+ cnt * sizeof(*tidlist)))) {
+ _IPATH_DBG("Failed to copy out %d TIDs (%lx bytes) "
+ "to %llx (ret %x)\n", cnt,
+ cnt * sizeof(*tidlist), tu.tidlist, r);
+ ret = -EFAULT;
+ goto cleanup;
+ }
+ if (copy_to_user((void __user *) tu.tidmap, tidmap,
+ sizeof tidmap)) {
+ _IPATH_DBG("Failed to copy out TID map to %llx\n",
+ tu.tidmap);
+ ret = -EFAULT;
+ goto cleanup;
+ }
+ /* remember where to start searching on the next call */
+ if (tid == tidcnt)
+ tid = 0;
+ pd->port_tidcursor = tid;
+ }
+
+done:
+ if (ret)
+ _IPATH_DBG("Failed to map %u TID pages, failing with %d, "
+ "tidu %p\n", tu.tidcnt, -ret, tidu);
+ return ret;
+}
+
+/*
+ * Release the TIDs named in a user-supplied bitmap for this port.
+ *
+ * The _tidupd structure is copied in from tidu, and the TID bitmap is
+ * then copied in from the user address held in its tidmap field.  Bits
+ * are examined from the first set bit up to the smaller of the bitmap
+ * size and the per-port TID count (dd->ipath_rcvtidcnt), so only TIDs
+ * in range for this port are ever touched.  For each set bit whose
+ * shadow page entry is in use, the chip's TID entry is overwritten with
+ * the "invalid" value, the page is handed to ipath_putpages(), and the
+ * shadow entry is cleared; set bits with no shadow page are only logged.
+ * The page released is always the one we recorded in ipath_pageshadow,
+ * never an address supplied by the caller.
+ *
+ * Pages are released one at a time and, after the initial
+ * find_first_bit(), the bitmap is walked one bit at a time; no batching
+ * or gap skipping is done here.
+ *
+ * Returns 0 on success, -ENOMEM if the device has no page shadow array,
+ * or -EFAULT if either copy from user space fails.  A mismatch between
+ * tu.tidcnt and the number of bits set is logged but is not an error.
+ */
+
+static int ipath_tid_free(struct ipath_portdata * pd,
+			  struct _tidupd __user *tidu)
+{
+	struct ipath_devdata *dd = &devdata[pd->port_unit];
+	struct _tidupd tu;
+	uint64_t __iomem *tidbase;
+	uint64_t tidmap[8];
+	uint64_t invalid_tid;
+	uint32_t tid, porttid, nseen, maxtid, tidcnt;
+	int ret = 0;
+
+	/* the exit path logs tu.tidcnt, so give it a value for early errors */
+	tu.tidcnt = 0;
+
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	if (copy_from_user(&tu, tidu, sizeof tu)) {
+		_IPATH_DBG("copy of tidupd structure failed\n");
+		ret = -EFAULT;
+		goto done;
+	}
+	if (copy_from_user(tidmap, (void __user *) tu.tidmap, sizeof tidmap)) {
+		_IPATH_DBG("copy of tidmap failed\n");
+		ret = -EFAULT;
+		goto done;
+	}
+
+	/* index of this port's first entry in the shadow and chip TID arrays */
+	porttid = pd->port_port * dd->ipath_rcvtidcnt;
+	tidbase = (uint64_t __iomem *)
+		((char __iomem *) (dd->ipath_kregbase) +
+		 dd->ipath_rcvtidbase +
+		 porttid * sizeof(*tidbase));
+
+	/* never look past the bitmap, nor past this port's TIDs */
+	tidcnt = dd->ipath_rcvtidcnt;
+	maxtid = sizeof(tidmap) * BITS_PER_BYTE;
+	if (maxtid > tidcnt)
+		maxtid = tidcnt;
+
+	tid = find_first_bit((const unsigned long *)tidmap, maxtid);
+	_IPATH_VDBG
+	    ("Port%u free %u tids; first bit (max=%d) set is %d, porttid %u\n",
+	     pd->port_port, tu.tidcnt, maxtid, tid, porttid);
+
+	/*
+	 * chip errata bug 7358: try to work around it by marking invalid
+	 * tids as having max length
+	 */
+	invalid_tid =
+	    (-1LL & INFINIPATH_RT_BUFSIZE_MASK) << INFINIPATH_RT_BUFSIZE_SHIFT;
+
+	nseen = 0;
+	while (tid < maxtid) {
+		if (test_bit(tid, tidmap)) {
+			nseen++;
+			if (dd->ipath_pageshadow[porttid + tid]) {
+				_IPATH_VDBG("Freeing TID %u\n", tid);
+				ipath_kput_memq(pd->port_unit, &tidbase[tid],
+						invalid_tid);
+				ipath_putpages(1,
+					&dd->ipath_pageshadow[porttid + tid]);
+				dd->ipath_pageshadow[porttid + tid] = NULL;
+				ipath_stats.sps_pageunlocks++;
+			} else {
+				_IPATH_DBG("Unused tid %u, ignoring\n", tid);
+			}
+		}
+		tid++;
+	}
+
+	/* informational only; the free itself has already been done */
+	if (nseen != tu.tidcnt)
+		_IPATH_DBG("passed in tidcnt %d, only %d bits set in map\n",
+			   tu.tidcnt, nseen);
+done:
+	if (ret)
+		_IPATH_DBG("Failed to unmap %u TID pages, failing with %d\n",
+			   tu.tidcnt, -ret);
+	return ret;
+}
+
+/*
+ * Set the protocol receive header size for unit mdev; called from user
+ * init code, and also layered driver init.
+ *
+ * The size can be set only once per unit (tracked by IPATH_RCVHDRSZ_SET).
+ * Asking again for the value already in force succeeds without touching
+ * the chip; asking for a different value fails with -EAGAIN.  A first
+ * request larger than ipath_rcvhdrentsize less one 64-bit quantity
+ * (counted in 32-bit words) fails with -EOVERFLOW.  Otherwise the size
+ * is recorded and written to the kr_rcvhdrsize register.
+ *
+ * Returns 0 on success, or a negative errno as described above.
+ */
+int ipath_setrcvhdrsize(const ipath_type mdev, unsigned rhdrsize)
+{
+	if (devdata[mdev].ipath_flags & IPATH_RCVHDRSZ_SET) {
+		if (devdata[mdev].ipath_rcvhdrsize == rhdrsize) {
+			/* already set to this very value: nothing to do */
+			_IPATH_VDBG("Reuse same protocol header size %u\n",
+				    devdata[mdev].ipath_rcvhdrsize);
+			return 0;
+		}
+		_IPATH_INFO
+		    ("Error: can't set protocol header size %u, already %u\n",
+		     rhdrsize, devdata[mdev].ipath_rcvhdrsize);
+		return -EAGAIN;
+	}
+
+	if (rhdrsize >
+	    (devdata[mdev].ipath_rcvhdrentsize -
+	     (sizeof(uint64_t) / sizeof(uint32_t)))) {
+		_IPATH_DBG
+		    ("Error: can't set protocol header size %u (> max %u)\n",
+		     rhdrsize,
+		     devdata[mdev].ipath_rcvhdrentsize -
+		     (uint32_t) (sizeof(uint64_t) / sizeof(uint32_t)));
+		return -EOVERFLOW;
+	}
+
+	/* first time through: remember the size and program the chip */
+	devdata[mdev].ipath_flags |= IPATH_RCVHDRSZ_SET;
+	devdata[mdev].ipath_rcvhdrsize = rhdrsize;
+	ipath_kput_kreg(mdev, kr_rcvhdrsize,
+			devdata[mdev].ipath_rcvhdrsize);
+	_IPATH_VDBG("Set protocol header size to %u\n",
+		    devdata[mdev].ipath_rcvhdrsize);
+	return 0;
+}
+
+
+/*
+ * Find an available PIO buffer on unit mdev and mark it busy in the
+ * shadow copy of the PIO-available bits (dd->ipath_pioavailshadow).
+ * Each buffer i owns two shadow bits: bit 2*i+1 is tested and set to
+ * claim the buffer, and bit 2*i (the "generation" bit) is flipped when
+ * the claim succeeds.
+ *
+ * Returns a pointer to the buffer (ipath_piobase + i * ipath_palign),
+ * or NULL if no buffer could be claimed.  On success the buffer index
+ * is also stored through pbufnum when pbufnum is non-NULL.
+ *
+ * One scan pass examines at most (ipath_piobcnt - ipath_lastport_piobuf)
+ * buffers, wrapping back to ipath_lastport_piobuf at the end.  If the
+ * pass finds nothing and ipath_update_pio_bufs() has not yet been called
+ * in this invocation, it is called and exactly one more pass is made.
+ *
+ * On failure: ipath_upd_pio_shadow is set so that the next call updates
+ * before scanning, ipath_stats.sps_nopiobufs and ipath_consec_nopiobuf
+ * are incremented, and every 100000th consecutive failure dumps the
+ * dma'ed and shadow availability words.
+ * On success: if an update was done in this call and the layered driver
+ * registered an l_intr callback, it is called with
+ * IPATH_LAYER_INT_SEND_CONTINUE; the failure bookkeeping is cleared.
+ *
+ * Used by ipath_send_smapkt and ipath_layer_send
+ */
+uint32_t __iomem *ipath_getpiobuf(int mdev, uint32_t *pbufnum)
+{
+ int i, j, starti, updated = 0;
+ unsigned piobcnt, iter;
+ unsigned long flags;
+ struct ipath_devdata *dd = &devdata[mdev];
+ uint64_t *shadow = dd->ipath_pioavailshadow;
+ uint32_t __iomem *buf;
+
+ piobcnt = (unsigned)devdata[mdev].ipath_piobcnt;
+ starti = devdata[mdev].ipath_lastport_piobuf; /* lowest index this routine will hand out */
+ iter = piobcnt - starti; /* max buffers examined per scan pass */
+ if (dd->ipath_upd_pio_shadow) {
+ /*
+ * minor optimization. If the previous call found no buffers
+ * (it set ipath_upd_pio_shadow), start out by doing the update;
+ * continue and do the scan even if no buffers were updated,
+ * to be paranoid
+ */
+ ipath_update_pio_bufs(mdev);
+ updated = 1; /* update already done; don't repeat it if the scan comes up empty */
+ i = starti;
+ }
+ else
+ i = devdata[mdev].ipath_lastpioindex; /* resume just past the last buffer handed out */
+
+rescan:
+ /*
+ * while test_and_set_bit() is atomic, we do that and then the
+ * change_bit(), and the pair is not; so the whole scan is done
+ * under ipath_pioavail_lock with interrupts disabled.
+ * See if this is the cause of the remaining armlaunch errors.
+ */
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ for (j = 0; j < iter; j++, i++) {
+ if (i >= piobcnt)
+ i = starti; /* wrap around to the first candidate */
+ /*
+ * To avoid bus lock overhead, we first find a candidate
+ * buffer with a plain test_bit(), then do the atomic test
+ * and set, and continue with the next buffer if either
+ * shows the buffer as busy.
+ */
+ if (test_bit((2 * i) + 1, shadow) ||
+ test_and_set_bit((2 * i) + 1, shadow)) {
+ continue;
+ }
+ /* buffer i is now ours: flip its generation bit */
+ change_bit(2 * i, shadow);
+ break;
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+ if (j == iter) { /* the loop ran to completion: nothing was claimed */
+ /*
+ * first time through; shadow exhausted, but there may be
+ * real buffers available, so go see; rescan (once) after
+ * the update
+ */
+ if (!updated) {
+ ipath_update_pio_bufs(mdev);
+ updated = 1;
+ i = starti;
+ goto rescan;
+ }
+ dd->ipath_upd_pio_shadow = 1; /* make the next call update before it scans */
+ /* not atomic, but if we lose one once in a while, that's OK */
+ ipath_stats.sps_nopiobufs++;
+ if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+ _IPATH_DBG
+ ("%u pio sends with no bufavail; dmacopy: %llx %llx %llx %llx; shadow: %llx %llx %llx %llx\n",
+ dd->ipath_consec_nopiobuf,
+ dd->ipath_pioavailregs_dma[0],
+ dd->ipath_pioavailregs_dma[1],
+ dd->ipath_pioavailregs_dma[2],
+ dd->ipath_pioavailregs_dma[3],
+ shadow[0], shadow[1], shadow[2], shadow[3]);
+ /*
+ * 4 buffers per byte (2 bits each), 4 registers printed
+ * above; if there are more buffers than those cover,
+ * print the next 4 registers below
+ */
+ if (dd->ipath_piobcnt > (sizeof(shadow[0])
+ * 4 * 4))
+ _IPATH_DBG
+ ("2nd group: dmacopy: %llx %llx %llx %llx; shadow: %llx %llx %llx %llx\n",
+ devdata[mdev].ipath_pioavailregs_dma[4],
+ devdata[mdev].ipath_pioavailregs_dma[5],
+ devdata[mdev].ipath_pioavailregs_dma[6],
+ devdata[mdev].ipath_pioavailregs_dma[7],
+ shadow[4], shadow[5], shadow[6], shadow[7]);
+ }
+ return NULL; /* caller must treat this as "no buffer available" */
+ }
+
+ if (updated && devdata[mdev].ipath_layer.l_intr) {
+ /*
+ * an update was needed during this call, and now some
+ * buffers (at least this one we just got) are available,
+ * so tell the layered driver.
+ */
+ dd->ipath_layer.l_intr(mdev, IPATH_LAYER_INT_SEND_CONTINUE);
+ }
+
+ /*
+ * set next starting place. Since it's just an optimization,
+ * it doesn't matter who wins on this, so no locking.
+ * The value may equal piobcnt; the scan loop wraps it.
+ */
+ dd->ipath_lastpioindex = i + 1;
+ if (dd->ipath_upd_pio_shadow)
+ dd->ipath_upd_pio_shadow = 0;
+ if (dd->ipath_consec_nopiobuf)
+ dd->ipath_consec_nopiobuf = 0;
+ buf = (uint32_t __iomem *)(dd->ipath_piobase + i * dd->ipath_palign);
+ _IPATH_VDBG("Return piobuf %u @ %p\n", i, buf);
+ if (pbufnum)
+ *pbufnum = i;
+ return buf;
+}
+
+/*
+ * Probe-only counterpart of ipath_getpiobuf(): report whether some PIO
+ * buffer currently looks free, without claiming it.  Because nothing is
+ * allocated, a buffer reported here may be gone by the time the caller
+ * actually tries to send.
+ *
+ * The shadow bits for buffers ipath_lastport_piobuf..ipath_piobcnt-1
+ * are scanned; if none is free, ipath_update_pio_bufs() is called and
+ * the scan is repeated once.
+ *
+ * Returns 1 if a buffer appears available, 0 otherwise.
+ *
+ * NOTE: This can be called from interrupt context by ipath_intr()
+ * and from non-interrupt context by layer_send_getpiobuf().
+ */
+int ipath_bufavail(int mdev)
+{
+	uint64_t *shadow = devdata[mdev].ipath_pioavailshadow;
+	unsigned piobcnt = (unsigned)devdata[mdev].ipath_piobcnt;
+	int pass, i;
+
+	for (pass = 0; pass < 2; pass++) {
+		i = devdata[mdev].ipath_lastport_piobuf;
+		while (i < piobcnt) {
+			/* odd bit of each pair clear == buffer not busy */
+			if (!test_bit((2 * i) + 1, shadow))
+				return 1;
+			i++;
+		}
+		/* nothing on the first pass: update, then look once more */
+		if (pass == 0)
+			ipath_update_pio_bufs(mdev);
+	}
+
+	_IPATH_PDBG("No bufs avail\n");
+	return 0;
+}
+
+/*
+ * Send one packet described by a user-space struct ipath_sendpkt.
+ *
+ * This routine is no longer on any critical paths; it is used only
+ * for sending SMA packets, and some diagnostic usage.
+ * Because it's currently sma only, there are no checks to see if the
+ * link is up; sma must be able to send in the not fully initialized state
+ *
+ * The unit number is taken from sps_flags.  sps_cnt iovecs from the
+ * sps_iov[] array embedded in the structure describe the payload; each
+ * length must be a multiple of 4 bytes and the total must be non-empty
+ * and, with the two extra words accounted for, fit in ipath_ibmaxlen.
+ *
+ * Everything supplied by the user is validated, and the payload is
+ * gathered into a kernel buffer, before a PIO buffer is claimed, so a
+ * bad request never reaches the chip.
+ *
+ * Returns 0 on success, or:
+ *   -EFAULT  the descriptor or any payload fragment could not be copied
+ *   -ENODEV  the unit is out of range, not present, or not initialized
+ *   -EINVAL  bad iovec count, misaligned/oversized/empty payload
+ *   -ENOMEM  the temporary buffer could not be allocated
+ *   -EBUSY   no PIO buffer was available
+ */
+int ipath_send_smapkt(struct ipath_sendpkt __user *upkt)
+{
+	int i, ret = 0;
+	uint32_t __iomem *piobuf;
+	uint32_t plen, clen, pbufn;
+	uint64_t paylen = 0;	/* total payload bytes; 64 bits so it can't wrap */
+	struct ipath_sendpkt kpkt;
+	struct ipath_iovec *iov = kpkt.sps_iov;
+	ipath_type t;
+	uint32_t *tmpbuf = NULL;
+
+	if (unlikely(copy_from_user(&kpkt, upkt, sizeof kpkt))) {
+		ret = -EFAULT;
+		_IPATH_VDBG("Send failed: error %d\n", -ret);
+		goto done;
+	}
+	t = kpkt.sps_flags;
+	if (t >= infinipath_max || !(devdata[t].ipath_flags & IPATH_PRESENT) ||
+	    !devdata[t].ipath_kregbase) {
+		_IPATH_SMADBG("illegal unit %u for sma send\n", t);
+		return -ENODEV;
+	}
+	if (!(devdata[t].ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		_IPATH_SMADBG("unit %u not usable\n", t);
+		return -ENODEV;
+	}
+
+	/*
+	 * sps_cnt comes straight from user space; it must not index past
+	 * the iovec array that was copied in as part of kpkt.
+	 */
+	if (kpkt.sps_cnt > sizeof(kpkt.sps_iov) / sizeof(kpkt.sps_iov[0])) {
+		_IPATH_DBG("Invalid iovec count %u for sma send\n",
+			   (unsigned)kpkt.sps_cnt);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	/*
+	 * need total length before first word written.  Each fragment must
+	 * be a dword multiple (the word-based offsets below rely on it) and
+	 * is bounded individually so that the running sum cannot overflow.
+	 */
+	for (i = 0; i < kpkt.sps_cnt; i++) {
+		uint64_t len = kpkt.sps_iov[i].iov_len;
+
+		if ((len & 3) || len > devdata[t].ipath_ibmaxlen) {
+			_IPATH_DBG("Bad length 0x%llx in iovec %d\n",
+				   (unsigned long long)len, i);
+			ret = -EINVAL;
+			goto done;
+		}
+		paylen += len;
+	}
+	if (!paylen) {
+		/* at least the trigger word is needed to build a packet */
+		_IPATH_DBG("Empty packet for sma send\n");
+		ret = -EINVAL;
+		goto done;
+	}
+	/* payload + qword-padding word + 4 more bytes must fit */
+	if (paylen + 2 * sizeof(uint32_t) > devdata[t].ipath_ibmaxlen) {
+		_IPATH_DBG("Pkt len 0x%llx > ibmaxlen %x\n",
+			   (unsigned long long)paylen,
+			   devdata[t].ipath_ibmaxlen);
+		ret = -EINVAL;
+		goto done;	/* before writing pbc */
+	}
+	/* +1 word is for the qword padding */
+	plen = sizeof(uint32_t) + (uint32_t)paylen;
+
+	if (!(tmpbuf = vmalloc(plen))) {
+		_IPATH_INFO("Unable to allocate tmp buffer, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * Gather the payload now, while failing is still free: nothing has
+	 * been written to the chip and no PIO buffer is held, so a fault
+	 * needs no PIO abort and no uninitialized memory can be sent.
+	 */
+	for (clen = i = 0; i < kpkt.sps_cnt; i++, iov++) {
+		if (unlikely(copy_from_user(tmpbuf + clen,
+					    (void __user *) iov->iov_base,
+					    iov->iov_len))) {
+			_IPATH_DBG("Failed to copy in iovec %d for sma send\n",
+				   i);
+			ret = -EFAULT;
+			goto done;
+		}
+		clen += iov->iov_len >> 2;	/* clen counts words */
+	}
+	plen >>= 2;		/* in words */
+
+	piobuf = ipath_getpiobuf(t, &pbufn);
+	if (!piobuf) {
+		ret = -EBUSY;
+		devdata[t].ipath_nosma_bufs++;
+		_IPATH_SMADBG("No PIO buffers available unit %u %u times\n",
+			      t, devdata[t].ipath_nosma_bufs);
+		goto done;
+	}
+	if (devdata[t].ipath_nosma_bufs) {
+		_IPATH_SMADBG(
+		     "Unit %u got SMA send buffer after %u failures, %u seconds\n",
+		     t, devdata[t].ipath_nosma_bufs, devdata[t].ipath_nosma_secs);
+		devdata[t].ipath_nosma_bufs = 0;
+		devdata[t].ipath_nosma_secs = 0;
+	}
+	if ((devdata[t].ipath_lastibcstat & 0x11) != 0x11 &&
+	    (devdata[t].ipath_lastibcstat & 0x21) != 0x21) {
+		/*
+		 * we need to be at least at INIT for SMA packets to go out.
+		 * If we aren't, something has gone wrong, and SMA hasn't
+		 * noticed.  Therefore we'll try to go to INIT here, in hopes
+		 * of fixing up the problem.  First we verify that indeed the
+		 * state is still "bad" (that is, that lastibcstat isn't
+		 * "stale")
+		 */
+		uint64_t val;
+
+		val = ipath_kget_kreg64(t, kr_ibcstatus);
+		if ((val & 0x11) != 0x11 && (val & 0x21) != 0x21) {
+			_IPATH_SMADBG("Invalid Link state 0x%llx unit %u for send, try INIT\n",
+				      val, t);
+			ipath_set_ib_lstate(t, INFINIPATH_IBCC_LINKCMD_INIT);
+			val = ipath_kget_kreg64(t, kr_ibcstatus);
+			if ((val & 0x11) != 0x11 && (val & 0x21) != 0x21)
+				_IPATH_SMADBG("Link state still not OK unit %u (0x%llx) after INIT\n",
+					      t, val);
+			else
+				_IPATH_SMADBG("Link state OK unit %u (0x%llx) after INIT\n",
+					      t, val);
+		}
+		/* and continue, regardless */
+	}
+
+	if (infinipath_debug & __IPATH_PKTDBG)	/* SMA and PKT, both */
+		_IPATH_SMADBG("unit %u 0x%x+1w pio%d, (scnt %d)\n",
+			      t, plen - 1, pbufn, kpkt.sps_cnt);
+
+	/*
+	 * we have to flush after the PBC for correctness on some cpus
+	 * or WC buffer can be written out of order
+	 */
+	writeq(plen, piobuf);
+	mb();
+	/*
+	 * copy all but the trigger word, then flush, so it's written
+	 * to chip before trigger word, then write trigger word, then
+	 * flush again, so packet is sent.  clen >= 1 is guaranteed by
+	 * the empty-payload check above.
+	 */
+	memcpy_toio32(piobuf+2, tmpbuf, clen-1);
+	mb();
+	writel(tmpbuf[clen-1], piobuf+clen+1);
+	mb();
+
+done:
+	vfree(tmpbuf);		/* vfree(NULL) is a no-op */
+	return ret;
+}
+
+/*
+ * Implementation of the ioctl that returns the chip counter values.
+ *
+ * struct infinipath_counters is treated as an array of 64-bit values:
+ * for each slot, counter number <slot index> is read with
+ * ipath_snap_cntr() and copied to the matching slot at ucounters.
+ * Counters are handled one at a time, which is not the fastest way but
+ * is simple and needs no intermediate buffer.  All of them are fetched
+ * on every call; nothing is shadowed here.  (The byte counters need to
+ * be shadowed at a minimum, because otherwise they will wrap in just a
+ * few seconds at full bandwidth.)
+ *
+ * t is the unit, ucounters the user address to copy to.
+ * Returns 0 on success, or -EFAULT at the first copy_to_user() failure;
+ * slots already written stay written.
+ */
+static int ipath_get_counters(ipath_type t,
+			      struct infinipath_counters __user *ucounters)
+{
+	uint64_t __user *ucreg = (uint64_t __user *) ucounters;
+	uint64_t val;
+	uint16_t vcreg = 0;
+
+	while (vcreg < (sizeof(struct infinipath_counters) / sizeof(val))) {
+		ipath_creg creg = vcreg;
+
+		val = ipath_snap_cntr(t, creg);
+		if (copy_to_user(ucreg, &val, sizeof(val))) {
+			_IPATH_DBG("copy_to_user error on counter %d\n", creg);
+			return -EFAULT;
+		}
+		vcreg++;
+		ucreg++;
+	}
+
+	return 0;
+}
+
More information about the general mailing list