[openib-general] [PATCH 3 of 18] ipath - copy and send routines for sending an skb
Bryan O'Sullivan
bos at pathscale.com
Thu Mar 23 20:41:35 PST 2006
These routines handle the access and alignment patterns required by the
hardware, so that skbs, which have looser requirements on alignment and
sizing, can be copied, checksummed, and sent efficiently.
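For reference, the calling contract looks roughly like this (a minimal
sketch, assuming only the copy_data_s fields this patch itself uses; the
skb and hdr variables are placeholders, and requeue logic is elided):

	struct copy_data_s cdata;

	memset(&cdata, 0, sizeof(cdata));
	cdata.skb = skb;	/* sk_buff to transmit */
	cdata.hdr = hdr;	/* ether_header, unchanging parts filled in */
	ret = ipath_layer_send_skb(dd, &cdata);
	if (ret == -EBUSY) {
		/* no PIO buffer was free; retry the send later */
	}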
Signed-off-by: Bryan O'Sullivan <bos at pathscale.com>
diff -r 4b2debbcae33 -r 5685fc1cd481 drivers/infiniband/hw/ipath/ipath_copy.c
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/drivers/infiniband/hw/ipath/ipath_copy.c Thu Mar 23 20:27:44 2006 -0800
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file provides support for doing sk_buff buffer swapping between
+ * the low-level driver's eager buffers and the network layer. It's part
+ * of the core driver, rather than the ether driver, because it relies
+ * on variables and functions in the core driver. It exports a single
+ * entry point for use in the ipath_ether module.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+
+#include "ipath_kernel.h"
+#include "ips_common.h"
+
+/**
+ * layer_send_getpiobuf - allocate, set up and copy out a PIO send buffer
+ * @dd: the infinipath device
+ * @cdp: the data to copy
+ *
+ * Allocate a PIO send buffer, initialize the header and copy it out.
+ */
+static int layer_send_getpiobuf(struct ipath_devdata *dd,
+ struct copy_data_s *cdp)
+{
+ u32 extra_bytes;
+ u32 len, nwords, hdrwords;
+ u32 __iomem *piobuf;
+ int ret;
+
+ piobuf = ipath_getpiobuf(dd, NULL);
+ if (!piobuf) {
+ cdp->error = -EBUSY;
+ ret = cdp->error;
+ goto bail;
+ }
+
+ /*
+ * Compute the max amount of data that can fit into a PIO buffer.
+ * buffer size - header size - trigger qword length & flags - CRC
+ */
+ len = dd->ipath_ibmaxlen -
+ sizeof(struct ether_header) - 8 - (SIZE_OF_CRC << 2);
+ if (len > dd->ipath_rcvegrbufsize)
+ len = dd->ipath_rcvegrbufsize;
+ if (len > (cdp->len + cdp->extra))
+ len = (cdp->len + cdp->extra);
+ /* Compute word alignment (i.e., (len & 3) ? 4 - (len & 3) : 0) */
+ extra_bytes = (4 - len) & 3;
+ nwords = (sizeof(struct ether_header) + len + extra_bytes) >> 2;
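+ /*
+ * The LRH packet length is in dwords and includes the ICRC; the
+ * first BTH word carries the encapsulation opcode, the pad count,
+ * and the partition key.
+ */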
+ cdp->hdr->lrh[2] = htons(nwords + SIZE_OF_CRC);
+ cdp->hdr->bth[0] = htonl((OPCODE_ITH4X << 24) +
+ (extra_bytes << 20) +
+ IPS_DEFAULT_P_KEY);
+ cdp->hdr->sub_opcode = OPCODE_ENCAP;
+
+ cdp->hdr->bth[2] = 0;
+ /*
+ * Generate an interrupt on the receive side for the last
+ * fragment.
+ */
+ cdp->hdr->iph.pkt_flags = ((cdp->len + cdp->extra) == len)
+ ? __cpu_to_le16(INFINIPATH_KPF_INTR) : 0;
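+ /*
+ * The InfiniPath header checksum is a simple arithmetic
+ * combination of the packet length and the other header fields.
+ */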
+ cdp->hdr->iph.chksum = __cpu_to_le16(
+ (u16) IPS_LRH_BTH + (u16) (nwords + SIZE_OF_CRC) -
+ (u16) ((__le32_to_cpu(cdp->hdr->iph.ver_port_tid_offset)
+ >> 16) & 0xFFFF) -
+ (u16) (__le32_to_cpu(cdp->hdr->iph.ver_port_tid_offset)
+ & 0xFFFF) -
+ (u16) __le16_to_cpu(cdp->hdr->iph.pkt_flags));
+
+ ipath_cdbg(VERBOSE, "send %d (%x %x %x %x %x %x %x)\n", nwords,
+ cdp->hdr->lrh[0], cdp->hdr->lrh[1],
+ cdp->hdr->lrh[2], cdp->hdr->lrh[3],
+ cdp->hdr->bth[0], cdp->hdr->bth[1], cdp->hdr->bth[2]);
+ /*
+ * Write len to control qword, no flags.
+ * +1 is for the qword padding of pbc.
+ */
+ writeq(nwords + 1ULL, (u64 __iomem *) piobuf);
+ /*
+ * We have to flush after the PBC for correctness on some cpus,
+ * or the WC buffer can be written out of order.
+ */
+ ipath_flush_wc();
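+ /* step past the 2-dword PBC we just wrote */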
+ piobuf += 2;
+ hdrwords = sizeof(struct ether_header) >> 2;
+ __iowrite32_copy(piobuf, cdp->hdr, hdrwords);
+ cdp->csum_pio = &((struct ether_header __iomem *)piobuf)->csum;
+ cdp->to = piobuf + hdrwords;
+ cdp->flen = nwords - hdrwords;
+ cdp->hdr->frag_num++;
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * copy_extra_dword - copy the last full dword
+ * @dd: the infinipath device
+ * @cdp: the data to copy
+ * @dosum: write a checksum if true
+ *
+ * Copy the last full dword when it is the "extra" word, preceding it
+ * with a memory fence so that all prior data is written to the PIO
+ * buffer before the trigger word, enforcing the correct ordering of
+ * the WC buffer contents on the bus.
+ */
+static unsigned copy_extra_dword(struct ipath_devdata *dd,
+ struct copy_data_s *cdp,
+ unsigned dosum)
+{
+ unsigned ret;
+
+ if (!cdp->flen && layer_send_getpiobuf(dd, cdp) < 0) {
+ ret = 1;
+ goto bail;
+ }
+ /* write the checksum before the last PIO write, if requested. */
+ if (dosum && cdp->flen == 1)
+ __raw_writel(csum_fold(cdp->csum), cdp->csum_pio);
+ cdp->extra = 0;
+ cdp->flen -= 1;
+ if (!cdp->flen) { /* trigger word being written */
+ ipath_flush_wc();
+ __raw_writel(cdp->u.w, cdp->to++);
+ ipath_flush_wc();
+ } else /* still more to copy to pio buf */
+ __raw_writel(cdp->u.w, cdp->to++);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * copy_a_buffer - copy a PIO buffer's worth to the PIO buffer
+ * @dd: the infinipath device
+ * @cdp: the destination
+ * @p: the data to copy
+ * @n: the amount to copy
+ * @dosum: write a checksum if true
+ *
+ * Copy a PIO buffer's worth (or the skb fragment, at least) to the PIO
+ * buffer, adding a memory fence before the last word. We need the fence
+ * as part of forcing the WC ordering on some cpus, for the cases where
+ * it will be the trigger word. The final fence after the trigger word
+ * is done either at the next chunk, or on final return from the caller.
+ * Takes a max byte count; returns the byte count actually done, always
+ * rounded to a dword multiple.
+ */
+static u32 copy_a_buffer(struct ipath_devdata *dd, struct copy_data_s *cdp,
+ void *p, u32 n, unsigned dosum)
+{
+ u32 *p32;
+ u32 ret;
+
+ if (!cdp->flen && layer_send_getpiobuf(dd, cdp) < 0) {
+ ret = -1;
+ goto bail;
+ }
+ if (n > cdp->flen)
+ n = cdp->flen;
+ if (dosum && cdp->flen == n)
+ __raw_writel(csum_fold(cdp->csum), cdp->csum_pio);
+ p32 = p;
+ cdp->flen -= n;
+ if (!cdp->flen) { /* trigger word being written */
+ __iowrite32_copy(cdp->to, p32, n - 1);
+ cdp->to += n - 1;
+ ipath_flush_wc();
+ __raw_writel(p32[n - 1], cdp->to++);
+ ipath_flush_wc();
+ } else { /* still more to copy to pio buf */
+ __iowrite32_copy(cdp->to, p32, n);
+ cdp->to += n;
+ }
+ n <<= 2;
+ cdp->offset += n;
+ cdp->len -= n;
+ ret = n;
+
+bail:
+ return ret;
+}
+
+static int copy_bits_internal(struct ipath_devdata *dd,
+ struct copy_data_s *cdp, u8 *p,
+ unsigned *copyp, unsigned *offsetp,
+ unsigned *lenp, int do_csum)
+{
+ unsigned copy = *copyp;
+ unsigned len = *lenp;
+ int ret = 1;
+
+ if (copy > len)
+ copy = len;
+ *offsetp += copy;
+ len -= copy;
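+ /*
+ * Fold this chunk into the running checksum exactly once;
+ * checksum_calc guards against adding the same bytes twice
+ * if the copy of this chunk is resumed, e.g. after -EBUSY.
+ */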
+ if (do_csum && !cdp->checksum_calc) {
+ unsigned int csum2;
+
+ cdp->checksum_calc = 1;
+
+ csum2 = csum_partial(p, copy, 0);
+ cdp->csum = csum_block_add(cdp->csum, csum2, cdp->pos);
+ cdp->pos += copy;
+ }
+ /*
+ * If the alignment buffer is not empty, fill it and write it
+ * out.
+ */
+ if (cdp->extra) {
+ if (cdp->extra == 4) {
+ if (copy_extra_dword(dd, cdp, 1))
+ goto done;
+ } else while (copy != 0) {
+ cdp->u.buf[cdp->extra] = *p++;
+ copy--;
+ cdp->offset++;
+ cdp->len--;
+ if (++cdp->extra == 4) {
+ if (copy_extra_dword(dd, cdp, 1))
+ goto done;
+ break;
+ }
+ }
+ }
+
+ while (copy >= 4) {
+ u32 n = copy_a_buffer(dd, cdp, p, copy >> 2, 1);
+ if (n == -1)
+ goto done;
+ p += n;
+ copy -= n;
+ }
+ /*
+ * Either cdp->extra is zero or copy is zero, which means that
+ * the loop here can't cause the alignment buffer to fill up.
+ */
+ while (copy != 0) {
+ cdp->u.buf[cdp->extra++] = *p++;
+ copy--;
+ cdp->offset++;
+ cdp->len--;
+ }
+
+ if (do_csum)
+ cdp->checksum_calc = 0;
+
+ if (len == 0)
+ goto done;
+
+ ret = 0;
+done:
+ *copyp = copy;
+ *lenp = len;
+ return ret;
+}
+
+/**
+ * copy_and_maybe_csum_bits - copy data into the PIO buffer
+ * @dd: the infinipath device
+ * @skb: the source sk_buff
+ * @offset: the offset within the source
+ * @len: the length of the data to copy
+ * @cdp: the destination
+ * @do_csum: write a checksum if true
+ *
+ * Copy data out of one or a chain of sk_buffs, into the PIO buffer,
+ * generating the checksum as we go if do_csum is non-zero.
+ * Fragment an sk_buff into multiple IB packets if the amount of data
+ * is more than a single eager send.
+ * Offset and len are in bytes.
+ * Note that this function is recursive!
+ */
+static void copy_and_maybe_csum_bits(struct ipath_devdata *dd,
+ const struct sk_buff *skb,
+ unsigned int offset,
+ unsigned int len,
+ struct copy_data_s *cdp,
+ int do_csum)
+{
+ unsigned int start = skb_headlen(skb);
+ unsigned int i, copy;
+
+ /* Copy the linear (header) part first, then frags, then any frag_list. */
+ copy = start - offset;
+ if ((int) copy > 0) {
+ u8 *p = skb->data + offset;
+
+ if (copy_bits_internal(dd, cdp, p, &copy, &offset, &len,
+ do_csum))
+ goto done;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ unsigned int end;
+
+ end = start + frag->size;
+ copy = end - offset;
+ if ((int) copy > 0) {
+ u8 *vaddr = kmap_skb_frag(frag);
+ u8 *p = vaddr + frag->page_offset + offset - start;
+ int ret;
+
+ ret = copy_bits_internal(dd, cdp, p, &copy,
+ &offset, &len, do_csum);
+
+ kunmap_skb_frag(vaddr);
+
+ if (ret)
+ goto done;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ unsigned int end;
+
+ end = start + list->len;
+ copy = end - offset;
+ if ((int) copy > 0) {
+ if (copy > len)
+ copy = len;
+ copy_and_maybe_csum_bits(dd, list,
+ offset - start,
+ copy, cdp,
+ do_csum);
+ len -= copy;
+ if (cdp->error || len == 0)
+ goto done;
+ offset += copy;
+ }
+ start = end;
+ }
+ }
+ if (len)
+ cdp->error = -EFAULT;
+done:
+ /*
+ * We have to flush after the trigger word for correctness on some
+ * cpus, or the WC buffer can be written out of order; this is
+ * needed even if there was an error.
+ */
+ ipath_flush_wc();
+}
+
+static inline void copy_and_csum_bits(struct ipath_devdata *dd,
+ const struct sk_buff *skb,
+ unsigned int offset,
+ unsigned int len,
+ struct copy_data_s *cdp)
+{
+ copy_and_maybe_csum_bits(dd, skb, offset, len, cdp, 1);
+}
+
+static inline void copy_bits(struct ipath_devdata *dd,
+ const struct sk_buff *skb,
+ unsigned int offset,
+ unsigned int len,
+ struct copy_data_s *cdp)
+{
+ copy_and_maybe_csum_bits(dd, skb, offset, len, cdp, 0);
+}
+
+/**
+ * ipath_layer_send_skb - layered sk_buff send
+ * @dd: the infinipath device
+ * @cdata: the data to send
+ *
+ * This is called by the ipath_ether module.
+ *
+ * Note that the header should have the unchanging parts
+ * initialized but the rest of the header is computed as needed in
+ * order to break up skb data buffers larger than the hardware MTU.
+ * In other words, the Linux network stack MTU can be larger than the
+ * hardware MTU.
+ */
+int ipath_layer_send_skb(struct ipath_devdata *dd,
+ struct copy_data_s *cdata)
+{
+ int ret = 0;
+ u16 vlsllnh;
+
+ if (!(dd->ipath_flags & IPATH_RCVHDRSZ_SET)) {
+ dev_info(&dd->pcidev->dev, "send while not open\n");
+ ret = -EINVAL;
+ } else if ((dd->ipath_flags & (IPATH_LINKUNK | IPATH_LINKDOWN)) ||
+ dd->ipath_lid == 0) {
+ /* lid check is for when the SMA hasn't yet configured it */
+ ret = -ENETDOWN;
+ ipath_cdbg(VERBOSE, "send while not ready, mylid=%u, "
+ "flags=0x%x\n", dd->ipath_lid, dd->ipath_flags);
+ }
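+ /* the first LRH word (VL/LVer/SL/LNH) must say that BTH follows */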
+ vlsllnh = *((u16 *) cdata->hdr);
+ if (vlsllnh != htons(IPS_LRH_BTH)) {
+ ipath_dbg("Warning: lrh[0] wrong (%x, not %x); "
+ "not sending\n", vlsllnh, htons(IPS_LRH_BTH));
+ ret = -EINVAL;
+ }
+ if (ret)
+ goto done;
+
+ cdata->error = 0; /* clear last call's error */
+
+ if (cdata->skb->ip_summed == CHECKSUM_HW) {
+ unsigned int csstart = cdata->skb->h.raw - cdata->skb->data;
+
+ /*
+ * Computing the checksum is a bit tricky since if we
+ * fragment the packet, the fragment that should contain the
+ * checksum will have already been sent. The solution is to
+ * store the checksum in the header of the last fragment
+ * just before we write the last data word which triggers
+ * the last fragment to be sent. The receiver will check
+ * the header "tag" field, see that there is a checksum, and
+ * store the checksum back into the packet.
+ *
+ * Save the offset of the two byte checksum.
+ *
+ * Note that we have to add 2 to account for the two bytes
+ * of the ethernet address we stripped from the packet and
+ * put in the header.
+ */
+
+ cdata->hdr->csum_offset = csstart + cdata->skb->csum + 2;
+
+ if (cdata->offset < csstart)
+ copy_bits(dd, cdata->skb, cdata->offset,
+ csstart - cdata->offset, cdata);
+
+ if (cdata->error) {
+ ret = cdata->error;
+ goto done;
+ }
+
+ if (cdata->offset < cdata->skb->len)
+ copy_and_csum_bits(dd, cdata->skb, cdata->offset,
+ cdata->skb->len - cdata->offset,
+ cdata);
+ } else
+ copy_bits(dd, cdata->skb, cdata->offset,
+ cdata->skb->len - cdata->offset, cdata);
+
+ if (cdata->error) {
+ ret = cdata->error;
+ goto done;
+ }
+
+ if (cdata->extra) {
+ /*
+ * Do not increment extra while zeroing, or we
+ * might not calculate the pad correctly if all
+ * the data to be sent is in the pad.
+ */
+ int ei = cdata->extra;
+ while (ei < 4)
+ cdata->u.buf[ei++] = 0;
+ (void)copy_extra_dword(dd, cdata, 1);
+
+ if (cdata->error) {
+ ret = cdata->error;
+ if (cdata->error != -EBUSY)
+ ipath_dev_err(dd, "layer_send copy_bits "
+ "failed with error %d\n",
+ -ret);
+ }
+ }
+
+ /* another ether packet sent */
+ ipath_stats.sps_ether_spkts++;
+
+done:
+ /*
+ * we have to flush after trigger word for correctness on
+ * some cpus or WC buffer can be written out of order; needed
+ * even if there was an error
+ */
+ ipath_flush_wc();
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_send_skb);