[ofa-general] Re: [PATCH 4/4] [RFC] IPoIB/cm: Add connected mode support for devices without SRQs

Wed Nov 7 13:30:47 PST 2007

Sorry. Resending comments as requested. This is based on the for-2.6.25 git tree that I pulled
from, and so is not in patch format.

static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
                                   struct ipoib_cm_rx *rx)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int ret;
        int i;

        rx->rx_ring = kcalloc(ipoib_recvq_size, sizeof *rx->rx_ring, GFP_KERNEL);
        if (!rx->rx_ring)
                return -ENOMEM;

        spin_lock_irq(&priv->lock);

        if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
                spin_unlock_irq(&priv->lock);
                ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
                return -EINVAL;
        } else
                ++priv->cm.nonsrq_conn_qp;

        spin_unlock_irq(&priv->lock);

        for (i = 0; i < ipoib_recvq_size; ++i) {
                if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
                                           rx->rx_ring[i].mapping)) {
                        ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
                                ret = -ENOMEM;

This will cause an skb leak- see generic comments about ipoib_cm_alloc_rx_skb() below

                                goto err;
                        }
                ret = ipoib_cm_post_receive_nonsrq(dev, rx, i);
                if (ret) {
                        ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
                                   "failed for buf %d\n", i);
                        ret = -EIO;

This will cause an skb leak- see generic comments about ipoib_cm_alloc_rx_skb() below

                        goto err;
                }
        }

        rx->recv_count = ipoib_recvq_size;

        return 0;

err:
        spin_lock_irq(&priv->lock);
        --priv->cm.nonsrq_conn_qp;
        spin_unlock_irq(&priv->lock);

kfree(rx_ring) is missing

        return ret;
}

Generic comment about ipoib_cm_alloc_rx_skb() which is true for both the srq and non srq cases 
(except in the receive wc handler):
I find that there will be skb leakage if ipoib_cm_alloc_rx_skb() fails before all the rx skbs
are allocated. We must undo those allocations and mappings. Probably we should call 
ipoib_cm_dev_cleanup() and free the skbs and do the unmap in that routine.

void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ipoib_cm_rx_buf *rx_ring;
        unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
        struct sk_buff *skb, *newskb;
        struct ipoib_cm_rx *p;
        unsigned long flags;
        u64 mapping[IPOIB_CM_RX_SG];
        int frags;

        ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
                       wr_id, wc->status);

        if (unlikely(wr_id >= ipoib_recvq_size)) {
                printk(KERN_WARNING "Inside rx wc with wr_id=0x%x\n", wr_id);
                if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
                        spin_lock_irqsave(&priv->lock, flags);
                        list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
                        ipoib_cm_start_rx_drain(priv);

I do not understand why we need to call ipoib_cm_start_rx_drain(). We have already received a work 
completion with RX_DRAIN set.

                        queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
                        spin_unlock_irqrestore(&priv->lock, flags);
                } else
                        ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
                                   wr_id, ipoib_recvq_size);
                return;
        }

        p = wc->qp->qp_context;

        if (ipoib_cm_has_srq(dev))
                rx_ring = priv->cm.srq_ring;
        else
                rx_ring = p->rx_ring;

        skb = rx_ring[wr_id].skb;

        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                ipoib_dbg(priv, "cm recv error "
                           "(status=%d, wrid=%d vend_err %x)\n",
                           wc->status, wr_id, wc->vendor_err);
                ++dev->stats.rx_dropped;
                if (!p->rx_ring)

If rx_ring is indeed NULL, there is no point in continuing further,
we will probably crash sooner or later. We should insert a BUG_ON and 
thus crash the system.

                        goto repost;
                else {
                        if (!--p->recv_count) {
                                printk(KERN_WARNING "recv_count=0x%x\n", p->recv_count);
                                spin_lock_irqsave(&priv->lock, flags);
                                list_move(&p->list, &priv->cm.rx_reap_list);
                                spin_unlock_irqrestore(&priv->lock, flags);
                                queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
                        }
                        return;
                }
        }

        if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
                if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
                        spin_lock_irqsave(&priv->lock, flags);
                        p->jiffies = jiffies;
                        /* Move this entry to list head, but do not re-add it
                         * if it has been moved out of list. */
                        if (p->state == IPOIB_CM_RX_LIVE)
                                list_move(&p->list, &priv->cm.passive_ids);
                        spin_unlock_irqrestore(&priv->lock, flags);
                }
        }

        frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
                                              (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;

        newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping);
        if (unlikely(!newskb)) {
                /*
                 * If we can't allocate a new RX buffer, dump
                 * this packet and reuse the old buffer.
                 */
                ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
                ++dev->stats.rx_dropped;
                goto repost;
        }

        ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
        memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);

        ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
                       wc->byte_len, wc->slid);

        skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);

        skb->protocol = ((struct ipoib_header *) skb->data)->proto;
        skb_reset_mac_header(skb);
        skb_pull(skb, IPOIB_ENCAP_LEN);

        dev->last_rx = jiffies;
        ++dev->stats.rx_packets;
        dev->stats.rx_bytes += skb->len;

        skb->dev = dev;
        /* XXX get correct PACKET_ type here */
        skb->pkt_type = PACKET_HOST;
        netif_receive_skb(skb);

repost:
        if (p->rx_ring) {

Shouldn't this be if(!ipoib_cm_has_srq())?

                if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) {
                        --p->recv_count;
                        ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
                                   "for buf %d\n", wr_id);
                }
        } else {
                if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
                        ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
                                   "for buf %d\n", wr_id);
        }
}