[Openib-windows] First draft of virtualization patch for IPOIB

Tzachi Dar tzachid at mellanox.co.il
Tue Mar 7 09:00:01 PST 2006


Hi fab,
 
Attached is the first patch of the virtualization code for IPOIB. 
 
We assume that the number of virtual servers will be small, so they
are currently stored in an array. We can move to something more
efficient if you feel that it is needed: for example, a sorted array,
which would make lookups faster, or some hash mechanism.
 
We also need to decide under what conditions an entry is removed from
the table (a timeout?). Currently entries are never removed, but as the
number of virtual machines is small this is not a real problem. We might
change the table to have a fixed size to solve any potential security
problems.
 
There is also an assert in the DHCP code (see below). I am not sure what
it really means, but ignoring it seems fine.
 
The last thing to solve is the locking problem. The table is read many
times but written only rarely. I thought of using the same mechanism
that you have for the endpoint manager. If that works, then we only
have to add locking before inserting into the table.
 
Please send me your feedback.
 
Thanks
Tzachi
 
Index: Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_debug.h
===================================================================
--- Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_debug.h (revision 226)
+++ Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_debug.h (working copy)
@@ -60,7 +60,9 @@
 #define IPOIB_DBG_ALLOC (1 << 8)
 #define IPOIB_DBG_OID (1 << 9)
 #define IPOIB_DBG_IOCTL (1 << 10)
+#define IPOIB_DBG_VM (1 << 11)
 
+
 #define IPOIB_DBG_FUNC (1 << 28) /* For function entry/exit */
 #define IPOIB_DBG_INFO (1 << 29) /* For verbose information */
 #define IPOIB_DBG_WARN (1 << 30) /* For warnings. */
Index: Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_port.c
===================================================================
--- Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_port.c (revision 226)
+++ Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_port.c (working copy)
@@ -237,6 +237,7 @@
 
 static ib_api_status_t
 __recv_gen(
+ IN  const   ipoib_port_t* const   p_port,
  IN  const ipoib_pkt_t* const   p_ipoib,
   OUT   eth_pkt_t* const   p_eth,
  IN    ipoib_endpt_t* const  p_src,
@@ -461,7 +462,34 @@
  return cl_memcmp( p_key1, p_key2, sizeof(ib_gid_t) );
 }
 
+/**********************************************************************
********
+*
+* Virtual server ip to mac translations
+*
+***********************************************************************
*******/
+static void
+__init_vs_ip_to_mac_translation(
+ IN    VS_ip_mac_manager *   p_manager);
 
+static void
+__shutdown_vs_ip_to_mac_translation(
+ IN    VS_ip_mac_manager *   p_manager);
+
+static ib_api_status_t
+__get_mac_from_ip(
+ IN const  VS_ip_mac_manager * const p_manager,
+ IN    net32_t      dst_ip,
+ OUT    mac_addr_t *    p_dst_mac);
+
+static ib_api_status_t
+__put_mac_ip_pair(
+ IN    VS_ip_mac_manager *   p_manager,
+ IN    net32_t      dst_ip,
+ IN    mac_addr_t     dst_mac);
+
+
+
+
 
/***********************************************************************
*******
 *
 * Implementation
@@ -548,6 +576,8 @@
 
  __endpt_mgr_construct( p_port );
 
+ __init_vs_ip_to_mac_translation(  &p_port->vs_manager );
+
  IPOIB_EXIT( IPOIB_DBG_INIT );
 }
 
@@ -702,6 +732,8 @@
 
  cl_obj_deinit( p_obj );
 
+ __shutdown_vs_ip_to_mac_translation( &p_port->vs_manager );
+
  cl_free( p_port );
 
  IPOIB_EXIT( IPOIB_DBG_INIT );
@@ -1780,7 +1812,7 @@
    continue;
   }
 
-  len = p_wc->length - sizeof(ib_grh_t);
+  len = p_wc->length - sizeof(ib_grh_t); //????? Can there be a buffer
overrun here ????
 
   if( len < sizeof(ipoib_hdr_t) )
   {
@@ -1838,7 +1870,7 @@
    {
     /* Unfiltered.  Setup the ethernet header and report. */
     cl_perf_start( RecvTcp );
-    status = __recv_gen( p_ipoib, p_eth, p_src, p_dst );
+    status = __recv_gen( p_port, p_ipoib, p_eth, p_src, p_dst );
     cl_perf_stop( &p_port->p_adapter->perf, RecvTcp );
     break;
    }
@@ -1876,7 +1908,7 @@
    {
     /* Unfiltered.  Setup the ethernet header and report. */
     cl_perf_start( RecvUdp );
-    status = __recv_gen( p_ipoib, p_eth, p_src, p_dst );
+    status = __recv_gen( p_port, p_ipoib, p_eth, p_src, p_dst );
     cl_perf_stop( &p_port->p_adapter->perf, RecvUdp );
    }
    break;
@@ -1898,7 +1930,7 @@
   default:
    /* Unfiltered.  Setup the ethernet header and report. */
    cl_perf_start( RecvGen );
-   status = __recv_gen( p_ipoib, p_eth, p_src, p_dst );
+   status = __recv_gen( p_port, p_ipoib, p_eth, p_src, p_dst );
    cl_perf_stop( &p_port->p_adapter->perf, RecvGen );
   }
 
@@ -1943,11 +1975,15 @@
 
 static ib_api_status_t
 __recv_gen(
+ IN  const   ipoib_port_t* const   p_port,
  IN  const ipoib_pkt_t* const   p_ipoib,
   OUT   eth_pkt_t* const   p_eth,
  IN    ipoib_endpt_t* const  p_src,
  IN    ipoib_endpt_t* const  p_dst )
 {
+ net16_t  OriginalType;
+ ib_api_status_t  status;
+ mac_addr_t dst_mac;
  IPOIB_ENTER( IPOIB_DBG_RECV );
 
  if( !p_src || !p_dst )
@@ -1957,6 +1993,8 @@
   return IB_NOT_DONE;
  }
 
+ OriginalType = p_ipoib->hdr.type;
+
  /*
   * Fill in the ethernet header.  Note that doing so will overwrite
   * the IPoIB header, so start by moving the information from the IPoIB
@@ -1966,6 +2004,14 @@
  p_eth->hdr.src = p_src->mac;
  p_eth->hdr.dst = p_dst->mac;
 
+ if (OriginalType == ETH_PROT_TYPE_IP) 
+ {
+  status = __get_mac_from_ip(&p_port->vs_manager,
p_ipoib->type.ip.hdr.dst_ip, &dst_mac);
+  if ( status == IB_SUCCESS) 
+  {
+   p_eth->hdr.dst = dst_mac;
+  }
+ }
  IPOIB_EXIT( IPOIB_DBG_RECV );
  return IB_SUCCESS;
 }
@@ -1991,7 +2037,7 @@
  UNUSED_PARAM( p_port );
 
  /* Create the ethernet header. */
- status = __recv_gen( p_ipoib, p_eth, p_src, p_dst );
+ status = __recv_gen( p_port, p_ipoib, p_eth, p_src, p_dst );
  if( status != IB_SUCCESS )
  {
   IPOIB_TRACE_EXIT( IPOIB_DBG_ERROR,
@@ -2128,6 +2174,7 @@
  const ipoib_arp_pkt_t *p_ib_arp;
  ib_gid_t    gid;
  mac_addr_t    mac;
+ mac_addr_t dst_mac;
  ipoib_hw_addr_t   null_hw = {0};
 
  IPOIB_ENTER( IPOIB_DBG_RECV );
@@ -2279,7 +2326,7 @@
   * Create the ethernet header.  Note that this is done last so that
   * we have a chance to create a new endpoint.
   */
- status = __recv_gen( p_ipoib, p_eth, *pp_src, p_dst );
+ status = __recv_gen( p_port, p_ipoib, p_eth, *pp_src, p_dst );
  if( status != IB_SUCCESS )
  {
   IPOIB_TRACE_EXIT( IPOIB_DBG_ERROR,
@@ -2288,6 +2335,20 @@
   return status;
  }
 
+    if (p_eth->hdr.type == ETH_PROT_TYPE_ARP) {
+  if ((p_eth->type.arp.op == ARP_OP_REP)) 
+  {
+   status = __get_mac_from_ip(&p_port->vs_manager,
p_eth->type.arp.dst_ip, &dst_mac);
+   if ( status == IB_SUCCESS) 
+   {
+    p_eth->hdr.dst = dst_mac;
+    p_eth->type.arp.dst_hw = dst_mac;
+   }
+  } 
+
+
+    }
+
  IPOIB_EXIT( IPOIB_DBG_RECV );
  return IB_SUCCESS;
 }
@@ -3133,7 +3194,7 @@
    p_cid[1] = 21;
   }
 
-  CL_ASSERT( p_cid[1] == 21 );
+//??????  CL_ASSERT( p_cid[1] == 21 ); // This asserts seems to bounce,
nothing happens if ignored ???
   p_cid[23]= DHCP_OPT_END;
   ib_gid_set_default( &gid, p_port->p_adapter->guids.port_guid );
   cl_memcpy( &p_cid[7], &gid, sizeof(ib_gid_t) );
@@ -3219,6 +3280,11 @@
   return NDIS_STATUS_INVALID_DATA;
  }
 
+ if ((p_arp->op == ARP_OP_REQ)) 
+ {
+  __put_mac_ip_pair(&p_port->vs_manager,p_arp->src_ip, p_arp->src_hw );
+ }
+
  /* Allocate our scratch buffer. */
  p_desc->p_buf = (send_buf_t*)
   ExAllocateFromNPagedLookasideList( &p_port->buf_mgr.send_buf_list );
@@ -5211,3 +5277,124 @@
 
  IPOIB_EXIT( IPOIB_DBG_MCAST );
 }
+
+static void
+__init_vs_ip_to_mac_translation(
+ IN    VS_ip_mac_manager *   p_manager)
+{
+ p_manager->p_pairs = NULL;
+ p_manager->data_size = 0;
+ p_manager->array_size = 0;
+}
+
+static void
+__shutdown_vs_ip_to_mac_translation(
+ IN    VS_ip_mac_manager *   p_manager)
+{
+ if ( p_manager->p_pairs != NULL ) 
+ {
+  cl_free( p_manager->p_pairs );
+ }
+}
+
+static ib_api_status_t
+__get_mac_from_ip(
+ IN const  VS_ip_mac_manager * const p_manager,
+ IN    net32_t      dst_ip,
+ OUT    mac_addr_t *    p_dst_mac)
+{
+ uint32_t i;
+ for (i = 0 ; i < p_manager->data_size; i++ ) 
+ {
+  if (p_manager->p_pairs[i].dst_ip == dst_ip) 
+  {
+   // We have found the IP that we are looking for
+   *p_dst_mac = p_manager->p_pairs[i].mac;
+   IPOIB_TRACE( IPOIB_DBG_VM,("__get_mac_from_ip dst_ip = %d.%d.%d.%d
found in table\n", 
+    ((dst_ip & 0xff      )       ),
+    ((dst_ip & 0xff00    ) >> 8  ),
+    ((dst_ip & 0xff0000  ) >> 16 ),
+    ((dst_ip & 0xff000000) >> 24 )));
+   
+   return IB_SUCCESS;
+  }
+   
+ }
+ // Not found
+ IPOIB_TRACE( IPOIB_DBG_VM,("__get_mac_from_ip dst_ip = %d.%d.%d.%d not
found \n", 
+   ((dst_ip & 0xff      )       ),
+   ((dst_ip & 0xff00    ) >> 8  ),
+   ((dst_ip & 0xff0000  ) >> 16 ),
+   ((dst_ip & 0xff000000) >> 24 )));
+
+ 
+ return IB_NOT_FOUND;
+}
+
+static ib_api_status_t
+__put_mac_ip_pair(
+ IN    VS_ip_mac_manager *   p_manager,
+ IN    net32_t      dst_ip,
+ IN    mac_addr_t     dst_mac)
+{
+ uint32_t i;
+ uint32_t new_size = 0;
+ VS_ip_mac_pair *new_array;
+ IPOIB_ENTER( IPOIB_DBG_VM );
+
+ IPOIB_TRACE( IPOIB_DBG_VM,("__put_mac_ip_pair dst_ip = %d.%d.%d.%d
\n", 
+    ((dst_ip & 0xff      )       ),
+    ((dst_ip & 0xff00    ) >> 8  ),
+    ((dst_ip & 0xff0000  ) >> 16 ),
+    ((dst_ip & 0xff000000) >> 24 )));
+
+ // First step is to look if this is actually an update and not adding
+ for (i = 0 ; i < p_manager->data_size; i++ ) 
+ {
+  if (p_manager->p_pairs[i].dst_ip == dst_ip) 
+  {
+   // We have found the IP that we are looking for, update it
+   p_manager->p_pairs[i].mac = dst_mac;
+   return IB_SUCCESS;
+  }
+   
+ }
+ // Not found, let see if we need to increase the table
+ if ( p_manager->array_size <= p_manager->data_size )
+ {
+  // Need to increase the array
+  if (p_manager->array_size < 4) 
+  {
+   new_size = 4;
+  } else {
+   new_size = p_manager->array_size * 2;
+  }
+  new_array = cl_zalloc(new_size * sizeof (VS_ip_mac_pair));
+  if ( new_array == NULL )
+  {
+   IPOIB_TRACE_EXIT( IPOIB_DBG_ERROR,
+    ("Failed to allocate new_array.\n") );
+   return CL_INSUFFICIENT_MEMORY;
+  }
+  // copy the data to the new array
+  if ( p_manager->array_size > 0 ) 
+  {
+   cl_memcpy ( new_array, p_manager->p_pairs, p_manager->data_size *
sizeof (VS_ip_mac_pair));
+  }
+  if ( p_manager->p_pairs != NULL ) 
+  {
+   cl_free( p_manager->p_pairs );
+  }
+  p_manager->p_pairs = new_array;
+  new_array = NULL;
+  p_manager->array_size = new_size;  
+ }
+
+ p_manager->p_pairs[p_manager->data_size].dst_ip = dst_ip;
+ p_manager->p_pairs[p_manager->data_size].mac = dst_mac;
+ p_manager->data_size++;
+
+ return IB_SUCCESS;
+
+}
+
Index: Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_port.h
===================================================================
--- Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_port.h (revision 226)
+++ Q:/OpenIb/gen1/trunk/ulp/ipoib/kernel/ipoib_port.h (working copy)
@@ -468,7 +468,20 @@
 *  are inserted in the LID map.
 *********/
 
+typedef struct _VS_ip_mac_pair
+{
+ mac_addr_t    mac;
+ net32_t     dst_ip;
+} VS_ip_mac_pair;
 
+typedef struct _VS_ip_mac_manager
+{
+ VS_ip_mac_pair   *p_pairs;
+ uint32_t    array_size;
+ uint32_t    data_size;
+} VS_ip_mac_manager;
+
+
 typedef struct _ipoib_port
 {
  cl_obj_t    obj;
@@ -496,8 +509,12 @@
  atomic32_t    endpt_rdr;
 
  atomic32_t    hdr_idx;
- ipoib_hdr_t    hdr[1];
 
+ VS_ip_mac_manager  vs_manager;
+
+ // Must be last
+ ipoib_hdr_t    hdr[1]; 
+
 } ipoib_port_t;
 /*
 * FIELDS
@@ -536,7 +553,6 @@
 *  Endpoint manager.
 *********/
 
-
 ib_api_status_t
 ipoib_create_port(
  IN    struct _ipoib_adapter* const p_adapter,

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20060307/700dbe66/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: vm.patch
Type: application/octet-stream
Size: 10895 bytes
Desc: vm.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20060307/700dbe66/attachment.obj>


More information about the ofw mailing list