[openib-general] [PATCH] uDAPL with uCM and uAT support
Arlin Davis
arlin.r.davis at intel.com
Sun Jul 24 17:48:05 PDT 2005
James,
Here is a third drop that adds IBAT to the uCM support. This also includes
some fixes to common code evd_wait and evd_resize.
Instructions From README:
Third drop of code, includes uCM and uAT support.
NOTE: uAT user library and kernel code in separate branch.
build uAT library from following branch:
cd gen2/branches/shaharf-ibat/src/userspace/libibat/
./autogen.sh && ./configure && make && make install
copy following uat source to latest trunk kernel src:
gen2/branches/shaharf-ibat/src/linux-kernel/infiniband/core
at.c at_priv.h att.c uat.c uat.h Makefile
gen2/branches/shaharf-ibat/src/linux-kernel/infiniband/include
ib_at.h ib_user_at.h
add udev rule:
KERNEL="uat*", NAME="infiniband/%k"
Signed-off-by: Arlin Davis <ardavis at ichips.intel.com>
Index: dapl/udapl/dapl_evd_wait.c
===================================================================
--- dapl/udapl/dapl_evd_wait.c (revision 2899)
+++ dapl/udapl/dapl_evd_wait.c (working copy)
@@ -74,9 +74,10 @@
DAPL_EVD *evd_ptr;
DAT_RETURN dat_status;
DAT_EVENT *local_event;
- DAT_BOOLEAN notify_requested = DAT_FALSE;
+ DAT_BOOLEAN notify_needed = DAT_FALSE;
DAT_BOOLEAN waitable;
DAPL_EVD_STATE evd_state;
+ DAT_COUNT total_events,new_events;
dapl_dbg_log (DAPL_DBG_TYPE_API,
"dapl_evd_wait (%p, %d, %d, %p, %p)\n",
@@ -124,9 +125,9 @@
}
dapl_dbg_log (DAPL_DBG_TYPE_EVD,
- "dapl_evd_wait: EVD %p, CQ %p\n",
- evd_ptr,
- (void *)evd_ptr->ib_cq_handle);
+ "dapl_evd_wait: EVD %p, CQ %p, Timeout %d, Threshold %d\n",
+ evd_ptr,(void *)evd_ptr->ib_cq_handle, time_out, threshold);
+
/*
* Make sure there are no other waiters and the evd is active.
@@ -145,24 +146,12 @@
(DAT_COUNT) DAPL_EVD_STATE_OPEN,
(DAT_COUNT) DAPL_EVD_STATE_WAITED );
dapl_os_unlock ( &evd_ptr->header.lock );
-
- if ( evd_state != DAPL_EVD_STATE_OPEN )
+ if ( evd_state != DAPL_EVD_STATE_OPEN || !waitable)
{
- /* Bogus state, bail out */
dat_status = DAT_ERROR (DAT_INVALID_STATE,0);
goto bail;
}
- if (!waitable)
- {
- /* This EVD is not waitable, reset the state and bail */
- (void) dapl_os_atomic_assign ((DAPL_ATOMIC *)&evd_ptr->evd_state,
- (DAT_COUNT) DAPL_EVD_STATE_WAITED,
- evd_state);
- dat_status = DAT_ERROR (DAT_INVALID_STATE, DAT_INVALID_STATE_EVD_UNWAITABLE);
- goto bail;
- }
-
/*
* We now own the EVD, even though we don't have the lock anymore,
* because we're in the WAITED state.
@@ -182,37 +171,54 @@
* return right away if the ib_cq_handle associate with these evd
* equal to IB_INVALID_HANDLE
*/
- dapls_evd_copy_cq(evd_ptr);
-
- if (dapls_rbuf_count(&evd_ptr->pending_event_queue) >= threshold)
- {
- break;
- }
-
- /*
- * Do not enable the completion notification if this evd is not
- * a DTO_EVD or RMR_BIND_EVD
+ /* Logic to prevent missing completion between copy_cq (poll)
+ * and completion_notify (re-arm)
*/
- if ( (!notify_requested) &&
- ((evd_ptr->evd_flags & DAT_EVD_DTO_FLAG) ||
- (evd_ptr->evd_flags & DAT_EVD_RMR_BIND_FLAG)) )
+ notify_needed = DAT_TRUE;
+ new_events = 0;
+ while (DAT_TRUE)
{
- dat_status = dapls_ib_completion_notify (
- evd_ptr->header.owner_ia->hca_ptr->ib_hca_handle,
- evd_ptr,
- (evd_ptr->completion_type == DAPL_EVD_STATE_SOLICITED_WAIT) ?
- IB_NOTIFY_ON_SOLIC_COMP : IB_NOTIFY_ON_NEXT_COMP );
-
- DAPL_CNTR(DCNT_EVD_WAIT_CMP_NTFY);
- /* FIXME report error */
- dapl_os_assert(dat_status == DAT_SUCCESS);
+ dapls_evd_copy_cq(evd_ptr); /* poll for new completions */
+ total_events = dapls_rbuf_count (&evd_ptr->pending_event_queue);
+ new_events = total_events - new_events;
+ if (total_events >= threshold ||
+ (!new_events && notify_needed == DAT_FALSE))
+ {
+ break;
+ }
+
+ /*
+ * Do not enable the completion notification if this evd is not
+ * a DTO_EVD or RMR_BIND_EVD
+ */
+ if ( (evd_ptr->evd_flags & DAT_EVD_DTO_FLAG) ||
+ (evd_ptr->evd_flags & DAT_EVD_RMR_BIND_FLAG) )
+ {
+ dat_status = dapls_ib_completion_notify (
+ evd_ptr->header.owner_ia->hca_ptr->ib_hca_handle,
+ evd_ptr,
+ (evd_ptr->completion_type == DAPL_EVD_STATE_SOLICITED_WAIT)
?
+ IB_NOTIFY_ON_SOLIC_COMP : IB_NOTIFY_ON_NEXT_COMP );
+
+ DAPL_CNTR(DCNT_EVD_WAIT_CMP_NTFY);
+ notify_needed = DAT_FALSE;
+ new_events = total_events;
+
+ /* FIXME report error */
+ dapl_os_assert(dat_status == DAT_SUCCESS);
+ }
+ else
+ {
+ break;
+ }
- notify_requested = DAT_TRUE;
+ } /* while completions < threshold, and rearm needed */
- /* Try again. */
- continue;
+ if (total_events >= threshold)
+ {
+ break;
}
-
+
/*
* Unused by poster; it has no way to tell how many
@@ -232,8 +238,6 @@
#endif
dat_status = dapl_os_wait_object_wait (
&evd_ptr->wait_object, time_out );
-
- notify_requested = DAT_FALSE; /* We've used it up. */
/* See if we were awakened by evd_set_unwaitable */
if ( !evd_ptr->evd_waitable )
@@ -243,13 +247,22 @@
if (dat_status != DAT_SUCCESS)
{
- /*
- * If the status is DAT_TIMEOUT, we'll break out of the
- * loop, *not* dequeue an event (because dat_status
- * != DAT_SUCCESS), set *nmore (as we should for timeout)
- * and return DAT_TIMEOUT.
- */
- break;
+ /*
+ * If the status is DAT_TIMEOUT, we'll break out of the
+ * loop, *not* dequeue an event (because dat_status
+ * != DAT_SUCCESS), set *nmore (as we should for timeout)
+ * and return DAT_TIMEOUT.
+ */
+
+#if defined(DAPL_DBG)
+ dapls_evd_copy_cq(evd_ptr); /* poll */
+ dapl_dbg_log (DAPL_DBG_TYPE_EVD,
+ "dapl_evd_wait: WAKEUP ERROR (0x%x): EVD %p, CQ %p, events? %d\n",
+ dat_status,evd_ptr,(void *)evd_ptr->ib_cq_handle,
+ dapls_rbuf_count(&evd_ptr->pending_event_queue) );
+#endif /* DAPL_DBG */
+
+ break;
}
}
Index: dapl/udapl/Makefile
===================================================================
--- dapl/udapl/Makefile (revision 2899)
+++ dapl/udapl/Makefile (working copy)
@@ -122,7 +122,8 @@
#
ifeq ($(VERBS),openib)
PROVIDER = $(TOPDIR)/../openib
-CFLAGS += -DOPENIB -DCQ_WAIT_OBJECT
+CFLAGS += -DOPENIB
+#CFLAGS += -DCQ_WAIT_OBJECT
CFLAGS += -I/usr/local/include/infiniband
endif
@@ -232,7 +233,7 @@
endif
ifeq ($(VERBS),openib)
-LDFLAGS += -libverbs /usr/local/lib/infiniband/mthca.so -libcm
+LDFLAGS += -libverbs /usr/local/lib/infiniband/mthca.so -libcm -libat
LDFLAGS += -rpath /usr/local/lib -L /usr/local/lib
LDFLAGS += -rpath /usr/local/lib/infiniband -L /usr/local/lib/infiniband
PROVIDER_SRCS = dapl_ib_util.c dapl_ib_cq.c dapl_ib_qp.c
Index: dapl/common/dapl_evd_resize.c
===================================================================
--- dapl/common/dapl_evd_resize.c (revision 2899)
+++ dapl/common/dapl_evd_resize.c (working copy)
@@ -67,72 +67,130 @@
IN DAT_EVD_HANDLE evd_handle,
IN DAT_COUNT evd_qlen )
{
- DAPL_IA *ia_ptr;
- DAPL_EVD *evd_ptr;
- DAT_COUNT pend_cnt;
- DAT_RETURN dat_status;
+ DAPL_IA *ia_ptr;
+ DAPL_EVD *evd_ptr;
+ DAT_EVENT *event_ptr;
+ DAT_EVENT *events;
+ DAT_EVENT *orig_event;
+ DAPL_RING_BUFFER free_event_queue;
+ DAPL_RING_BUFFER pending_event_queue;
+ DAT_COUNT pend_cnt;
+ DAT_COUNT i;
dapl_dbg_log (DAPL_DBG_TYPE_API, "dapl_evd_resize (%p, %d)\n",
evd_handle, evd_qlen);
if (DAPL_BAD_HANDLE (evd_handle, DAPL_MAGIC_EVD))
{
- dat_status = DAT_ERROR (DAT_INVALID_HANDLE,0);
- goto bail;
+ return DAT_ERROR (DAT_INVALID_PARAMETER,DAT_INVALID_ARG1);
}
evd_ptr = (DAPL_EVD *)evd_handle;
ia_ptr = evd_ptr->header.owner_ia;
- if ( evd_qlen == evd_ptr->qlen )
+ if ((evd_qlen <= 0) || (evd_ptr->qlen > evd_qlen))
{
- dat_status = DAT_SUCCESS;
- goto bail;
+ return DAT_ERROR (DAT_INVALID_PARAMETER,DAT_INVALID_ARG2);
}
if ( evd_qlen > ia_ptr->hca_ptr->ia_attr.max_evd_qlen )
{
- dat_status = DAT_ERROR (DAT_INVALID_PARAMETER,DAT_INVALID_ARG2);
- goto bail;
+ return DAT_ERROR (DAT_INSUFFICIENT_RESOURCES,DAT_RESOURCE_TEVD);
}
dapl_os_lock(&evd_ptr->header.lock);
- /* Don't try to resize if we are actively waiting */
if (evd_ptr->evd_state == DAPL_EVD_STATE_WAITED)
{
dapl_os_unlock(&evd_ptr->header.lock);
- dat_status = DAT_ERROR (DAT_INVALID_STATE,0);
- goto bail;
+ return DAT_ERROR (DAT_INVALID_STATE,0);
}
pend_cnt = dapls_rbuf_count(&evd_ptr->pending_event_queue);
if (pend_cnt > evd_qlen) {
- dapl_os_unlock(&evd_ptr->header.lock);
- dat_status = DAT_ERROR (DAT_INVALID_STATE,0);
- goto bail;
+ dapl_os_unlock(&evd_ptr->header.lock);
+ return DAT_ERROR (DAT_INVALID_STATE,0);
+ }
+
+ if (DAT_SUCCESS != dapls_ib_cq_resize(evd_ptr->header.owner_ia,
+ evd_ptr,
+ &evd_qlen))
+ {
+ dapl_os_unlock(&evd_ptr->header.lock);
+ return DAT_ERROR (DAT_INSUFFICIENT_RESOURCES,DAT_RESOURCE_MEMORY);
+ }
+
+ /* Allocate EVENTs */
+ events = (DAT_EVENT *) dapl_os_alloc (evd_qlen * sizeof (DAT_EVENT));
+ if (!events)
+ {
+ dapl_os_unlock(&evd_ptr->header.lock);
+ return DAT_ERROR (DAT_INSUFFICIENT_RESOURCES,DAT_RESOURCE_MEMORY);
}
+ event_ptr = events;
- dat_status = dapls_ib_cq_resize(evd_ptr->header.owner_ia,
- evd_ptr,
- &evd_qlen);
- if (dat_status != DAT_SUCCESS)
+ /* allocate free event queue */
+ if (DAT_SUCCESS != dapls_rbuf_alloc (&free_event_queue, evd_qlen))
{
+ dapl_os_free(event_ptr, evd_qlen * sizeof (DAT_EVENT));
dapl_os_unlock(&evd_ptr->header.lock);
- goto bail;
+ return DAT_ERROR (DAT_INSUFFICIENT_RESOURCES,DAT_RESOURCE_MEMORY);
}
- dat_status = dapls_evd_event_realloc (evd_ptr, evd_qlen);
- if (dat_status != DAT_SUCCESS)
+ /* allocate pending event queue */
+ if (DAT_SUCCESS != dapls_rbuf_alloc (&pending_event_queue, evd_qlen))
{
+ dapl_os_free(event_ptr, evd_qlen * sizeof (DAT_EVENT));
dapl_os_unlock(&evd_ptr->header.lock);
- goto bail;
+ return DAT_ERROR (DAT_INSUFFICIENT_RESOURCES,DAT_RESOURCE_MEMORY);
}
+ for (i = 0; i < pend_cnt; i++)
+ {
+ orig_event = dapls_rbuf_remove(&evd_ptr->pending_event_queue);
+ if (orig_event == NULL) {
+ dapl_dbg_log (DAPL_DBG_TYPE_ERR, " Inconsistent event queue\n");
+ dapl_os_free(event_ptr, evd_qlen * sizeof (DAT_EVENT));
+ dapl_os_unlock(&evd_ptr->header.lock);
+ return DAT_ERROR (DAT_INSUFFICIENT_RESOURCES,DAT_RESOURCE_MEMORY);
+ }
+ memcpy(event_ptr, orig_event, sizeof(DAT_EVENT));
+ if (DAT_SUCCESS != dapls_rbuf_add(&pending_event_queue, event_ptr)) {
+ dapl_os_free(event_ptr, evd_qlen * sizeof (DAT_EVENT));
+ dapl_os_unlock(&evd_ptr->header.lock);
+ return DAT_ERROR (DAT_INSUFFICIENT_RESOURCES,DAT_RESOURCE_MEMORY);
+ }
+ event_ptr++;
+ }
+
+ for (i = pend_cnt; i < evd_qlen; i++)
+ {
+ if (DAT_SUCCESS != dapls_rbuf_add(&free_event_queue,
+ (void *) event_ptr)) {
+ dapl_os_free(event_ptr, evd_qlen * sizeof (DAT_EVENT));
+ dapl_os_unlock(&evd_ptr->header.lock);
+ return DAT_ERROR (DAT_INSUFFICIENT_RESOURCES,DAT_RESOURCE_MEMORY);
+ }
+ event_ptr++;
+ }
+
+ dapls_rbuf_destroy (&evd_ptr->free_event_queue);
+ dapls_rbuf_destroy (&evd_ptr->pending_event_queue);
+ if (evd_ptr->events)
+ {
+ dapl_os_free (evd_ptr->events, evd_ptr->qlen * sizeof (DAT_EVENT));
+ }
+ evd_ptr->free_event_queue = free_event_queue;
+ evd_ptr->pending_event_queue = pending_event_queue;
+ evd_ptr->events = events;
+ evd_ptr->qlen = evd_qlen;
+
dapl_os_unlock(&evd_ptr->header.lock);
- bail:
- return dat_status;
+ dapl_dbg_log (DAPL_DBG_TYPE_RTN,
+ "dapl_evd_resize returns DAT_SUCCESS\n");
+
+ return DAT_SUCCESS;
}
/*
Index: dapl/openib/TODO
===================================================================
--- dapl/openib/TODO (revision 2899)
+++ dapl/openib/TODO (working copy)
@@ -1,7 +1,7 @@
IB Verbs:
- CQ resize?
-- query call to get current qp state
+- query call to get current qp state, remote port number
- ibv_get_cq_event() needs timed event call and wakeup
- query call to get device attributes
- memory window support
@@ -9,8 +9,6 @@
DAPL:
- reinit EP needs a QP timewait completion notification
- add cq_object wakeup, time based cq_object wait when verbs support arrives
-- update uDAPL code with real ATS support
-- etc, etc.
Other:
- Shared memory in udapl and kernel module to support?
Index: dapl/openib/dapl_ib_util.c
===================================================================
--- dapl/openib/dapl_ib_util.c (revision 2899)
+++ dapl/openib/dapl_ib_util.c (working copy)
@@ -111,27 +111,40 @@
}
-/* just get IP address for hostname */
-int dapli_get_addr( char *addr, int addr_len)
+/* just get IP address, IPv4 only for now */
+int dapli_get_hca_addr( struct dapl_hca *hca_ptr )
{
- struct sockaddr_in *ipv4_addr = (struct sockaddr_in*)addr;
- struct hostent *h_ptr;
- struct utsname ourname;
-
- if ( uname( &ourname ) < 0 )
- return 1;
-
- h_ptr = gethostbyname( ourname.nodename );
- if ( h_ptr == NULL )
+ struct sockaddr_in *ipv4_addr;
+ struct ib_at_completion at_comp;
+ struct dapl_at_record at_rec;
+ int status;
+ DAT_RETURN dat_status;
+
+ ipv4_addr = (struct sockaddr_in*)&hca_ptr->hca_address;
+ ipv4_addr->sin_family = AF_INET;
+ ipv4_addr->sin_addr.s_addr = 0;
+
+ at_comp.fn = dapli_ip_comp_handler;
+ at_comp.context = &at_rec;
+ at_rec.addr = &hca_ptr->hca_address;
+ at_rec.wait_object = &hca_ptr->ib_trans.wait_object;
+
+ /* call with async_comp until the sync version works */
+ status = ib_at_ips_by_gid(&hca_ptr->ib_trans.gid, &ipv4_addr->sin_addr.s_addr, 1,
+ &at_comp, &at_rec.req_id);
+
+ if (status < 0)
return 1;
-
- if ( h_ptr->h_addrtype == AF_INET ) {
- ipv4_addr = (struct sockaddr_in*) addr;
- ipv4_addr->sin_family = AF_INET;
- dapl_os_memcpy( &ipv4_addr->sin_addr, h_ptr->h_addr_list[0], 4 );
- } else
+
+ if (status > 0)
+ dapli_ip_comp_handler(at_rec.req_id, (void*)ipv4_addr, status);
+
+ /* wait for answer, 5 seconds max */
+ dat_status = dapl_os_wait_object_wait (&hca_ptr->ib_trans.wait_object,5000000);
+
+ if ((dat_status != DAT_SUCCESS ) || (!ipv4_addr->sin_addr.s_addr))
return 1;
-
+
return 0;
}
@@ -152,14 +165,17 @@
*/
int32_t dapls_ib_init (void)
{
- if (dapli_cm_thread_init())
- return -1;
- else
- return 0;
+ dapl_dbg_log (DAPL_DBG_TYPE_UTIL, " dapl_ib_init: \n" );
+ if (dapli_cm_thread_init() || dapli_at_thread_init())
+ return 1;
+
+ return 0;
}
int32_t dapls_ib_release (void)
{
+ dapl_dbg_log (DAPL_DBG_TYPE_UTIL, " dapl_ib_release: \n" );
+ dapli_at_thread_destroy();
dapli_cm_thread_destroy();
return 0;
}
@@ -186,7 +202,6 @@
IN DAPL_HCA *hca_ptr)
{
struct dlist *dev_list;
- DAT_RETURN dat_status = DAT_SUCCESS;
dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
" open_hca: %s - %p\n", hca_name, hca_ptr );
@@ -217,36 +232,46 @@
ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
return DAT_INTERNAL_ERROR;
}
-
+
/* set inline max with enviromment or default, get local lid and gid 0 */
hca_ptr->ib_trans.max_inline_send =
dapl_os_get_env_val ( "DAPL_MAX_INLINE", INLINE_SEND_DEFAULT );
- if ( dapli_get_lid(hca_ptr, hca_ptr->port_num,
- &hca_ptr->ib_trans.lid )) {
+ if (dapli_get_lid(hca_ptr, hca_ptr->port_num,
+ &hca_ptr->ib_trans.lid)) {
dapl_dbg_log (DAPL_DBG_TYPE_ERR,
" open_hca: IB get LID failed for %s\n",
ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
- return DAT_INTERNAL_ERROR;
+ goto bail;
}
- if ( dapli_get_gid(hca_ptr, hca_ptr->port_num, 0,
- &hca_ptr->ib_trans.gid )) {
+ if (dapli_get_gid(hca_ptr, hca_ptr->port_num, 0,
+ &hca_ptr->ib_trans.gid)) {
dapl_dbg_log (DAPL_DBG_TYPE_ERR,
" open_hca: IB get GID failed for %s\n",
ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
- return DAT_INTERNAL_ERROR;
+ goto bail;
}
-
/* get the IP address of the device */
- if ( dapli_get_addr((char*)&hca_ptr->hca_address,
- sizeof(DAT_SOCK_ADDR6) )) {
+ if (dapli_get_hca_addr(hca_ptr)) {
dapl_dbg_log (DAPL_DBG_TYPE_ERR,
" open_hca: IB get ADDR failed for %s\n",
ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
- return DAT_INTERNAL_ERROR;
+ goto bail;
+ }
+
+ /* one thread for each device open */
+ if (dapli_cq_thread_init(hca_ptr)) {
+ dapl_dbg_log (DAPL_DBG_TYPE_ERR,
+ " open_hca: cq_thread_init failed for %s\n",
+ ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
+ goto bail;
}
+ /* initialize cq_lock and wait object */
+ dapl_os_lock_init(&hca_ptr->ib_trans.cq_lock);
+ dapl_os_wait_object_init (&hca_ptr->ib_trans.wait_object);
+
dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
" open_hca: %s, port %d, %s %d.%d.%d.%d INLINE_MAX=%d\n",
ibv_get_device_name(hca_ptr->ib_trans.ib_dev), hca_ptr->port_num,
@@ -257,7 +282,19 @@
((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff,
hca_ptr->ib_trans.max_inline_send );
- return dat_status;
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " open_hca: LID 0x%x GID subnet %016llx id %016llx\n",
+ hca_ptr->ib_trans.lid,
+ (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
+ (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
+
+ return DAT_SUCCESS;
+
+bail:
+ ibv_close_device(hca_ptr->ib_hca_handle);
+ hca_ptr->ib_hca_handle = IB_INVALID_HANDLE;
+ return DAT_INTERNAL_ERROR;
+
}
@@ -282,10 +319,14 @@
dapl_dbg_log (DAPL_DBG_TYPE_UTIL," close_hca: %p\n",hca_ptr);
if (hca_ptr->ib_hca_handle != IB_INVALID_HANDLE) {
+ dapli_cq_thread_destroy(hca_ptr);
if (ibv_close_device(hca_ptr->ib_hca_handle))
return(dapl_convert_errno(errno,"ib_close_device"));
hca_ptr->ib_hca_handle = IB_INVALID_HANDLE;
}
+
+ dapl_os_lock_destroy(&hca_ptr->ib_trans.cq_lock);
+
return (DAT_SUCCESS);
}
@@ -448,35 +489,4 @@
return DAT_SUCCESS;
}
-#ifdef PROVIDER_SPECIFIC_ATTR
-
-/*
- * dapls_set_provider_specific_attr
- *
- * Input:
- * attr_ptr Pointer provider attributes
- *
- * Output:
- * none
- *
- * Returns:
- * void
- */
-DAT_NAMED_ATTR ib_attrs[] = {
- {
- "I_DAT_SEND_INLINE_THRESHOLD",
- "128"
- },
-};
-
-#define SPEC_ATTR_SIZE( x ) (sizeof( x ) / sizeof( DAT_NAMED_ATTR))
-
-void dapls_set_provider_specific_attr(
- IN DAT_PROVIDER_ATTR *attr_ptr )
-{
- attr_ptr->num_provider_specific_attr = SPEC_ATTR_SIZE( ib_attrs );
- attr_ptr->provider_specific_attr = ib_attrs;
-}
-
-#endif
Index: dapl/openib/dapl_ib_cm.c
===================================================================
--- dapl/openib/dapl_ib_cm.c (revision 2899)
+++ dapl/openib/dapl_ib_cm.c (working copy)
@@ -70,19 +70,8 @@
static inline uint64_t cpu_to_be64(uint64_t x) { return x; }
#endif
-#ifndef IB_AT
-
-#include <stdio.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <netinet/tcp.h>
-#include <sysfs/libsysfs.h>
-#include <signal.h>
-
-/* iclust-20 hard coded values, network order */
-#define REMOTE_GID "fe80:0000:0000:0000:0002:c902:0000:4071"
-#define REMOTE_LID "0002"
-
+static int g_at_destroy;
+static DAPL_OS_THREAD g_at_thread;
static int g_cm_destroy;
static DAPL_OS_THREAD g_cm_thread;
static DAPL_OS_LOCK g_cm_lock;
@@ -122,7 +111,7 @@
while (g_cm_destroy) {
struct timespec sleep, remain;
sleep.tv_sec = 0;
- sleep.tv_nsec = 200000000; /* 200 ms */
+ sleep.tv_nsec = 10000000; /* 10 ms */
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" cm_thread_destroy: waiting for cm_thread\n");
nanosleep (&sleep, &remain);
@@ -130,112 +119,70 @@
dapl_dbg_log(DAPL_DBG_TYPE_CM," cm_thread_destroy(%d) exit\n",getpid());
}
-static int ib_at_route_by_ip(uint32_t dst_ip, uint32_t src_ip, int tos, uint16_t flags,
- struct ib_at_ib_route *ib_route,
- struct ib_at_completion *async_comp)
-{
- struct dapl_cm_id *conn = (struct dapl_cm_id *)async_comp->context;
-
- dapl_dbg_log (
- DAPL_DBG_TYPE_CM,
- " CM at_route_by_ip: conn %p cm_id %d src %d.%d.%d.%d -> dst %d.%d.%d.%d (%d)\n",
- conn,conn->cm_id,
- src_ip >> 0 & 0xff, src_ip >> 8 & 0xff,
- src_ip >> 16 & 0xff,src_ip >> 24 & 0xff,
- dst_ip >> 0 & 0xff, dst_ip >> 8 & 0xff,
- dst_ip >> 16 & 0xff,dst_ip >> 24 & 0xff, conn->service_id);
-
- /* use req_id for loopback indication */
- if (( src_ip == dst_ip ) || ( dst_ip == 0x0100007f ))
- async_comp->req_id = 1;
- else
- async_comp->req_id = 0;
-
- return 1;
-}
-
-static int ib_at_paths_by_route(struct ib_at_ib_route *ib_route, uint32_t mpath_type,
- struct ib_sa_path_rec *pr, int npath,
- struct ib_at_completion *async_comp)
+int dapli_at_thread_init(void)
{
- struct dapl_cm_id *conn = (struct dapl_cm_id *)async_comp->context;
- char *env, *token;
- char dgid[40];
- uint16_t *p_gid = (uint16_t*)&ib_route->gid;
+ DAT_RETURN dat_status;
- /* set local path record values and send to remote */
- (void)dapl_os_memzero(pr, sizeof(*pr));
+ dapl_dbg_log(DAPL_DBG_TYPE_CM," at_thread_init(%d)\n", getpid());
- pr->slid = htons(conn->hca->ib_trans.lid);
- pr->sgid.global.subnet_prefix = conn->hca->ib_trans.gid.global.subnet_prefix;
- pr->sgid.global.interface_id = conn->hca->ib_trans.gid.global.interface_id;
+ /* create thread to process AT async requests */
+ dat_status = dapl_os_thread_create(at_thread, NULL, &g_at_thread);
+ if (dat_status != DAT_SUCCESS)
+ {
+ dapl_dbg_log(DAPL_DBG_TYPE_ERR,
+ " at_thread_init: failed to create thread\n");
+ return 1;
+ }
+ return 0;
+}
- env = getenv("DAPL_REMOTE_LID");
- if ( env == NULL )
- env = REMOTE_LID;
- ib_route->lid = strtol(env,NULL,0);
+void dapli_at_thread_destroy(void)
+{
+ dapl_dbg_log(DAPL_DBG_TYPE_CM," at_thread_destroy(%d)\n", getpid());
- env = getenv("DAPL_REMOTE_GID");
- if ( env == NULL )
- env = REMOTE_GID;
+ /* destroy cr_thread and lock */
+ g_at_destroy = 1;
+ pthread_kill( g_at_thread, SIGUSR1 );
+ dapl_dbg_log(DAPL_DBG_TYPE_CM," at_thread_destroy(%d) SIGUSR1 sent\n",getpid());
+ while (g_at_destroy) {
+ struct timespec sleep, remain;
+ sleep.tv_sec = 0;
+ sleep.tv_nsec = 10000000; /* 10 ms */
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " at_thread_destroy: waiting for at_thread\n");
+ nanosleep (&sleep, &remain);
+ }
+ dapl_dbg_log(DAPL_DBG_TYPE_CM," at_thread_destroy(%d) exit\n",getpid());
+}
- dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " ib_at_paths_by_route: remote LID %x GID %s\n",
- ib_route->lid,env);
+void dapli_ip_comp_handler(uint64_t req_id, void *context, int rec_num)
+{
+ struct dapl_at_record *at_rec = context;
- dapl_os_memcpy( dgid, env, 40 );
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " ip_comp_handler: ctxt %p, req_id %lld rec_num %d\n",
+ context, req_id, rec_num);
- /* get GID with token strings and delimiter */
- token = strtok(dgid,":");
- while (token) {
- *p_gid = strtoul(token,NULL,16);
- *p_gid = htons(*p_gid); /* convert each token to network order */
- token = strtok(NULL,":");
- p_gid++;
- }
-
- /* set remote lid and gid, req_id is indication of loopback */
- if ( !async_comp->req_id ) {
- pr->dlid = htons(ib_route->lid);
- pr->dgid.global.subnet_prefix = ib_route->gid.global.subnet_prefix;
- pr->dgid.global.interface_id = ib_route->gid.global.interface_id;
- } else {
- pr->dlid = pr->slid;
- pr->dgid.global.subnet_prefix = pr->sgid.global.subnet_prefix;
- pr->dgid.global.interface_id = pr->sgid.global.interface_id;
- }
-
- pr->reversible = 0x1000000;
- pr->pkey = 0xffff;
- pr->mtu = IBV_MTU_1024;
- pr->mtu_selector = 2;
- pr->rate_selector = 2;
- pr->rate = 3;
- pr->packet_life_time_selector = 2;
- pr->packet_life_time = 2;
+ if ((at_rec) && ( at_rec->req_id == req_id)) {
+ dapl_os_wait_object_wakeup(at_rec->wait_object);
+ return;
+ }
- dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " ib_at_paths_by_route: SRC LID 0x%x GID subnet %016llx id %016llx\n",
- pr->slid,(unsigned long long)(pr->sgid.global.subnet_prefix),
- (unsigned long long)(pr->sgid.global.interface_id) );
- dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " ib_at_paths_by_route: DST LID 0x%x GID subnet %016llx id %016llx\n",
- pr->dlid,(unsigned long long)(pr->dgid.global.subnet_prefix),
- (unsigned long long)(pr->dgid.global.interface_id) );
-
- dapli_path_comp_handler( async_comp->req_id, (void*)conn, 1);
-
- return 0;
+ dapl_dbg_log(DAPL_DBG_TYPE_ERR,
+ " ip_comp_handler: at_rec->req_id %lld != req_id %lld\n",
+ at_rec->req_id, req_id );
}
-#endif /* ifndef IB_AT */
-
static void dapli_path_comp_handler(uint64_t req_id, void *context, int rec_num)
{
struct dapl_cm_id *conn = context;
int status;
ib_cm_events_t event;
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " path_comp_handler: ctxt %p, req_id %lld rec_num %d\n",
+ context, req_id, rec_num);
+
if (rec_num <= 0) {
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" path_comp_handler: resolution err %d retry %d\n",
@@ -249,7 +196,7 @@
status = ib_at_paths_by_route(&conn->dapl_rt, 0,
&conn->dapl_path, 1,
- &conn->dapl_comp);
+ &conn->dapl_comp, &conn->dapl_comp.req_id);
if (status) {
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" path_by_route: err %d id %lld\n",
@@ -287,6 +234,21 @@
int status;
ib_cm_events_t event;
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " rt_comp_handler: conn %p, req_id %lld rec_num %d\n",
+ conn, req_id, rec_num);
+
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " rt_comp_handler: SRC GID subnet %016llx id %016llx\n",
+ (unsigned long long)cpu_to_be64(conn->dapl_rt.sgid.global.subnet_prefix),
+ (unsigned long long)cpu_to_be64(conn->dapl_rt.sgid.global.interface_id) );
+
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " rt_comp_handler: DST GID subnet %016llx id %016llx\n",
+ (unsigned long long)cpu_to_be64(conn->dapl_rt.dgid.global.subnet_prefix),
+ (unsigned long long)cpu_to_be64(conn->dapl_rt.dgid.global.interface_id) );
+
+
if (rec_num <= 0) {
dapl_dbg_log(DAPL_DBG_TYPE_ERR,
" dapl_rt_comp_handler: rec %d retry %d\n",
@@ -298,7 +260,8 @@
}
status = ib_at_route_by_ip(((struct sockaddr_in *)&conn->r_addr)->sin_addr.s_addr,
- 0, 0, 0, &conn->dapl_rt, &conn->dapl_comp);
+ 0, 0, 0, &conn->dapl_rt,
+ &conn->dapl_comp,&conn->dapl_comp.req_id);
if (status < 0) {
dapl_dbg_log(DAPL_DBG_TYPE_ERR, "dapl_rt_comp_handler: "
"ib_at_route_by_ip failed with status %d\n",
@@ -306,9 +269,16 @@
event = IB_CME_DESTINATION_UNREACHABLE;
goto bail;
}
-
if (status == 1)
dapli_rt_comp_handler(conn->dapl_comp.req_id, conn, 1);
+
+ return;
+ }
+
+ if (!conn->dapl_rt.dgid.global.subnet_prefix || req_id != conn->dapl_comp.req_id) {
+ dapl_dbg_log(DAPL_DBG_TYPE_ERR,
+ " dapl_rt_comp_handler: ERROR: unexpected callback req_id=%d(%d)\n",
+ req_id, conn->dapl_comp.req_id );
return;
}
@@ -316,7 +286,7 @@
conn->dapl_comp.context = conn;
conn->retries = 0;
status = ib_at_paths_by_route(&conn->dapl_rt, 0, &conn->dapl_path, 1,
- &conn->dapl_comp);
+ &conn->dapl_comp, &conn->dapl_comp.req_id);
if (status) {
dapl_dbg_log(DAPL_DBG_TYPE_ERR,
"dapl_rt_comp_handler: ib_at_paths_by_route "
@@ -346,8 +316,6 @@
ib_cm_destroy_id(conn->cm_id);
if (conn->ep)
conn->ep->cm_handle = IB_INVALID_HANDLE;
- if (conn->sp)
- conn->sp->cm_srvc_handle = IB_INVALID_HANDLE;
/* take off the CM thread work queue and free */
dapl_os_lock( &g_cm_lock );
@@ -621,10 +589,8 @@
}
/* something to catch the signal */
-static void cm_handler(int signum)
+static void ib_sig_handler(int signum)
{
- dapl_dbg_log (DAPL_DBG_TYPE_CM," cm_thread(%d,0x%x): ENTER cm_handler %d\n",
- getpid(),g_cm_thread,signum);
return;
}
@@ -643,7 +609,7 @@
sigemptyset(&sigset);
sigaddset(&sigset, SIGUSR1);
pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);
- signal( SIGUSR1, cm_handler);
+ signal( SIGUSR1, ib_sig_handler);
dapl_os_lock( &g_cm_lock );
while (!g_cm_destroy) {
@@ -667,7 +633,7 @@
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" cm_thread: GET EVENT fd=%d n=%d\n",
ib_cm_get_fd(),ret);
- if (ib_cm_event_get(&event)) {
+ if (ib_cm_event_get_timed(0,&event)) {
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" cm_thread: ERR %s eventi_get on %d\n",
strerror(errno), ib_cm_get_fd() );
@@ -732,6 +698,33 @@
g_cm_destroy = 0;
}
+/* async AT processing thread */
+void at_thread(void *arg)
+{
+ sigset_t sigset;
+
+ dapl_dbg_log (DAPL_DBG_TYPE_CM,
+ " at_thread(%d,0x%x): ENTER: at_fd %d\n",
+ getpid(), g_at_thread, ib_at_get_fd());
+
+ sigemptyset(&sigset);
+ sigaddset(&sigset, SIGUSR1);
+ pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);
+ signal(SIGUSR1, ib_sig_handler);
+
+ while (!g_at_destroy) {
+ /* poll forever until callback or signal */
+ if (ib_at_callback_get_timed(-1) < 0) {
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " at_thread: SIG? ret=%s, destroy=%d\n",
+ strerror(errno), g_at_destroy );
+ }
+ dapl_dbg_log(DAPL_DBG_TYPE_CM," at_thread: callback woke\n");
+ }
+ dapl_dbg_log(DAPL_DBG_TYPE_CM," at_thread(%d) EXIT \n", getpid());
+ g_at_destroy = 0;
+}
+
/************************ DAPL provider entry points **********************/
/*
@@ -826,33 +819,34 @@
conn->dapl_comp.context = conn;
conn->retries = 0;
dapl_os_memcpy(&conn->r_addr, r_addr, sizeof(DAT_SOCK_ADDR6));
+
+ /* put on CM thread work queue */
+ dapl_llist_init_entry((DAPL_LLIST_ENTRY*)&conn->entry);
+ dapl_os_lock( &g_cm_lock );
+ dapl_llist_add_tail(&g_cm_list,
+ (DAPL_LLIST_ENTRY*)&conn->entry, conn);
+ dapl_os_unlock(&g_cm_lock);
status = ib_at_route_by_ip(
((struct sockaddr_in *)&conn->r_addr)->sin_addr.s_addr,
((struct sockaddr_in *)&conn->hca->hca_address)->sin_addr.s_addr,
- 0, 0, &conn->dapl_rt, &conn->dapl_comp);
+ 0, 0, &conn->dapl_rt, &conn->dapl_comp, &conn->dapl_comp.req_id);
+
+ dapl_dbg_log(DAPL_DBG_TYPE_CM, " connect: at_route ret=%d,%s req_id %d GID %016llx
%016llx\n",
+ status, strerror(errno), conn->dapl_comp.req_id,
+ (unsigned long long)cpu_to_be64(conn->dapl_rt.dgid.global.subnet_prefix),
+ (unsigned long long)cpu_to_be64(conn->dapl_rt.dgid.global.interface_id) );
if (status < 0) {
dat_status = dapl_convert_errno(errno,"ib_at_route_by_ip");
- goto destroy;
+ dapli_destroy_cm_id(conn);
+ return dat_status;
}
- if (status == 1)
- dapli_rt_comp_handler(conn->dapl_comp.req_id, conn, 1);
-
- /* put on CM thread work queue */
- dapl_llist_init_entry((DAPL_LLIST_ENTRY*)&conn->entry);
- dapl_os_lock( &g_cm_lock );
- dapl_llist_add_tail(&g_cm_list,
- (DAPL_LLIST_ENTRY*)&conn->entry, conn);
- dapl_os_unlock(&g_cm_lock);
+ if (status > 0)
+ dapli_rt_comp_handler(conn->dapl_comp.req_id, conn, status);
return DAT_SUCCESS;
-
-destroy:
- dapli_destroy_cm_id(conn);
- return dat_status;
-
}
/*
@@ -992,6 +986,13 @@
conn->hca = ia_ptr->hca_ptr;
conn->service_id = ServiceID;
+ /* put on CM thread work queue */
+ dapl_llist_init_entry((DAPL_LLIST_ENTRY*)&conn->entry);
+ dapl_os_lock( &g_cm_lock );
+ dapl_llist_add_tail(&g_cm_list,
+ (DAPL_LLIST_ENTRY*)&conn->entry, conn);
+ dapl_os_unlock(&g_cm_lock);
+
dapl_dbg_log(DAPL_DBG_TYPE_EP,
" setup_listener(conn=%p cm_id=%d)\n",
sp_ptr->cm_srvc_handle,conn->cm_id);
@@ -1003,19 +1004,13 @@
dat_status = DAT_CONN_QUAL_IN_USE;
else
dat_status = DAT_INSUFFICIENT_RESOURCES;
- /* success */
- } else {
- /* put on CM thread work queue */
- dapl_llist_init_entry((DAPL_LLIST_ENTRY*)&conn->entry);
- dapl_os_lock( &g_cm_lock );
- dapl_llist_add_tail(&g_cm_list,
- (DAPL_LLIST_ENTRY*)&conn->entry, conn);
- dapl_os_unlock(&g_cm_lock);
+
+ dapli_destroy_cm_id(conn);
return dat_status;
}
- dapli_destroy_cm_id(conn);
- return dat_status;
+ /* success */
+ return DAT_SUCCESS;
}
@@ -1047,9 +1042,11 @@
" remove_listener(ia_ptr %p sp_ptr %p cm_ptr %p)\n",
ia_ptr, sp_ptr, conn );
- if (sp_ptr->cm_srvc_handle != IB_INVALID_HANDLE)
+ if (conn != IB_INVALID_HANDLE) {
+ sp_ptr->cm_srvc_handle = NULL;
dapli_destroy_cm_id(conn);
-
+ }
+
return DAT_SUCCESS;
}
Index: dapl/openib/dapl_ib_util.h
===================================================================
--- dapl/openib/dapl_ib_util.h (revision 2899)
+++ dapl/openib/dapl_ib_util.h (working copy)
@@ -53,6 +53,7 @@
#include <byteswap.h>
#include <infiniband/sa.h>
#include <infiniband/cm.h>
+#include <infiniband/at.h>
/* Typedefs to map common DAPL provider types to IB verbs */
typedef struct ibv_qp *ib_qp_handle_t;
@@ -68,8 +69,8 @@
#define IB_RC_RETRY_COUNT 7
#define IB_RNR_RETRY_COUNT 7
-#define IB_CM_RESPONSE_TIMEOUT 20 /* 4 sec */
-#define IB_MAX_CM_RETRIES 4
+#define IB_CM_RESPONSE_TIMEOUT 18 /* 1 sec */
+#define IB_MAX_CM_RETRIES 7
#define IB_REQ_MRA_TIMEOUT 27 /* a little over 9 minutes */
#define IB_MAX_AT_RETRY 3
@@ -92,21 +93,12 @@
IB_CME_BROKEN
} ib_cm_events_t;
-#ifndef IB_AT
-/* implement a quick hack to exchange GID/LID's until user IB_AT arrives */
-struct ib_at_ib_route {
- union ibv_gid gid;
- uint16_t lid;
+struct dapl_at_record {
+ uint64_t req_id;
+ DAT_SOCK_ADDR6 *addr;
+ DAPL_OS_WAIT_OBJECT *wait_object;
};
-struct ib_at_completion {
- void (*fn)(uint64_t req_id, void *context, int rec_num);
- void *context;
- uint64_t req_id;
-};
-
-#endif
-
/*
* dapl_llist_entry in dapl.h but dapl.h depends on provider
* typedef's in this file first. move dapl_llist_entry out of dapl.h
@@ -122,6 +114,7 @@
struct dapl_cm_id {
struct ib_llist_entry entry;
DAPL_OS_LOCK lock;
+ DAPL_OS_WAIT_OBJECT wait_object;
int retries;
int destroy;
int in_callback;
@@ -238,6 +231,10 @@
{
struct ibv_device *ib_dev;
ib_cq_handle_t ib_cq_empty;
+ DAPL_OS_LOCK cq_lock;
+ DAPL_OS_WAIT_OBJECT wait_object;
+ int cq_destroy;
+ DAPL_OS_THREAD cq_thread;
int max_inline_send;
uint16_t lid;
union ibv_gid gid;
@@ -257,11 +254,18 @@
void cm_thread (void *arg);
int dapli_cm_thread_init(void);
void dapli_cm_thread_destroy(void);
+void at_thread (void *arg);
+int dapli_at_thread_init(void);
+void dapli_at_thread_destroy(void);
+void cq_thread (void *arg);
+int dapli_cq_thread_init(struct dapl_hca *hca_ptr);
+void dapli_cq_thread_destroy(struct dapl_hca *hca_ptr);
-int dapli_get_lid(struct dapl_hca *hca_ptr, int port, uint16_t *lid );
+int dapli_get_lid(struct dapl_hca *hca_ptr, int port, uint16_t *lid);
int dapli_get_gid(struct dapl_hca *hca_ptr, int port, int index,
- union ibv_gid *gid );
-int dapli_get_addr(char *addr, int addr_len);
+ union ibv_gid *gid);
+int dapli_get_hca_addr(struct dapl_hca *hca_ptr);
+void dapli_ip_comp_handler(uint64_t req_id, void *context, int rec_num);
DAT_RETURN
dapls_modify_qp_state ( IN ib_qp_handle_t qp_handle,
Index: dapl/openib/README
===================================================================
--- dapl/openib/README (revision 2899)
+++ dapl/openib/README (working copy)
@@ -39,18 +39,33 @@
server: dtest -s
client: dtest -h hostname
+
+Testing: dtest, dapltest - cl.sh regress.sh
-setup/known issues:
-
- First drop with uCM (without IBAT), tested with simple dtest across 2 nodes.
- hand rolled path records require remote LID and GID set via enviroment:
+Setup:
- export DAPL_REMOTE_GID "fe80:0000:0000:0000:0002:c902:0000:4071"
- export DAPL_REMOTE_LID "0002"
+ Third drop of code, includes uCM and uAT support.
+ NOTE: uAT user library and kernel code in separate branch.
- Also, hard coded (RTR) for use with port 1 only.
-
- no memory windows support in ibverbs, dat_create_rmr fails.
-
+ build uAT library from following branch:
+
+ cd gen2/branches/shaharf-ibat/src/userspace/libibat/
+ ./autogen.sh && ./configure && make && make install
+
+ copy following uat source to latest trunk kernel src:
+ gen2/branches/shaharf-ibat/src/linux-kernel/infiniband/core
+ at.c at_priv.h att.c uat.c uat.h Makefile
+ gen2/branches/shaharf-ibat/src/linux-kernel/infiniband/include
+ ib_at.h ib_user_at.h
+
+ add udev rule:
+ KERNEL="uat*", NAME="infiniband/%k"
+
+
+Known issues:
+ no memory windows support in ibverbs, dat_create_rmr fails.
+ some uCM scale up issues with an 8 thread dapltest in regress.sh
+ hard coded modify QP RTR to port 1, waiting for ib_cm_init_qp_attr call.
+
Index: dapl/openib/dapl_ib_cq.c
===================================================================
--- dapl/openib/dapl_ib_cq.c (revision 2899)
+++ dapl/openib/dapl_ib_cq.c (working copy)
@@ -50,9 +50,96 @@
#include "dapl_adapter_util.h"
#include "dapl_lmr_util.h"
#include "dapl_evd_util.h"
+#include "dapl_ring_buffer_util.h"
#include <sys/poll.h>
+#include <signal.h>
+int dapli_cq_thread_init(struct dapl_hca *hca_ptr)
+{
+ DAT_RETURN dat_status;
+
+ dapl_dbg_log(DAPL_DBG_TYPE_UTIL," cq_thread_init(%p)\n", hca_ptr);
+
+ /* create thread to process inbound connect request */
+ dat_status = dapl_os_thread_create( cq_thread, (void*)hca_ptr, &hca_ptr->ib_trans.cq_thread);
+ if (dat_status != DAT_SUCCESS)
+ {
+ dapl_dbg_log(DAPL_DBG_TYPE_ERR,
+ " cq_thread_init: failed to create thread\n");
+ return 1;
+ }
+ return 0;
+}
+
+void dapli_cq_thread_destroy(struct dapl_hca *hca_ptr)
+{
+ dapl_dbg_log(DAPL_DBG_TYPE_UTIL," cq_thread_destroy(%p)\n", hca_ptr);
+
+ /* destroy cr_thread and lock */
+ hca_ptr->ib_trans.cq_destroy = 1;
+ pthread_kill(hca_ptr->ib_trans.cq_thread, SIGUSR1);
+ dapl_dbg_log(DAPL_DBG_TYPE_CM," cq_thread_destroy(%p) SIGUSR1 sent\n",hca_ptr);
+ while (hca_ptr->ib_trans.cq_destroy != 2) {
+ struct timespec sleep, remain;
+ sleep.tv_sec = 0;
+ sleep.tv_nsec = 10000000; /* 10 ms */
+ dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
+ " cq_thread_destroy: waiting for cq_thread\n");
+ nanosleep (&sleep, &remain);
+ }
+ dapl_dbg_log(DAPL_DBG_TYPE_UTIL," cq_thread_destroy(%d) exit\n",getpid());
+ return;
+}
+
+/* something to catch the signal */
+static void ib_cq_handler(int signum)
+{
+ return;
+}
+
+void cq_thread( void *arg )
+{
+ struct dapl_hca *hca_ptr = arg;
+ struct dapl_evd *evd_ptr;
+ struct ibv_cq *ibv_cq = NULL;
+ sigset_t sigset;
+ int status = 0;
+
+ dapl_dbg_log ( DAPL_DBG_TYPE_UTIL," cq_thread: ENTER hca %p\n",hca_ptr);
+
+ sigemptyset(&sigset);
+ sigaddset(&sigset,SIGUSR1);
+ pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);
+ signal(SIGUSR1, ib_cq_handler);
+
+ /* wait on DTO event, or signal to abort */
+ while (!hca_ptr->ib_trans.cq_destroy) {
+
+ struct pollfd cq_poll = {
+ .fd = hca_ptr->ib_hca_handle->cq_fd[0],
+ .events = POLLIN,
+ .revents = 0
+ };
+ status = poll(&cq_poll, 1, -1);
+ if ((status == 1) &&
+ (!ibv_get_cq_event(hca_ptr->ib_hca_handle, 0, &ibv_cq, (void*)&evd_ptr))) {
+
+ if (DAPL_BAD_HANDLE(evd_ptr, DAPL_MAGIC_EVD))
+ continue;
+
+ /* process DTO event via callback */
+ dapl_evd_dto_callback ( evd_ptr->header.owner_ia->hca_ptr->ib_hca_handle,
+ evd_ptr->ib_cq_handle,
+ (void*)evd_ptr );
+ } else {
+
+ }
+ }
+ hca_ptr->ib_trans.cq_destroy = 2;
+ dapl_dbg_log(DAPL_DBG_TYPE_UTIL," cq_thread: EXIT: hca %p \n", hca_ptr);
+ return;
+}
/*
* Map all verbs DTO completion codes to the DAT equivelent.
*
@@ -410,9 +497,9 @@
IN DAPL_EVD *evd_ptr,
IN ib_wait_obj_handle_t *p_cq_wait_obj_handle )
{
- dapl_dbg_log ( DAPL_DBG_TYPE_UTIL,
+ dapl_dbg_log ( DAPL_DBG_TYPE_CM,
" cq_object_create: (%p)=%p\n",
- p_cq_wait_obj_handle, *p_cq_wait_obj_handle);
+ p_cq_wait_obj_handle, evd_ptr );
/* set cq_wait object to evd_ptr */
*p_cq_wait_obj_handle = evd_ptr;
@@ -447,33 +534,86 @@
{
DAPL_EVD *evd_ptr = p_cq_wait_obj_handle;
ib_cq_handle_t cq = evd_ptr->ib_cq_handle;
- struct ibv_cq *ibv_cq;
- void *ibv_ctx;
- int status = -ETIMEDOUT;
+ struct ibv_cq *ibv_cq = NULL;
+ void *ibv_ctx = NULL;
+ int status = 0;
- dapl_dbg_log ( DAPL_DBG_TYPE_UTIL,
+ dapl_dbg_log ( DAPL_DBG_TYPE_CM,
" cq_object_wait: dev %p evd %p cq %p, time %d\n",
cq->context, evd_ptr, cq, timeout );
- /* Multiple EVD's sharing one event handle for now */
- if (cq) {
- struct pollfd cq_poll = {
- .fd = cq->context->cq_fd[0],
- .events = POLLIN
+ /* Multiple EVD's sharing one event handle for now until uverbs supports more */
+
+ /*
+ * This makes it very inefficient and tricky to manage multiple CQ per device open
+ * For example: 4 threads waiting on separate CQ events will all be woke when
+ * a CQ event fires. So the poll wakes up and the first thread to get to the
+ * the get_cq_event wins and the other 3 will block. The dapl_evd_wait code
+ * above will immediately do a poll_cq after returning from CQ wait and if
+ * nothing on the queue will call this wait again and go back to sleep. So
+ * as long as they all wake up, a mutex is held around the get_cq_event
+ * so no blocking occurs and they all return then everything should work.
+ * Of course, the timeout needs adjusted on the threads that go back to sleep.
+ */
+ while (cq) {
+ struct pollfd cq_poll = {
+ .fd = cq->context->cq_fd[0],
+ .events = POLLIN,
+ .revents = 0
};
- int timeout_ms = -1;
+ int timeout_ms = -1;
if (timeout != DAT_TIMEOUT_INFINITE)
timeout_ms = timeout/1000;
-
+
+ /* check if another thread processed the event already, pending queue > 0 */
+ dapl_os_lock( &evd_ptr->header.owner_ia->hca_ptr->ib_trans.cq_lock );
+ if (dapls_rbuf_count(&evd_ptr->pending_event_queue)) {
+ dapl_os_unlock( &evd_ptr->header.owner_ia->hca_ptr->ib_trans.cq_lock );
+ break;
+ }
+ dapl_os_unlock( &evd_ptr->header.owner_ia->hca_ptr->ib_trans.cq_lock );
+
+ dapl_dbg_log ( DAPL_DBG_TYPE_CM," cq_object_wait: polling\n");
status = poll(&cq_poll, 1, timeout_ms);
- if (status == 1)
- status = ibv_get_cq_event(cq->context,
- 0, &ibv_cq, &ibv_ctx);
- }
- dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
- " cq_object_wait: RET cq %p ibv_cq %p ibv_ctx %p %x\n",
- cq,ibv_cq,ibv_ctx,status);
+ dapl_dbg_log ( DAPL_DBG_TYPE_CM," cq_object_wait: poll returned status=%d\n",status);
+
+ /*
+ * If poll with timeout wakes then hold mutex around a poll with no timeout
+ * so subsequent get_cq_events will be guaranteed not to block
+ * If the event does not belong to this EVD then put it on proper EVD pending
+ * queue under the mutex.
+ */
+ if (status == 1) {
+ dapl_os_lock( &evd_ptr->header.owner_ia->hca_ptr->ib_trans.cq_lock );
+ status = poll(&cq_poll, 1, 0);
+ if (status == 1) {
+ status = ibv_get_cq_event(cq->context,
+ 0, &ibv_cq, &ibv_ctx);
+
+ /* if event is not ours, put on proper evd pending queue */
+ /* force another wakeup */
+ if ((ibv_ctx != evd_ptr ) &&
+ (!DAPL_BAD_HANDLE(ibv_ctx, DAPL_MAGIC_EVD))) {
+ dapl_dbg_log (DAPL_DBG_TYPE_CM,
+ " cq_object_wait: ibv_ctx %p != evd %p\n",
+ ibv_ctx, evd_ptr);
+ dapls_evd_copy_cq((struct evd_ptr*)ibv_ctx);
+ dapl_os_unlock( &evd_ptr->header.owner_ia->hca_ptr->ib_trans.cq_lock );
+ continue;
+ }
+ }
+ dapl_os_unlock( &evd_ptr->header.owner_ia->hca_ptr->ib_trans.cq_lock );
+ break;
+
+ } else if (status == 0) {
+ status = ETIMEDOUT;
+ break;
+ }
+ }
+ dapl_dbg_log (DAPL_DBG_TYPE_CM,
+ " cq_object_wait: RET evd %p cq %p ibv_cq %p ibv_ctx %p %s\n",
+ evd_ptr, cq,ibv_cq,ibv_ctx,strerror(errno));
return(dapl_convert_errno(status,"cq_wait_object_wait"));
More information about the general
mailing list