[Pkg-ofed-commits] [dapl] 03/06: Imported Upstream version 2.1.7

Ana Beatriz Guerrero López ana at moszumanska.debian.org
Fri Mar 25 17:45:17 UTC 2016


This is an automated email from the git hooks/post-receive script.

ana pushed a commit to branch master
in repository dapl.

commit c8e953b3caa599565fc0f7fb76879f194f86f153
Author: Ana Beatriz Guerrero Lopez <ana at debian.org>
Date:   Fri Mar 25 18:44:26 2016 +0100

    Imported Upstream version 2.1.7
---
 ChangeLog                                     | 135 ++++++++++++++
 configure                                     |  20 +-
 configure.in                                  |   4 +-
 dapl.spec                                     |   5 +-
 dapl.spec.in                                  |   3 +
 dapl/openib_common/collectives/fca_provider.c |  18 +-
 dapl/openib_common/dapl_ib_common.h           |   1 +
 dapl/openib_common/dapl_mic_common.h          |   9 +-
 dapl/openib_common/qp.c                       |   1 +
 dapl/openib_mcm/cm.c                          |  16 ++
 dapl/openib_mcm/device.c                      |   4 +-
 dapl/openib_mcm/proxy.c                       |  15 +-
 dapl/openib_ucm/dapl_ib_util.h                |   1 +
 dapl/openib_ucm/device.c                      |   3 +-
 dapl/svc/mcm.c                                |  12 +-
 dapl/svc/mix.c                                |  46 +++--
 dapl/svc/mpxy_in.c                            | 109 +++++++----
 dapl/svc/mpxy_out.c                           |  15 +-
 dapl/svc/mpxyd.c                              |  10 +-
 dapl/svc/mpxyd.h                              |   3 +-
 dapl/svc/util.c                               |   6 +
 dat/common/dat_api.c                          |   3 +
 test/dapltest/cmd/dapl_params.c               |   1 +
 test/dtest/dtest.c                            | 256 +++++++++++++++++++++-----
 24 files changed, 561 insertions(+), 135 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 3b10b3c..2851a0d 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,138 @@
+commit 963e5d793867644c770c087f1ef443550779ca8c
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Tue Sep 29 09:05:27 2015 -0700
+
+    dtest: add -a -i options, all data sizes, incremental size
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 5410203cf0f5908540b43bfa2a219de4b9042a01
+Author: Bharat Potnuri <bharat at chelsio.com>
+Date:   Tue Sep 29 08:49:10 2015 -0700
+
+    dapl: Fix segfault while freeing qp
+    
+    In function dapls_ib_qp_free(), pointers qp and cm_ptr->cm_id->qp are pointing to the same qp
+    structure, initialized in function dapls_ib_qp_alloc(). The memory pointed by these pointers are freed
+    twice in function dapls_ib_qp_free(), using rdma_destroy_qp() for the case _OPENIB_CMA defined and
+    then further using ibv_destroy_qp(), causing a segmentation fault while freeing the qp. Therefore
+    assigned NULL value to qp to avoid freeing illegal memory.
+    
+    Fixes: 7ff4f840bf11 ("common: add CM-EP linking to support mutiple CM's and proper protection during
+    destruction")
+    
+    Signed-off-by: Bharat Potnuri <bharat at chelsio.com>
+    Acked-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit fb64e157b9dd741ba942db00ceee37ea0f4ddcab
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Wed Sep 23 14:43:38 2015 -0700
+
+    mpxyd: add P2P inline support for data size <= 96 bytes
+    
+    Improve small message latency for proxy to proxy service
+    by including data with the proxy work request. Necessary
+    changes made to preserve order across WR's regardless
+    of size. Additional logging included. Improves single byte
+    one-way latency of about 27% on MFO configurations.
+    
+    Changes made to avoid forwarding 0-byte rdma write to
+    scif_writeto, remove CPU hand copies, and order.
+    
+    Changes for numa_node == -1 such that mic0 assumes MSS
+    and mic1 assumes MXS modes.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+    Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit 2f7f25a808d9e6b3ee613c5b8b9a9e2f5abe5c55
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Mon Sep 21 15:48:15 2015 -0700
+
+    dtest: change rdma_write_ping_pong so client is always last receiver
+    
+    server always waits after test loops for DREQ event so in order
+    to gracefully shutdown client should always receive last handshake
+    message and issue DREQ. Remove logging in loop.
+    
+    Always init data and increase min rdma buffer size to 4KB.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 453373f018a1c823125f6dd95952b343987b1480
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Mon Sep 21 08:24:01 2015 -0700
+
+    ucm: add DAPL_NETWORK_PROCESS_NUM option for total ranks
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 779dfdfe4ebc6f287544e8aad589e1578a58537a
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Wed Sep 16 17:31:13 2015 -0700
+
+    ucm: fca create group incorrectly using IB addr instead of socket address.
+    
+    need the socket address for socket based create group info exchange.
+    
+    Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit 7476da8d3c523f9a719748a046d339ea29f41aef
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Wed Sep 16 17:27:27 2015 -0700
+
+    ucm: fca_comm_destroy called with NULL
+    
+    In some cases dapli_free_collective_group is called before the comm has been initialized,
+    and the fca_comm_destroy call in this function then seg faults.
+    
+    Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit f0d97457ba86bdc13901dc37996d2f7419f64360
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Tue Sep 15 08:45:03 2015 -0700
+
+    dtest: add -W option for rdma write ping-pong, similar to ib_write_lat
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 547d1fe1257bf4709bf38eae8c8013d320a04432
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Mon Aug 31 15:14:46 2015 -0700
+
+    docs: update release notes for collective build
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 9b52aa90fefb69d1ba8fdd689f618c2ce250825d
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Mon Aug 24 13:22:53 2015 -0700
+
+    mpxyd: reduce log level for rcv message flush
+    
+    Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit 5a1d9e7f386335dcae676b372269a061e6f5294b
+Author: Carol L Soto <clsoto at linux.vnet.ibm.com>
+Date:   Mon Aug 24 12:58:58 2015 -0700
+
+    dapltest: dapltest with no argument not working in ppc64 arch
+    
+    If dapltest is run with no args then the client was getting
+    Warning: conn_event_wait DAT_CONNECTION_EVENT_NON_PEER_REJECTED
+    Reference to RH1056487- dapltest Read and Write performance
+    tests are not working
+    
+    Signed-off-by: Carol L Soto <clsoto at linux.vnet.ibm.com>
+
+commit 91febc42f0070b2b9eaa81c0c113c6ff7ab8ea60
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Thu Aug 13 09:55:47 2015 -0700
+
+    Release 2.1.6
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
 commit ad43b8d3ca9f67d3231525b2808776719686deba
 Author: Arlin Davis <arlin.r.davis at intel.com>
 Date:   Wed Aug 12 17:30:23 2015 -0700
diff --git a/configure b/configure
index 8648901..a1f1a89 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.67 for dapl 2.1.6.
+# Generated by GNU Autoconf 2.67 for dapl 2.1.7.
 #
 # Report bugs to <linux-rdma at vger.kernel.org>.
 #
@@ -562,8 +562,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='dapl'
 PACKAGE_TARNAME='dapl'
-PACKAGE_VERSION='2.1.6'
-PACKAGE_STRING='dapl 2.1.6'
+PACKAGE_VERSION='2.1.7'
+PACKAGE_STRING='dapl 2.1.7'
 PACKAGE_BUGREPORT='linux-rdma at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -1318,7 +1318,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures dapl 2.1.6 to adapt to many kinds of systems.
+\`configure' configures dapl 2.1.7 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1388,7 +1388,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of dapl 2.1.6:";;
+     short | recursive ) echo "Configuration of dapl 2.1.7:";;
    esac
   cat <<\_ACEOF
 
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-dapl configure 2.1.6
+dapl configure 2.1.7
 generated by GNU Autoconf 2.67
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -1935,7 +1935,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by dapl $as_me 2.1.6, which was
+It was created by dapl $as_me 2.1.7, which was
 generated by GNU Autoconf 2.67.  Invocation command line was
 
   $ $0 $@
@@ -2803,7 +2803,7 @@ fi
 # Define the identity of the package.
 
  PACKAGE=dapl
- VERSION=2.1.6
+ VERSION=2.1.7
 
 
 cat >>confdefs.h <<_ACEOF
@@ -13281,7 +13281,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by dapl $as_me 2.1.6, which was
+This file was extended by dapl $as_me 2.1.7, which was
 generated by GNU Autoconf 2.67.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -13347,7 +13347,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-dapl config.status 2.1.6
+dapl config.status 2.1.7
 configured by $0, generated by GNU Autoconf 2.67,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.in b/configure.in
index 6a792d6..5fbbfe1 100755
--- a/configure.in
+++ b/configure.in
@@ -1,12 +1,12 @@
 dnl Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.57)
-AC_INIT(dapl, 2.1.6, linux-rdma at vger.kernel.org)
+AC_INIT(dapl, 2.1.7, linux-rdma at vger.kernel.org)
 AC_CONFIG_SRCDIR([dat/udat/udat.c])
 AC_CONFIG_AUX_DIR(config)
 AC_CONFIG_MACRO_DIR([m4])
 AM_CONFIG_HEADER(config.h)
-AM_INIT_AUTOMAKE(dapl, 2.1.6)
+AM_INIT_AUTOMAKE(dapl, 2.1.7)
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 AM_PROG_LIBTOOL
diff --git a/dapl.spec b/dapl.spec
index 17e00d7..4d26fd3 100644
--- a/dapl.spec
+++ b/dapl.spec
@@ -37,7 +37,7 @@
 %{!?_CONF: %define _CONF ""}
 
 Name: dapl
-Version: 2.1.6
+Version: 2.1.7
 Release: 1%{?dist}
 Summary: A Library for userspace access to RDMA devices using OS Agnostic DAT APIs, proxy daemon for offloading RDMA 
 
@@ -153,6 +153,9 @@ fi
 mv /tmp/%{version}-dat.conf %{_sysconfdir}/dat.conf
 
 %changelog
+* Tue Sep 29 2015 Arlin Davis <ardavis at ichips.intel.com> - 2.1.7
+- DAT/DAPL Version 2.1.7 Release 1, OFED 3.18-1 GA
+
 * Wed Aug 12 2015 Arlin Davis <ardavis at ichips.intel.com> - 2.1.6
 - DAT/DAPL Version 2.1.6 Release 1, OFED 3.18-1
 
diff --git a/dapl.spec.in b/dapl.spec.in
index f6ed8fd..2f16477 100755
--- a/dapl.spec.in
+++ b/dapl.spec.in
@@ -153,6 +153,9 @@ fi
 mv /tmp/%{version}-dat.conf %{_sysconfdir}/dat.conf
 
 %changelog
+* Tue Sep 29 2015 Arlin Davis <ardavis at ichips.intel.com> - 2.1.7
+- DAT/DAPL Version 2.1.7 Release 1, OFED 3.18-1 GA
+
 * Wed Aug 12 2015 Arlin Davis <ardavis at ichips.intel.com> - 2.1.6
 - DAT/DAPL Version 2.1.6 Release 1, OFED 3.18-1
 
diff --git a/dapl/openib_common/collectives/fca_provider.c b/dapl/openib_common/collectives/fca_provider.c
index 8d504bb..51bf6a8 100755
--- a/dapl/openib_common/collectives/fca_provider.c
+++ b/dapl/openib_common/collectives/fca_provider.c
@@ -314,7 +314,18 @@ static int create_member(struct dapl_hca *hca)
 	 * only rank0 needs listen, but we don't know who is rank0 yet.
 	 * Everyone listen, start on seed port until find one unused
 	 */
+#ifdef _OPENIB_UCM_
+	if (getlocalipaddr((char*)&tp->m_addr, sizeof(DAT_SOCK_ADDR))) {
+		dapl_log(DAPL_DBG_TYPE_ERR,
+			"create_member: getlocaladdr ERR ret=%s \n", strerror(errno));
+		ret = errno;
+		goto err;
+	}
+	dapl_log(DAPL_DBG_TYPE_EXTENSION, " create_member: UCM local addr %s\n",
+			inet_ntoa(((struct sockaddr_in *)&tp->m_addr)->sin_addr));
+#else
 	memcpy((void*)&tp->m_addr, (void*)&hca->hca_address, sizeof(DAT_SOCK_ADDR));
+#endif
 
 	do {
 		tp->m_addr.sin_port = htons(lport++);
@@ -458,7 +469,7 @@ static void create_group(struct coll_group *group)
 		ret = send(group->sock, &group->id, sizeof(group->id), 0);
 		if (ret < 0) {
 			dapl_log(DAPL_DBG_TYPE_ERR,
-				 " create_grp: snd() ERR: %s g_id=\n",
+				 " create_grp: snd() ERR: %s g_id = %d\n",
 				 strerror(errno), group->id);
 			goto error;
 		}
@@ -764,7 +775,7 @@ dapli_create_collective_group(
 	int i;
 
 	dapl_log(DAPL_DBG_TYPE_EXTENSION,
-		" create_grp[%d]: enter evd=%p cq=%p pz=%p "
+		" dapli create_grp[%d]: enter evd=%p cq=%p pz=%p "
 		"m=%p *m=%p t_ranks=%d g_id=%d l_idx=%d l_ranks=%d\n",
 		self, evd, evd->ib_cq_handle, pz,
 		members, *members, ranks, id, g_info->local_rank,
@@ -921,7 +932,8 @@ dapli_free_collective_group(
 			     group->ranks *
 			     group->tp->m_size);
 
-	fca_comm_destroy(group->comm);
+	if (group->comm)
+		fca_comm_destroy(group->comm);
 
 	if (group->self == 0)
 		fca_comm_end(group->ctx, group->comm_desc.comm_id);
diff --git a/dapl/openib_common/dapl_ib_common.h b/dapl/openib_common/dapl_ib_common.h
index 1ac0c12..69ec31b 100755
--- a/dapl/openib_common/dapl_ib_common.h
+++ b/dapl/openib_common/dapl_ib_common.h
@@ -65,6 +65,7 @@ struct dcm_ib_qp {
 	struct mcm_wrc_info	 wrc;	   /* local WC info */
 	struct mcm_wrc_info	 wrc_rem;  /* remote WR info */
 	DAPL_OS_LOCK		 lock;	   /* Proxy WR and WC queues */
+	uint8_t			 p2p_data; /* Max number of bytes to pass to proxy in the WR */
 	uint8_t			 ep_map;   /* Peer EP mapping, MXS, MSS, HST */
 	uint32_t		 seg_sz;   /* Peer MXS Proxy-in segment size */
 	char			 *wr_buf_rx; /* mcm_wr_rx_t entries, devices without inline data  */
diff --git a/dapl/openib_common/dapl_mic_common.h b/dapl/openib_common/dapl_mic_common.h
index 5afc8ec..86a815e 100755
--- a/dapl/openib_common/dapl_mic_common.h
+++ b/dapl/openib_common/dapl_mic_common.h
@@ -58,6 +58,7 @@
 #define DAT_MCM_PDATA_SIZE      64
 #define DAT_MCM_PROXY_DATA	40
 #define DAT_MCM_SEG_PO2		17
+#define DAT_MCM_P2P_INLINE	96
 
 #define ALIGN_64(o)  ((o + 64 - 1) & ~(64-1))
 #define ALIGN_P64(o) ((((uintptr_t)o) + 64 - 1)& ~(64-1))
@@ -843,6 +844,7 @@ enum mcm_wr_flags {
 
 	M_READ_FROM_DONE	= 1 << 16, /* m_wr mpxyd read_from_done, ready for posting */
 	M_SEND_DIRECT		= 1 << 17, /* m_wr SEND direct from host memory, no proxy out buffer */
+	M_PROXY_INLINE		= 1 << 18, /* m_wr contains the data */
 };
 
 /* 80 bytes */
@@ -881,7 +883,7 @@ typedef struct mcm_wrc_info {
 	uint16_t	wc_end;
 } __attribute__((packed)) mcm_wrc_info_t;
 
-/* WR: 160 bytes, direct RDMA write from remote Proxy-in service */
+/* WR: 256 bytes, direct RDMA write from remote Proxy-in service */
 typedef struct mcm_wr_rx {
 	struct dat_mix_wr	wr;
 	struct dat_mix_sge   	sg[DAT_MIX_SGE_MAX];
@@ -893,8 +895,11 @@ typedef struct mcm_wr_rx {
 	uint32_t		flags;
 	uint32_t		time;
 	uint32_t		qcnt;
+	char			inline_data[DAT_MCM_P2P_INLINE];
 } __attribute__((packed)) mcm_wr_rx_t;
 
+#define MCM_WR_RX_NO_DATA	(sizeof(mcm_wr_rx_t) - DAT_MCM_P2P_INLINE)
+
 /* WC: 80 bytes, direct RDMA write from remote Proxy-in service */
 typedef struct mcm_wc_rx {
 	struct dat_mix_wc	wc;
@@ -948,7 +953,7 @@ static inline void mcm_hton_wr_rx(struct mcm_wr_rx *m_wr_rx, struct mcm_wr *m_wr
 {
 	int i;
 
-	memset((void*)m_wr_rx, 0, sizeof(*m_wr_rx));
+	memset((void*)m_wr_rx, 0, MCM_WR_RX_NO_DATA);
 	m_wr_rx->org_id = (uint64_t) htonll((uint64_t)m_wr); /* proxy_out WR */
 	m_wr_rx->flags = htonl(m_wr->flags);
 	m_wr_rx->w_idx = htonl(wc_tl); /* snd back wc tail */
diff --git a/dapl/openib_common/qp.c b/dapl/openib_common/qp.c
index 527fc1d..01f91ca 100755
--- a/dapl/openib_common/qp.c
+++ b/dapl/openib_common/qp.c
@@ -397,6 +397,7 @@ DAT_RETURN dapls_ib_qp_free(IN DAPL_IA * ia_ptr, IN DAPL_EP * ep_ptr)
 #ifdef _OPENIB_CMA_
 		rdma_destroy_qp(cm_ptr->cm_id);
 		cm_ptr->cm_id->qp = NULL;
+		qp = NULL;
 #endif
 
 #ifdef _OPENIB_MCM_
diff --git a/dapl/openib_mcm/cm.c b/dapl/openib_mcm/cm.c
index 56ed23a..f2a4b8d 100755
--- a/dapl/openib_mcm/cm.c
+++ b/dapl/openib_mcm/cm.c
@@ -1198,6 +1198,14 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
 		if (MXF_EP(&cm->msg.daddr1)) {
 			/* save PI WR info, create local WC_q, send back WC info */
 			mcm_ntoh_wrc(&ep->qp_handle->wrc_rem, (mcm_wrc_info_t*)cm->msg.p_proxy);
+			if (ep->qp_handle->wrc_rem.wr_sz > MCM_WR_RX_NO_DATA)
+				ep->qp_handle->p2p_data = DAT_MCM_P2P_INLINE;
+			else
+				ep->qp_handle->p2p_data = 0;
+
+			dapl_log(DAPL_DBG_TYPE_CM, "CONN_RTU: qp %p set proxy max inline to %d\n",
+					ep->qp_handle, ep->qp_handle->p2p_data);
+
 			mcm_create_wc_q(ep->qp_handle, ep->qp_handle->wrc_rem.wr_end + 1);
 			mcm_hton_wrc((mcm_wrc_info_t*)cm->msg.p_proxy, &ep->qp_handle->wrc);
 			ep->qp_handle->ep_map = cm->msg.daddr1.ep_map;
@@ -1522,6 +1530,14 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
 		if (MXF_EP(&cm->msg.daddr1)) {
 			/* save PI WR info, create local WC_q, send back WC info */
 			mcm_ntoh_wrc(&ep->qp_handle->wrc_rem, (mcm_wrc_info_t*)cm->msg.p_proxy);
+			if (ep->qp_handle->wrc_rem.wr_sz > MCM_WR_RX_NO_DATA)
+				ep->qp_handle->p2p_data = DAT_MCM_P2P_INLINE;
+			else
+				ep->qp_handle->p2p_data = 0;
+
+			dapl_log(DAPL_DBG_TYPE_CM, "ACCEPT_USR: qp %p set proxy max inline to %d\n",
+					ep->qp_handle, ep->qp_handle->p2p_data);
+
 			mcm_create_wc_q(ep->qp_handle, ep->qp_handle->wrc_rem.wr_end + 1);
 			mcm_hton_wrc((mcm_wrc_info_t*)cm->msg.p_proxy, &ep->qp_handle->wrc);
 			ep->qp_handle->ep_map = cm->msg.daddr1.ep_map;
diff --git a/dapl/openib_mcm/device.c b/dapl/openib_mcm/device.c
index 92ab201..86c4565 100755
--- a/dapl/openib_mcm/device.c
+++ b/dapl/openib_mcm/device.c
@@ -191,8 +191,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name,
 		 flags & DAPL_OPEN_QUERY ? "QUERY MODE":"STD MODE");
 
 	/* set RC tunables via enviroment or default */
-	hca_ptr->ib_trans.ib_cm.max_inline =
-	    dapl_os_get_env_val("DAPL_MAX_INLINE", INLINE_SEND_IB_DEFAULT);
 	hca_ptr->ib_trans.ib_cm.ack_retry =
 	    dapl_os_get_env_val("DAPL_ACK_RETRY", DCM_ACK_RETRY);
 	hca_ptr->ib_trans.ib_cm.ack_timer =
@@ -316,7 +314,7 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name,
 	if (dapl_ib_inline_data(hca_ptr->ib_hca_handle)) {
 		hca_ptr->ib_trans.ib_cm.max_inline =
 			dapl_os_get_env_val("DAPL_MAX_INLINE",
-					    INLINE_SEND_IB_DEFAULT);
+					    sizeof(mcm_wr_rx_t));
 	}
 	else
 		hca_ptr->ib_trans.ib_cm.max_inline = 0;
diff --git a/dapl/openib_mcm/proxy.c b/dapl/openib_mcm/proxy.c
index 824f575..5abb8b1 100755
--- a/dapl/openib_mcm/proxy.c
+++ b/dapl/openib_mcm/proxy.c
@@ -110,14 +110,21 @@ int mcm_send_pi(struct dcm_ib_qp *m_qp,
 				sge.lkey = m_qp->wr_buf_rx_mr->lkey;
 			}
 			sge.addr = (uint64_t)(uintptr_t) wr_rx_ptr;
-			sge.length = (uint32_t) sizeof(struct mcm_wr_rx); /* 160 byte WR */
+			sge.length = (uint32_t)m_qp->wrc_rem.wr_sz;
+
+			if (m_qp->p2p_data && seg_len < m_qp->p2p_data) {
+				dapl_log(DAPL_DBG_TYPE_EP,
+					 " Sending p2p data, len %d\n", seg_len);
+				wr_flags |= M_PROXY_INLINE;
+				memcpy(wr_rx_ptr->inline_data, (void *)(l_addr + l_off), seg_len);
+			}
 
 			dapl_log(DAPL_DBG_TYPE_EP,
 				 " mcm_send_pi[%d]: seg_ln %d wr_idx %d, tl %d hd %d\n",
 				 i, seg_len, wr_idx, m_qp->wr_tl, m_qp->wr_hd);
 
 			/* build local m_wr_rx for remote PI */
-			memset((void*)wr_rx_ptr, 0, sizeof(struct mcm_wr_rx));
+			memset((void*)wr_rx_ptr, 0, MCM_WR_RX_NO_DATA);
 			wr_rx_ptr->org_id = (uint64_t) htonll((uint64_t)wr->wr_id);
 			wr_rx_ptr->flags = htonl(wr_flags);
 			wr_rx_ptr->w_idx = htonl(m_qp->wc_tl); /* snd back wc tail */
@@ -168,7 +175,7 @@ int mcm_send_pi(struct dcm_ib_qp *m_qp,
 				 i, wr_imm.wr_id, m_qp->qp2->qp_num, wr_imm.opcode,
 				 wr_flags, ntohl(wr_imm.imm_data),
 				 l_addr + l_off, wr_imm.wr.rdma.remote_addr,
-				 wr_imm.wr.rdma.rkey, sizeof(struct mcm_wr_rx), l_len);
+				 wr_imm.wr.rdma.rkey, sge.length, l_len);
 			dapl_log(DAPL_DBG_TYPE_EP,
 				 " mcm_send_pi[%d]: WR wr_id %Lx qn %x op %d flgs %x"
 				 " imm %x raddr %p rkey %x sg_ln %d tl %d me %d hd %d\n",
@@ -437,7 +444,7 @@ int mcm_create_wc_q(struct dcm_ib_qp *m_qp, int entries)
 		m_qp->wrc.wc_addr, m_qp->wc_mr->addr, ALIGN_PAGE(m_qp->wrc.wc_len),
 		entries, m_qp->wc_mr->rkey, m_qp->wc_mr->lkey);
 
-	if (!m_qp->ep->header.owner_ia->hca_ptr->ib_trans.ib_cm.max_inline) {
+	if (!m_qp->tp->ib_cm.max_inline) {
 
 		if (posix_memalign((void **)&m_qp->wr_buf_rx,
 				   4096, entries * sizeof(mcm_wr_rx_t))) {
diff --git a/dapl/openib_ucm/dapl_ib_util.h b/dapl/openib_ucm/dapl_ib_util.h
index ac74bab..1abe1dc 100755
--- a/dapl/openib_ucm/dapl_ib_util.h
+++ b/dapl/openib_ucm/dapl_ib_util.h
@@ -131,6 +131,7 @@ typedef struct _ib_hca_transport
 	int			drep_time;
 	int			nodes;
 	int			ppn;
+	int			ranks;
 	int 			threshold;
 	DAPL_OS_LOCK		slock;	
 	int			s_hd;
diff --git a/dapl/openib_ucm/device.c b/dapl/openib_ucm/device.c
index 98693b2..f23c77b 100755
--- a/dapl/openib_ucm/device.c
+++ b/dapl/openib_ucm/device.c
@@ -520,6 +520,7 @@ static int ucm_service_create(IN DAPL_HCA *hca)
 	/* setup CM timers and queue sizes based on cluster size */
 	tp->nodes = dapl_os_get_env_val("DAPL_NETWORK_NODES", UCM_DEF_NODES);
 	tp->ppn = dapl_os_get_env_val("DAPL_NETWORK_PPN", UCM_DEF_PPN);
+	tp->ranks = dapl_os_get_env_val("DAPL_NETWORK_PROCESS_NUM", tp->nodes * tp->ppn);
 	tp->threshold = dapl_os_get_env_val("DAPL_NETWORK_TRESHOLD", UCM_DEF_THRESHOLD);
 	tp->retries = dapl_os_get_env_val("DAPL_UCM_RETRY", DCM_RETRY_CNT);
 	tp->wait_time = dapl_os_get_env_val("DAPL_UCM_WAIT_TIME", DCM_WAIT_TIME);
@@ -527,7 +528,7 @@ static int ucm_service_create(IN DAPL_HCA *hca)
 	tp->drep_time = dapl_os_get_env_val("DAPL_UCM_DREP_TIME", DCM_DREP_TIME);
 	tp->cm_timer = dapl_os_get_env_val("DAPL_UCM_TIMER", DCM_CM_TIMER);
 
-	if ((tp->nodes * tp->ppn) <= tp->threshold) {
+	if (tp->ranks <= tp->threshold) {
 		tp->rep_time = dapl_os_get_env_val("DAPL_UCM_REP_TIME", DCM_REP_TIME);
 		tp->rtu_time = dapl_os_get_env_val("DAPL_UCM_RTU_TIME", DCM_RTU_TIME);
 		tp->qpe = dapl_os_get_env_val("DAPL_UCM_QP_SIZE", DCM_QP_SIZE);
diff --git a/dapl/svc/mcm.c b/dapl/svc/mcm.c
index bd3149f..4b91090 100755
--- a/dapl/svc/mcm.c
+++ b/dapl/svc/mcm.c
@@ -49,6 +49,8 @@ int mcm_rep_ms = 4000;
 int mcm_rtu_ms = 2000;
 int mcm_dreq_ms = 1000;
 int mcm_proxy_in = 1;
+int mcm_mic0_mss = 1;  /* numa_node invalid, default mic0 == MSS */
+int mcm_mic1_mss = 0;  /* numa_node invalid, default mic1 == MXS */
 
 extern int mix_buffer_sg_po2;
 extern int mcm_rx_entries;
@@ -199,7 +201,13 @@ int mcm_init_cm_service(mcm_ib_dev_t *md)
 			  md->mc->ver == MIX_COMP || mcm_proxy_in == 0)
 			md->addr.ep_map = MIC_SSOCK_DEV;
 		else
-		md->addr.ep_map = MIC_XSOCK_DEV;
+			md->addr.ep_map = MIC_XSOCK_DEV;
+
+		/* Invalid numa, check settings MSS for mic0, MXS for mic1 */
+		if (md->numa_node == -1 && md->mc->scif_id == 1 && mcm_mic0_mss)
+			md->addr.ep_map = MIC_SSOCK_DEV;
+		if (md->numa_node == -1 && md->mc->scif_id == 2 && mcm_mic1_mss)
+			md->addr.ep_map = MIC_SSOCK_DEV;
 	}
 
 	/* setup CM timers and queue sizes */
@@ -468,7 +476,7 @@ void mcm_flush_qp(struct mcm_qp *m_qp)
 	while (m_qp->sr_tl != m_qp->sr_hd) {
 		m_sr = (struct mcm_sr *)(m_qp->sr_buf + (m_qp->sr_sz * m_qp->sr_tl));
 		if (m_sr->wr_id) {
-			mlog(1, " QP %p SR[%d] %p wr_id %Lx dto_event -> IBV_WC_FLUSH_ERR\n",
+			mlog(2, " QP %p SR[%d] %p wr_id %Lx dto_event -> IBV_WC_FLUSH\n",
 				m_qp, m_sr->s_idx, m_sr, m_sr->wr_id);
 			wc.wr_id = m_sr->wr_id;
 			wc.imm_data = 0;
diff --git a/dapl/svc/mix.c b/dapl/svc/mix.c
index ec715f3..cb82499 100755
--- a/dapl/svc/mix.c
+++ b/dapl/svc/mix.c
@@ -207,7 +207,6 @@ void mix_scif_accept(scif_epd_t listen_ep)
 	if (smd)
 		return;
 err:
-	mlog(0, " ERR: open_device -> closing SCIF client EPs %d %d %d \n", op_ep, tx_ep, ev_ep);
 
 out_close_tx_ep:
 	scif_close(tx_ep);
@@ -1594,6 +1593,10 @@ int mix_cm_rep_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
 	memcpy(&m_cm->msg.daddr1, &pkt->saddr1, sizeof(dat_mcm_addr_t));
 	memcpy(&m_cm->msg.daddr2, &pkt->saddr2, sizeof(dat_mcm_addr_t));
 	mcm_ntoh_wrc(&m_cm->m_qp->wrc_rem, (mcm_wrc_info_t *)m_cm->msg.p_proxy); /* peer RI WRC info */
+	if (m_cm->m_qp->wrc_rem.wr_sz > MCM_WR_RX_NO_DATA)
+		m_cm->m_qp->p2p_data = DAT_MCM_P2P_INLINE;
+	else
+		m_cm->m_qp->p2p_data = 0;
 
 	mlog(2, " WRC: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
 		" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n",
@@ -1603,13 +1606,14 @@ int mix_cm_rep_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
 	     m_cm->m_qp->wrc.wc_rkey, m_cm->m_qp->wrc.wc_len,
 	     m_cm->m_qp->wrc.wc_sz, m_cm->m_qp->wrc.wc_end);
 
-	mlog(2, " WRC_rem: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
-		" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n",
-	     m_cm->m_qp, m_cm->m_qp->wrc_rem.wr_addr, m_cm->m_qp->wrc_rem.wr_rkey,
+	mlog(2, " WRC_rem: WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
+		" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d p2p %d\n",
+	     m_cm->m_qp->wrc_rem.wr_addr, m_cm->m_qp->wrc_rem.wr_rkey,
 	     m_cm->m_qp->wrc_rem.wr_len, m_cm->m_qp->wrc_rem.wr_sz,
 	     m_cm->m_qp->wrc_rem.wr_end, m_cm->m_qp->wrc_rem.wc_addr,
 	     m_cm->m_qp->wrc_rem.wc_rkey, m_cm->m_qp->wrc_rem.wc_len,
-	     m_cm->m_qp->wrc_rem.wc_sz, m_cm->m_qp->wrc_rem.wc_end);
+	     m_cm->m_qp->wrc_rem.wc_sz, m_cm->m_qp->wrc_rem.wc_end,
+	     m_cm->m_qp->p2p_data);
 
 	/* MXS <- MSS or HOST, fabric: TX: QP2->QP1 direct, RX: QP1<-QP2 proxy */
 	if ((MXF_EP(&m_cm->md->addr) && !MXF_EP(&m_cm->msg.daddr1)) &&
@@ -1815,10 +1819,15 @@ int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
 	/* MXF_EP <- HST_EP, host sends WC on RTU, save WRC info */
 	if (MXF_EP(&pkt->daddr1) && HST_EP(&pkt->saddr2)) {
 		mcm_ntoh_wrc(&m_cm->m_qp->wrc_rem, (mcm_wrc_info_t *)pkt->p_proxy);
-		mlog(2, " WRC_rem: m_qp %p - addr 0x%Lx rkey 0x%x len %d, sz %d end %d\n",
+		if (m_cm->m_qp->wrc_rem.wr_sz > MCM_WR_RX_NO_DATA)
+			m_cm->m_qp->p2p_data = DAT_MCM_P2P_INLINE;
+		else
+			m_cm->m_qp->p2p_data = 0;
+
+		mlog(2, " WRC_rem: WC addr 0x%Lx rkey 0x%x len %d, sz %d end %d p2p %d\n",
 		     m_cm->m_qp, m_cm->m_qp->wrc_rem.wc_addr, m_cm->m_qp->wrc_rem.wc_rkey,
 		     m_cm->m_qp->wrc_rem.wc_len, m_cm->m_qp->wrc_rem.wc_sz,
-		     m_cm->m_qp->wrc_rem.wc_end);
+		     m_cm->m_qp->wrc_rem.wc_end, m_cm->m_qp->p2p_data);
 	}
 
 	mpxy_lock(&m_cm->lock);
@@ -1902,6 +1911,10 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
 	m_cm->ref_cnt++; /* Passive: QP ref */
 	m_cm->m_qp->cm = m_cm;
 	mcm_ntoh_wrc(&m_cm->m_qp->wrc_rem, (mcm_wrc_info_t *)m_cm->msg.p_proxy); /* save peer PI WRC info */
+	if (m_cm->m_qp->wrc_rem.wr_sz > MCM_WR_RX_NO_DATA)
+		m_cm->m_qp->p2p_data = DAT_MCM_P2P_INLINE;
+	else
+		m_cm->m_qp->p2p_data = 0;
 
 	mlog(2, " WRC: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
 		" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d sg_po2 %d\n",
@@ -1912,13 +1925,14 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
 	     m_cm->m_qp->wrc.wc_sz, m_cm->m_qp->wrc.wc_end,
 	     m_cm->msg.seg_sz);
 
-	mlog(2, " WRC_rem: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
-		" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n",
-	     m_cm->m_qp, m_cm->m_qp->wrc_rem.wr_addr, m_cm->m_qp->wrc_rem.wr_rkey,
+	mlog(2, " WRC_rem: WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
+		" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d p2p %d\n",
+	     m_cm->m_qp->wrc_rem.wr_addr, m_cm->m_qp->wrc_rem.wr_rkey,
 	     m_cm->m_qp->wrc_rem.wr_len, m_cm->m_qp->wrc_rem.wr_sz,
 	     m_cm->m_qp->wrc_rem.wr_end, m_cm->m_qp->wrc_rem.wc_addr,
 	     m_cm->m_qp->wrc_rem.wc_rkey, m_cm->m_qp->wrc_rem.wc_len,
-	     m_cm->m_qp->wrc_rem.wc_sz, m_cm->m_qp->wrc_rem.wc_end);
+	     m_cm->m_qp->wrc_rem.wc_sz, m_cm->m_qp->wrc_rem.wc_end,
+	     m_cm->m_qp->p2p_data);
 
 	/* MXS -> MSS or HOST, remote: need QPr1, saddr1 on mpxyd */
 	if ((MXF_EP(&m_cm->md->addr) && !MXF_EP(&m_cm->msg.daddr1)) &&
@@ -2174,17 +2188,14 @@ static int mix_proxy_out(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, mcm_qp_t *m_qp
 	m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wr_end; /* move hd */
 	m_wr = (struct mcm_wr *)(m_qp->wr_buf + (m_qp->wr_sz * m_qp->wr_hd));
 
-	mlog(4, " inline, m_wr %p m_sge %p len %d hd %d tl %d\n",
-		m_wr, m_wr->sg, len, m_qp->wr_hd, m_qp->wr_tl);
-
 	/* IB rdma write WR */
 	const_ib_rw(&m_wr->wr, &pmsg->wr, m_wr->sg);
 	m_wr->wr.sg_list = m_wr->sg;
 	m_wr->wr.num_sge = len ? 1:0;
 
-	mlog(4, " INLINE m_wr (%p)raddr %p rkey 0x%llx, ib_wr raddr %p rkey 0x%llx \n",
-		&pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.rkey,
-		&m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey);
+	mlog(4, " INLINE m_wr[%d] %p raddr %p rkey 0x%x, ib_wr raddr %p rkey 0x%x %d bytes\n",
+		m_qp->wr_hd, m_wr, pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.rkey,
+		m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey, len);
 
 	/* M_WR */
 	m_wr->org_id = pmsg->wr.wr_id;
@@ -2290,7 +2301,6 @@ retry_mr:
 			goto bail;
 		}
 	}
-	mlog(4, " inline data rcv'ed %d bytes\n", len);
 
 	if ((smd->md->indata) && (len <= mcm_ib_inline))
 		m_wr->wr.send_flags |= IBV_SEND_INLINE;
diff --git a/dapl/svc/mpxy_in.c b/dapl/svc/mpxy_in.c
index adf5021..54cc62a 100755
--- a/dapl/svc/mpxy_in.c
+++ b/dapl/svc/mpxy_in.c
@@ -529,10 +529,10 @@ static int m_pi_send_wc(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx, int status
 		wr.send_flags |= IBV_SEND_INLINE;
 
 	mlog(4, " WC: RW_imm post: wr_id[%d] %Lx sglist %p sge %d op %d flgs %x"
-		" idata %x WR_rem = raddr %p rkey %x ln %d op %x\n",
+		" idata %x WR_rem = raddr %p rkey %x io_ln %d op %x\n",
 		wr_rx->w_idx, wr.wr_id, wr.sg_list, wr.num_sge, wr.opcode,
 		wr.send_flags, ntohl(wr.imm_data), wr.wr.rdma.remote_addr,
-		wr.wr.rdma.rkey, sge.length, wr_rx->wr.opcode);
+		wr.wr.rdma.rkey, wr_rx->sg[0].length, wr_rx->wr.opcode);
 
 	/* MXS -> MSS or HST, PI service will be on QP1 */
 	if (MXF_EP(&m_qp->smd->md->addr) &&
@@ -562,7 +562,7 @@ static int m_pi_send_wc(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx, int status
 }
 
 /* called with rxlock, process all RR's up to signal marker at wr_last */
-static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, struct ibv_wc *wc)
+static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig)
 {
 	mcm_scif_dev_t *smd = m_qp->smd;
 	struct mcm_wr_rx *wr_rx;
@@ -576,9 +576,13 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
 	while (m_qp->pi_rr_cnt) { /* RR's pending */
 		wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));
 
-		if (!(wr_rx->flags & M_READ_POSTED)) {
-			/* reached RR signaled marker, or head pointer */
-			if (wr_idx == wr_sig->w_idx || wr_idx == m_qp->wr_hd_r)
+		if (!(wr_rx->flags & M_READ_DONE)) {
+			/* reached head pointer */
+			if (wr_idx == m_qp->wr_hd_r)
+				break;
+
+			/* maintain order */
+			if (wr_rx->flags & M_READ_POSTED)
 				break;
 
 			wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
@@ -589,9 +593,8 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
 		if (wr_rx == wr_sig)
 			mcm_qp_prof_ts(m_qp, MCM_QP_IB_RR, wr_rx->time, wr_rx->qcnt, wr_cnt);
 #endif
-		mlog(4, " WR_rx[%d-%d] %p m_qp %p wc %p wc->op %x wr_rx->wr.op %x\n",
-			wr_rx->w_idx, wr_sig->w_idx, wr_rx, m_qp, wc,
-			wc->opcode, wr_rx->wr.opcode);
+		mlog(4, " WR_rx[%d-%d] %p m_qp %p wr_rx->wr.op %x\n",
+			wr_rx->w_idx, wr_sig->w_idx, wr_rx, m_qp, wr_rx->wr.opcode);
 
 		m_qp->pi_rr_cnt--; /* rdma read complete */
 		MCNTR(smd->md, MCM_QP_READ_DONE);
@@ -622,11 +625,27 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
 			sg_len = wr_rx->sg[2].length;
 			r_off = m_pi_mr_trans(smd, wr_rx->wr.wr.rdma.remote_addr,
 					      wr_rx->wr.wr.rdma.rkey, sg_len);
-			if (!r_off)
+			if (!r_off && sg_len)
 				goto bail;
 
-			mlog(4, " RDMA_WRITE op: wr_rx[%d] %p -> scif r_off %Lx len %d\n",
-				 wr_rx->w_idx, wr_rx, r_off, sg_len, 0);
+			if (!sg_len) {  /* 0 byte rdma write, no scif */
+				mlog(1, " RDMA_WRITE op: wr_rx[%d] %p,"
+					 " raddr %p rkey %x 0 bytes\n",
+					 wr_rx->w_idx, wr_rx,
+					 wr_rx->wr.wr.rdma.remote_addr,
+					 wr_rx->wr.wr.rdma.rkey);
+
+				m_qp->post_cnt_wt++;
+				wr_rx->flags &= ~M_READ_DONE;
+				wr_rx->flags |= M_READ_WRITE_TO;
+				wr_rx->wr.wr_id = wr_rx->org_id; /* mark done */
+
+				if (wr_idx == m_qp->wr_hd_r)
+					break;
+
+				wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
+				continue;
+			}
 		}
 
 		/* sg[0] entry == proxy-out buffer, src for IB RR */
@@ -650,8 +669,8 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
 			wr_rx->time = mcm_ts_us();
 			wr_rx->qcnt = m_qp->post_cnt_wt;
 #endif
-			if (w_len < 256)
-				wt_flag = SCIF_RMA_USECPU;
+			if (wr_rx->flags & M_SEND_LS)
+				wt_flag |= SCIF_RMA_ORDERED;
 
 			ret = scif_writeto(smd->scif_tx_ep, l_off, w_len, r_off, wt_flag);
 
@@ -688,13 +707,12 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
 			goto bail;
 		}
 		MCNTR(smd->md, MCM_SCIF_SIGNAL);
-		wr_rx->flags &= ~M_READ_POSTED; /* reset READ_POSTED */
-		wr_rx->flags |= M_READ_DONE;
+		wr_rx->flags &= ~M_READ_DONE;
 		wr_rx->flags |= M_READ_WRITE_TO;
 		m_qp->post_cnt_wt++;
 
-		/* reached RR signaled marker, or head */
-		if (wr_idx == wr_sig->w_idx || wr_idx == m_qp->wr_hd_r)
+		/* reached head */
+		if (wr_idx == m_qp->wr_hd_r)
 			break;
 
 		wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
@@ -702,10 +720,13 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
 	return;
 bail:
 	/* report error via WC back to proxy-out */
-	mlog(0, " ERR: writeto: wr_rx[%d] %p -> raddr %Lx rkey %x (scif r_off %Lx) len %d\n",
+	mlog(0, " ERR: writeto: wr_rx[%d] %p -> IB raddr %Lx rkey %x"
+		" SCIF r_off %Lx, len %d wr_flags %x wt_pend %d\n",
 		wr_rx->w_idx, wr_rx, wr_rx->wr.wr.rdma.remote_addr,
-		wr_rx->wr.wr.rdma.rkey, r_off, sg_len);
+		wr_rx->wr.wr.rdma.rkey, r_off, sg_len, wr_rx->flags,
+		m_qp->post_cnt_wt);
 
+	m_pi_send_wc(m_qp, wr_rx, IBV_WC_REM_ACCESS_ERR);
 	return;
 }
 
@@ -750,7 +771,9 @@ void m_pi_req_event(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx, struct ibv_wc
 	/* RR complete, ready for SCIF_writeto to complete RW or SR */
 	if (type == WRID_RX_RR) {
 		mpxy_lock(&m_qp->rxlock);
-		m_pi_post_writeto(m_qp, wr_rx, wc);
+		wr_rx->flags &= ~M_READ_POSTED; /* reset READ_POSTED */
+		wr_rx->flags |= M_READ_DONE;
+		m_pi_post_writeto(m_qp, wr_rx);
 		mpxy_unlock(&m_qp->rxlock);
 		write(m_qp->smd->md->mc->rx_pipe[1], "w", sizeof "w"); /* signal rx_thread */
 		sched_yield();
@@ -923,25 +946,35 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx)
 	wr_rx->time = mcm_ts_us();
 	wr_rx->qcnt = m_qp->pi_rr_cnt;
 #endif
-	wr_rx->flags |= M_READ_POSTED;
-	errno = 0;
-	ret = ibv_post_send(ib_qp, &ib_wr, &bad_wr);
-	if (ret)
-		goto bail;
 
-	m_qp->pi_rr_cnt++;
-	m_qp->post_cnt_rr++;
-	MCNTR(smd->md, MCM_QP_READ);
+	if (wr_rx->flags & M_PROXY_INLINE) {
+		mlog(0x10, "wr_rx flag PROXY_INLINE is set. data len %d\n", wr_rx->sg[0].length);
+		memcpy((void *)rbuf, wr_rx->inline_data, wr_rx->sg[0].length);
+		m_qp->pi_rr_cnt++;
+		m_qp->post_cnt_rr++;
+		wr_rx->flags |= M_READ_DONE;
+		m_pi_post_writeto(m_qp, wr_rx);
+	}
+	else {
+		errno = 0;
+		wr_rx->flags |= M_READ_POSTED;
+		ret = ibv_post_send(ib_qp, &ib_wr, &bad_wr);
+		if (ret)
+			goto bail;
 
-	mlog(0x10, "[%d:%d:%d] WR[%d] %p RR(%d,%d,%d): wr_id %Lx qn %x flgs %x,%x ln %d "
-		   "r_addr,key %Lx %x to l_addr,key %Lx %x tl %d hd %d, m_idx %x\n",
-		smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid,
-		wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr,
-		m_qp->pi_rr_cnt, ib_wr.wr_id, ib_qp->qp_num, ib_wr.send_flags,
-		wr_rx->flags, l_len, ib_wr.wr.rdma.remote_addr,
-		ib_wr.wr.rdma.rkey, ib_wr.sg_list->addr, ib_wr.sg_list->lkey,
-		m_qp->wr_tl_r, m_qp->wr_hd_r, wr_rx->m_idx);
+		m_qp->pi_rr_cnt++;
+		m_qp->post_cnt_rr++;
+		MCNTR(smd->md, MCM_QP_READ);
 
+		mlog(0x10, "[%d:%d:%d] WR[%d] %p RR(%d,%d,%d): wr_id %Lx qn %x flgs %x,%x ln %d "
+			   "r_addr,key %Lx %x to l_addr,key %Lx %x tl %d hd %d, m_idx %x\n",
+			smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid,
+			wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr,
+			m_qp->pi_rr_cnt, ib_wr.wr_id, ib_qp->qp_num, ib_wr.send_flags,
+			wr_rx->flags, l_len, ib_wr.wr.rdma.remote_addr,
+			ib_wr.wr.rdma.rkey, ib_wr.sg_list->addr, ib_wr.sg_list->lkey,
+			m_qp->wr_tl_r, m_qp->wr_hd_r, wr_rx->m_idx);
+	}
 	write(smd->md->mc->tx_pipe[1], "w", sizeof "w");
 	return;
 bail:
@@ -969,6 +1002,8 @@ buf_err:
 		ib_wr.send_flags, l_len, ib_wr.wr.rdma.remote_addr,
 		ib_wr.wr.rdma.rkey, ib_wr.sg_list->addr, ib_wr.sg_list->lkey,
 		m_qp->wr_tl_r, m_qp->wr_tl_r_wt, m_qp->wr_hd_r);
+
+	m_pi_send_wc(m_qp, wr_rx, IBV_WC_REM_ACCESS_ERR); /* report error */
 }
 
 void m_pi_rcv_event(struct mcm_qp *m_qp, wrc_idata_t *wrc)
diff --git a/dapl/svc/mpxy_out.c b/dapl/svc/mpxy_out.c
index eff81fc..d015dc3 100755
--- a/dapl/svc/mpxy_out.c
+++ b/dapl/svc/mpxy_out.c
@@ -274,9 +274,15 @@ static int m_po_send_pi(struct mcm_qp *m_qp, struct mcm_wr *m_wr, int wr_idx)
 		sge.lkey = m_qp->wr_buf_rx_mr->lkey;
 	}
 	sge.addr = (uint64_t)(uintptr_t) wr_rx_ptr;
-	sge.length = (uint32_t) sizeof(struct mcm_wr_rx);
+	sge.length = (uint32_t) m_qp->wrc_rem.wr_sz;
 
 	/* proxy m_wr over to remote m_wr_rem slot, remote will initiate RR and send back WC */
+	if (m_qp->p2p_data && m_wr->sg[0].length < m_qp->p2p_data) {
+		mlog(0x4, " Sending the proxy data ( len %d ) inside the WR.\n", m_wr->sg[0].length);
+		m_wr->flags |= M_PROXY_INLINE;
+		memcpy (wr_rx_ptr->inline_data, (void *)m_wr->sg[0].addr, m_wr->sg[0].length);
+	}
+
 	m_wr->flags |= M_SEND_PI;
 	mcm_hton_wr_rx(wr_rx_ptr, m_wr, m_qp->wc_tl); /* build rx_wr for wire transfer, send it */
 
@@ -336,10 +342,11 @@ static int m_po_send_pi(struct mcm_qp *m_qp, struct mcm_wr *m_wr, int wr_idx)
 	ret = ibv_post_send(ib_qp, &wr, &bad_wr);
 	if (ret) {
 		mlog(0, " ERR: m_wr %p idx %d laddr=%p ln=%d lkey=%x flgs %x"
-			" tl %d hd %d pp %d sig %d\n",
+			" tl %d hd %d pp %d sig %d ret %d %s\n",
 			m_wr, wr_idx, sge.addr, sge.length, sge.lkey,
 			m_wr->flags, m_qp->wr_tl, m_qp->wr_hd,
-			m_qp->wr_pp_rem, m_qp->post_sig_cnt);
+			m_qp->wr_pp_rem, m_qp->post_sig_cnt,
+			ret, strerror(errno));
 		mlog(0, " ERR: wr_id %Lx %p sglist %p sge %d op %d flgs %x"
 			" idata 0x%x raddr %p rkey %x \n",
 			m_wr->wr.wr_id, m_wr->wr.sg_list,
@@ -1086,7 +1093,7 @@ retry:
 		/* Proxy_out ->  */
 		m_wr = (struct mcm_wr *)WRID_ADDR(wc[i].wr_id);
 		m_qp = (struct mcm_qp *)m_wr->context;
-		if (!MXF_EP(&m_qp->cm->msg.daddr1))
+		if (m_qp->cm && !MXF_EP(&m_qp->cm->msg.daddr1))
 			m_qp->comp_cnt++;
 		MCNTR(m_qp->smd->md, MCM_QP_WRITE_DONE);
 
diff --git a/dapl/svc/mpxyd.c b/dapl/svc/mpxyd.c
index b04d823..922eeae 100755
--- a/dapl/svc/mpxyd.c
+++ b/dapl/svc/mpxyd.c
@@ -146,8 +146,8 @@ static int init_scif()
 		scif_close(scif_listen_ep);
 		return -1;
 	}
-	mlog(1," MPXYD: Listening on reserved SCIF OFED port %d, listen_EP %d, backlog %d\n",
-		(uint16_t)scif_id.port, scif_sport, scif_listen_qlen);
+	mlog(1," MPXYD: Listening on reserved SCIF OFED port %d, backlog %d\n",
+		(uint16_t)scif_id.port, scif_listen_qlen);
 
 	return 0;
 }
@@ -801,7 +801,7 @@ found:
 	}
 err:
 	if (!smd) {
-		mlog(0, " ERR: mix_open_device failed for %s - %d\n", msg->name, msg->port);
+		mlog(1, " WARN: open failed for %s - %d\n", msg->name, msg->port);
 		msg->hdr.status = MIX_ENODEV;
 	}
 
@@ -1345,8 +1345,8 @@ int main(int argc, char **argv)
 	logfile = mpxy_open_log();
 	mpxy_log_options();
 
-	mlog(0, "CCL Proxy - SCIF/IB DAPL RDMA Proxy Service, Mix Version %d (Build-%u) v2\n",
-		DAT_MIX_VER, PACKAGE_DATE);
+	mlog(0, "CCL Proxy - SCIF/IB DAPL RDMA Proxy Service %s (%u)\n",
+		 PACKAGE_VERSION, PACKAGE_DATE);
 
 	if (init_scif()) {
 		mlog(0, "ERROR - unable to open/init SCIF device\n");
diff --git a/dapl/svc/mpxyd.h b/dapl/svc/mpxyd.h
index e444f5f..c733157 100755
--- a/dapl/svc/mpxyd.h
+++ b/dapl/svc/mpxyd.h
@@ -58,7 +58,7 @@
 #define min(a, b) ((a < b) ? (a) : (b))
 #define max(a, b) ((a > b) ? (a) : (b))
 
-#define MCM_IB_INLINE		160
+#define MCM_IB_INLINE		(sizeof(mcm_wr_rx_t))
 #define MIX_MAX_MSG_SIZE	(8*1024*1024)
 
 #define MIX_MIN  4 		/* oldest version supported */
@@ -211,6 +211,7 @@ typedef struct mcm_qp {
 	int			comp_cnt;
 	char			*wr_buf_rx;	/* mcm_wr_rx_t entries, for devices without inline data  */
 	struct ibv_mr		*wr_buf_rx_mr;
+	int			p2p_data; 	/* Max number of bytes to pass from proxy to proxy in the WR */
 	/* Proxy-in: WR management, remote view from TX side */
 	mcm_wrc_info_t		wrc_rem;	/* WR and WC buffers: remote, in CM req and reply */
 	int			wr_pp_rem;	/* work request pending */
diff --git a/dapl/svc/util.c b/dapl/svc/util.c
index 8b5db68..010678c 100755
--- a/dapl/svc/util.c
+++ b/dapl/svc/util.c
@@ -57,6 +57,8 @@ extern int mcm_rep_ms;
 extern int mcm_rtu_ms;
 extern int mcm_dreq_ms;
 extern int mcm_proxy_in;
+extern int mcm_mic0_mss;
+extern int mcm_mic1_mss;
 
 /* mix.c */
 extern int mix_align;
@@ -451,6 +453,10 @@ void mpxy_set_options( int debug_mode )
 			while (mcm_rx_entries < rsize)
 				mcm_rx_entries <<= 1;
 		}
+		else if (!strcasecmp("mcm_mic0_mss", opt))
+			mcm_mic0_mss = atoi(value);
+		else if (!strcasecmp("mcm_mic1_mss", opt))
+			mcm_mic1_mss = atoi(value);
 	}
 
 	fclose(f);
diff --git a/dat/common/dat_api.c b/dat/common/dat_api.c
index 0c28c11..4804a6b 100755
--- a/dat/common/dat_api.c
+++ b/dat/common/dat_api.c
@@ -1130,6 +1130,9 @@ DAT_RETURN DAT_API dat_extension_op(IN DAT_HANDLE handle,
 				 " call udat_ext_close: handle %p\n", handle);
 		status = udat_extension_close(handle, ext_op, args);
 	} else {
+		if (dapl_handle == NULL)
+			return DAT_ERROR(DAT_INVALID_HANDLE, DAT_INVALID_HANDLE1);
+
 		dat_os_dbg_print(DAT_OS_DBG_TYPE_CONSUMER_API,
 				 " call dat_ext_op: handle %p\n", dapl_handle);
 
diff --git a/test/dapltest/cmd/dapl_params.c b/test/dapltest/cmd/dapl_params.c
index f038324..fc4697b 100755
--- a/test/dapltest/cmd/dapl_params.c
+++ b/test/dapltest/cmd/dapl_params.c
@@ -72,6 +72,7 @@ bool DT_Params_Parse(int argc, char *argv[], Params_t * params_ptr)
 			DT_Mdep_printf("can't get default device name\n");
 			return false;
 		}
+		params_ptr->server_port = Server_Cmd->port;
 		return true;
 	}
 	/* check for a script file */
diff --git a/test/dtest/dtest.c b/test/dtest/dtest.c
index c0c82ee..6894a2c 100755
--- a/test/dtest/dtest.c
+++ b/test/dtest/dtest.c
@@ -197,15 +197,19 @@ struct dt_time {
 struct dt_time ts;
 
 /* defaults */
+static int all_data = 0;
+static int increment = 0;
 static int failed = 0;
 static int uni_direction = 0;
 static int align_data=1;
 static int rdma_read = 0;
 static int write_only = 0;
+static int write_only_pp = 0;
 static int write_immed = 0;
 static int performance_times = 0;
 static int connected = 0;
-static int burst = 100;
+static int burst = 1000;
+static int msg_burst = 100;
 static int signal_rate = 10;
 static int server = 1;
 static int verbose = 0;
@@ -216,6 +220,7 @@ static int conn_poll_count = 0;
 static int rdma_rd_poll_count[MAX_RDMA_RD] = { 0 };
 static int delay = 0;
 static int buf_len = RDMA_BUFFER_SIZE;
+static int buf_len_p2;
 static int use_cno = 0;
 static int recv_msg_index = 0;
 static int burst_msg_posted = 0;
@@ -246,6 +251,7 @@ DAT_RETURN create_events(void);
 DAT_RETURN destroy_events(void);
 DAT_RETURN do_rdma_write_imm_with_msg(void);
 DAT_RETURN do_rdma_write_with_msg(void);
+DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes);
 DAT_RETURN do_rdma_read_with_msg(void);
 DAT_RETURN do_ping_pong_msg(void);
 
@@ -260,25 +266,25 @@ void flush_evds(void)
 	DAT_EVENT event;
 
 	/* Flush async error queue */
-	printf("%d: Checking ASYNC EVD...\n", getpid());
+	LOGPRINTF("%d: Checking ASYNC EVD...\n", getpid());
 	while (dat_evd_dequeue(h_async_evd, &event) == DAT_SUCCESS) {
-		printf("%d ERR: ASYNC EVD ENTRY: handle=%p reason=%d\n", getpid(),
+		LOGPRINTF("%d ERR: ASYNC EVD ENTRY: handle=%p reason=%d\n", getpid(),
 			event.event_data.asynch_error_event_data.dat_handle,
 			event.event_data.asynch_error_event_data.reason);
 	}
 	/* Flush receive queue */
-	printf("%d: Checking RECEIVE EVD...\n", getpid());
+	LOGPRINTF("%d: Checking RECEIVE EVD...\n", getpid());
 	while (dat_evd_dequeue(h_dto_rcv_evd, &event) == DAT_SUCCESS) {
-		printf(" RCV EVD ENTRY: op=%d stat=%d ln=%d ck="F64x"\n",
+		LOGPRINTF(" RCV EVD ENTRY: op=%d stat=%d ln=%d ck="F64x"\n",
 			event.event_data.dto_completion_event_data.operation,
 			event.event_data.dto_completion_event_data.status,
 			event.event_data.dto_completion_event_data.transfered_length,
 			event.event_data.dto_completion_event_data.user_cookie.as_64);
 	}
 	/* Flush request queue */
-	printf("%d: Checking REQUEST EVD...\n", getpid());
+	LOGPRINTF("%d: Checking REQUEST EVD...\n", getpid());
 	while (dat_evd_dequeue(h_dto_req_evd, &event) == DAT_SUCCESS) {
-		printf(" REQ EVD ENTRY: op=%d stat=%d ln=%d ck="F64x"\n",
+		LOGPRINTF(" REQ EVD ENTRY: op=%d stat=%d ln=%d ck="F64x"\n",
 			event.event_data.dto_completion_event_data.operation,
 			event.event_data.dto_completion_event_data.status,
 			event.event_data.dto_completion_event_data.transfered_length,
@@ -560,10 +566,12 @@ int main(int argc, char **argv)
 	DAT_PROVIDER_ATTR pr_attr;
 
 	/* parse arguments */
-	while ((c = getopt(argc, argv, "auwtscvpb:d:B:h:P:S:")) != -1) {
+	while ((c = getopt(argc, argv, "auwWtscvpb:d:B:h:P:S:i:")) != -1) {
 		switch (c) {
+		case 'i':
+			increment = atoi(optarg);
 		case 'a':
-			align_data = 1;
+			all_data = 1;
 			fflush(stdout);
 			break;
 		case 'u':
@@ -574,6 +582,13 @@ int main(int argc, char **argv)
 			write_only = 1;
 			fflush(stdout);
 			break;
+		case 'W':
+			write_only_pp = 1;
+			uni_direction = 1;
+			signal_rate = 1;
+			burst = 1000;
+			fflush(stdout);
+			break;
 		case 't':
 			performance_times = 1;
 			fflush(stdout);
@@ -640,8 +655,13 @@ int main(int argc, char **argv)
 	if (signal_rate > burst)
 		signal_rate = burst;
 
-	rq_cnt = MSG_BUF_COUNT + (burst);
-	sq_cnt = MSG_BUF_COUNT + MAX_RDMA_RD + signal_rate;
+	if (write_only || write_only_pp) {
+		rq_cnt = MSG_BUF_COUNT * 2;
+		sq_cnt = MSG_BUF_COUNT + MAX_RDMA_RD + signal_rate;
+	} else {
+		rq_cnt = MSG_BUF_COUNT + msg_burst;
+		sq_cnt = MSG_BUF_COUNT + MAX_RDMA_RD + msg_burst;
+	}
 
 	if (!server) {
 		printf("%d Running as client - waiting for server input\n",
@@ -658,23 +678,39 @@ int main(int argc, char **argv)
 	}
 	fflush(stdout);
 
+	if (write_only_pp) {
+		/* rdma write pingpong, default == 1 byte */
+		if (!all_data) {
+			buf_len = 1;
+		} else if (!increment) { /* power of 2 */
+			buf_len_p2 = 1;
+			i = 0;
+			while (buf_len_p2 < buf_len) {
+				buf_len_p2 <<= 1;
+				i++;
+			}
+			buf_len_p2 = i;
+		}
+	}
+
 	if (align_data) {
 		/* allocate send and receive buffers */
-		if (posix_memalign((void**)&rbuf, 4096, max(64, buf_len * (burst+1))) ||
-		    posix_memalign((void**)&sbuf, 4096, max(64, buf_len * (burst+1)))) {
+		if (posix_memalign((void**)&rbuf, 4096, max(4096, buf_len * rq_cnt)) ||
+		    posix_memalign((void**)&sbuf, 4096, max(4096, buf_len * rq_cnt))) {
 			perror("malloc");
 			exit(1);
 		}
 	} else {
 		/* allocate send and receive buffers */
-		if (((rbuf = malloc(max(64, buf_len * (burst+1)))) == NULL) ||
-		    ((sbuf = malloc(max(64, buf_len * (burst+1)))) == NULL)) {
+		if (((rbuf = malloc(max(64, buf_len * rq_cnt))) == NULL) ||
+		    ((sbuf = malloc(max(64, buf_len * rq_cnt))) == NULL)) {
 			perror("malloc");
 			exit(1);
 		}
 	}
-	LOGPRINTF("%d Allocated RDMA buffers (r:%p,s:%p) len %d \n",
-		  getpid(), rbuf, sbuf, buf_len);
+	init_data();
+	LOGPRINTF("%d Allocated RDMA buffers (r:%p=%d,s:%p=%d) len %d \n",
+		  getpid(), rbuf, *rbuf, sbuf, *sbuf, buf_len);
 
 	if (posix_memalign((void**)&p_rmr_rcv, 4096, 4096) ||
 	    posix_memalign((void**)&p_rmr_snd, 4096, 4096)) {
@@ -690,6 +726,7 @@ int main(int argc, char **argv)
 	ret = dat_ia_open(provider, 8, &h_async_evd, &h_ia);
 	stop = get_time();
 	ts.open += ((stop - start) * 1.0e6);
+	ts.total += ts.open;
 	if (ret != DAT_SUCCESS) {
 		fprintf(stderr, "%d: Error Adaptor open: %s\n",
 			getpid(), DT_RetToStr(ret));
@@ -730,6 +767,7 @@ int main(int argc, char **argv)
 	ret = dat_pz_create(h_ia, &h_pz);
 	stop = get_time();
 	ts.pzc += ((stop - start) * 1.0e6);
+	ts.total += ts.pzc;
 	if (ret != DAT_SUCCESS) {
 		fprintf(stderr, "%d Error creating Protection Zone: %s\n",
 			getpid(), DT_RetToStr(ret));
@@ -763,8 +801,8 @@ int main(int argc, char **argv)
 	ep_attr.max_rdma_size = 0x10000;
 	ep_attr.qos = 0;
 	ep_attr.recv_completion_flags = 0;
-	ep_attr.max_recv_dtos = MSG_BUF_COUNT + (burst * 3);
-	ep_attr.max_request_dtos = MSG_BUF_COUNT + (burst * 3) + MAX_RDMA_RD;
+	ep_attr.max_recv_dtos = rq_cnt;
+	ep_attr.max_request_dtos = sq_cnt;
 	ep_attr.max_recv_iov = MSG_IOV_COUNT;
 	ep_attr.max_request_iov = MSG_IOV_COUNT;
 	ep_attr.max_rdma_read_in = MAX_RDMA_RD;
@@ -844,10 +882,36 @@ int main(int argc, char **argv)
 #endif
 
 	/*********** RDMA write data *************/
-	if ((write_immed) && (write_only))
+	if (write_only_pp) {
+		int max, inc;
+
+		if (all_data) {
+			if (increment) {
+				i = 1;
+				inc = increment;
+				max = buf_len/inc;
+			} else {
+				i = 0;
+				inc = 0;
+				max = buf_len_p2;
+			}
+		} else {
+			i = buf_len;
+			max = buf_len;
+			inc = buf_len;
+		}
+		printf("\n %d RDMA WRITE PINGPONG\n\n", getpid());
+		for (; i <= max; i++) {
+			if (do_rdma_write_ping_pong(i, i*inc))
+				break;
+		}
+	}
+	else if (write_immed && write_only) {
 		ret = do_rdma_write_imm_with_msg();
-	else
+	}
+	else {
 		ret = do_rdma_write_with_msg();
+	}
 
 	if (ret != DAT_SUCCESS) {
 		fprintf(stderr, "%d Error do_rdma_write_%swith_msg: %s\n",
@@ -855,10 +919,9 @@ int main(int argc, char **argv)
 			DT_RetToStr(ret));
 		goto cleanup;
 	} else
-		LOGPRINTF("%d do_rdma_write_%swith_msg complete\n",
-			  getpid(), write_immed && write_only ? "imm_":"");
+		LOGPRINTF("%d rdma_write test complete\n", getpid());
 
-	if (write_only || !rdma_read)
+	if (write_only_pp || write_only || !rdma_read)
 		goto complete;
 
 	/*********** RDMA read data *************/
@@ -932,6 +995,7 @@ complete:
 	ret = dat_pz_free(h_pz);
 	stop = get_time();
 	ts.pzf += ((stop - start) * 1.0e6);
+	ts.total += ts.pzf;
 	if (ret != DAT_SUCCESS) {
 		fprintf(stderr, "%d Error freeing PZ: %s\n",
 			getpid(), DT_RetToStr(ret));
@@ -946,6 +1010,7 @@ complete:
 	ret = dat_ia_close(h_ia, DAT_CLOSE_ABRUPT_FLAG);
 	stop = get_time();
 	ts.close += ((stop - start) * 1.0e6);
+	ts.total += ts.close;
 	if (ret != DAT_SUCCESS) {
 		fprintf(stderr, "%d: Error Adaptor close: %s\n",
 			getpid(), DT_RetToStr(ret));
@@ -956,9 +1021,16 @@ complete:
 	free(rbuf);
 	free(sbuf);
 
-	if (ts.rtt)
-		printf("%d: Message RTT: Total=%6.2lf usec, %d bursts, itime=%6.2lf usec, pc=%d\n",
-			getpid(), ts.rtt, burst, ts.rtt / burst, poll_count);
+	if (!all_data) {
+		printf( "%d: %s PingPong: (%d x %d) Total %6.2lf us:"
+			" latency %3.2lf us, BW %4.2lf MB/s\n",
+			getpid(), write_only_pp ? "RDMA write":"Message",
+			write_only_pp ? burst : msg_burst, buf_len, ts.rtt,
+			write_only_pp ? ts.rtt/burst/2:ts.rtt/msg_burst/2,
+			write_only_pp ? (double)(1/(ts.rtt/burst/2/buf_len)):
+					(double)(1/(ts.rtt/msg_burst/2/buf_len)));
+	}
+
 	if (ts.rdma_wr && (!server || (server && !uni_direction))) {
 		int msgs = uni_direction ? burst : burst * 2;
 
@@ -1920,6 +1992,101 @@ acked:
 	return (DAT_SUCCESS);
 }
 
+/* RDMA write ping-pong, always uni-direction: client writes first, server waits first; loop time is stored in ts.rtt */
+DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes) /* returns DAT_SUCCESS, or DAT_ABORT on post/completion/data error */
+{
+	DAT_EVENT event;
+	DAT_LMR_TRIPLET l_iov[MSG_IOV_COUNT];
+	DAT_RMR_TRIPLET r_iov;
+	DAT_DTO_COOKIE cookie;
+	DAT_RETURN ret;
+	int i, len, suppress = DAT_COMPLETION_SUPPRESS_FLAG;
+	DAT_DTO_COMPLETION_EVENT_DATA *dto_event =
+		&event.event_data.dto_completion_event_data;
+	volatile char *tx_buf, *rx_buf; /* volatile: *rx_buf is polled in a spin loop below */
+	uint32_t rx_cnt = 0; /* count expected next in *rx_buf */
+	uint32_t tx_cnt = 0; /* number of writes posted so far */
+
+	len = bytes ? bytes : 1 << p2; /* transfer size: bytes if non-zero, else 2^p2 */
+
+	tx_buf = (char*)&sbuf[len-1]; /* last byte of the len-sized send region holds the count */
+	rx_buf = (char*)&rbuf[len-1]; /* same offset in the receive buffer is polled */
+
+	/* RMR information from previously received message */
+	r_iov = p_rmr_rcv[recv_msg_index - 1];
+
+	for (i = 0; i < MSG_IOV_COUNT; i++) {
+		l_iov[i].lmr_context = lmr_context_send;
+		l_iov[i].segment_length = len / MSG_IOV_COUNT; /* NOTE(review): integer division; a remainder of len would not be covered by the iovs - confirm MSG_IOV_COUNT */
+		l_iov[i].virtual_address = (DAT_VADDR) (uintptr_t)
+					   (&sbuf[l_iov[i].segment_length*i]);
+		LOGPRINTF("%d rdma_write iov[%d] buf=%p,len=%d\n",
+			  getpid(), i,
+			  &sbuf[l_iov[i].segment_length * i],
+			  l_iov[i].segment_length);
+	}
+	start = get_time();
+	for (i = 0; i <= burst; i++) { /* burst+1 passes: the extra pass lets the side that wrote first wait for the final reply */
+		if (rx_cnt < burst && !(!server && !tx_cnt)) { /* client skips the wait before its first write */
+			rx_cnt++;
+			while (*rx_buf != (char)rx_cnt); /* busy-wait until the polled byte equals the low 8 bits of rx_cnt */
+		}
+
+		if (!((i+1) % signal_rate)) /* request a completion on every signal_rate-th pass */
+			suppress =  DAT_COMPLETION_DEFAULT_FLAG;
+		else
+			suppress = DAT_COMPLETION_SUPPRESS_FLAG;
+
+		if (tx_cnt == burst) /* all writes already posted */
+			break;
+
+		*tx_buf = (char)++tx_cnt; /* stamp the new count at sbuf[len-1] before posting */
+		cookie.as_64 = tx_cnt;
+		ret = dat_ep_post_rdma_write(h_ep, MSG_IOV_COUNT,
+					     l_iov, cookie, &r_iov,
+					     suppress);
+		if (ret) {
+			fprintf(stderr, "%d: ERROR: dat_rdma_write() %s\n",
+					getpid(), DT_RetToStr(ret));
+			return (DAT_ABORT);
+		}
+		if (!suppress) { /* NOTE(review): relies on DAT_COMPLETION_DEFAULT_FLAG being 0 - confirm */
+			while (dat_evd_dequeue(h_dto_req_evd, &event)); /* poll the request EVD until dequeue returns 0 */
+			if (dto_event->status) {
+				fprintf(stderr,
+					"ERROR rdma_write: status=0x%x ck="
+					" "F64x " exp 0x%x\n",
+					dto_event->status,
+					dto_event->user_cookie.as_64, tx_cnt);
+				return (DAT_ABORT);
+			}
+		}
+		LOGPRINTF("%d %s RW pingpong: %p, *rbuf %d rcnt %d\n",
+			  getpid(), server ? "SERVER:" : "CLIENT:",
+			  rx_buf, (unsigned char)*rx_buf,
+			  (unsigned char)rx_cnt);
+	}
+	stop = get_time();
+	ts.rtt = ((stop - start) * 1.0e6); /* overwritten on every call; presumably microseconds (get_time() in seconds) - confirm */
+
+	if ((unsigned char)*rx_buf != (unsigned char)rx_cnt) { /* compares low bytes only, so counts wrap at 256 */
+		printf( "%d %s RW pingpong: %p, last *buf %d != cnt %d\n",
+			getpid(), server ? "SERVER:" : "CLIENT:",
+			rx_buf, (unsigned char)*rx_buf,
+			(unsigned char)rx_cnt);
+		return (DAT_ABORT);
+	}
+
+	if (all_data) { /* per-size report, printed only when all_data is set */
+		printf( "%d: RDMA write PingPong: (%d x %d) Total %6.2lf us:"
+			" latency %3.2lf us, BW %4.2lf MB/s\n",
+			getpid(), burst, len, ts.rtt, ts.rtt/burst/2,
+			(double)(1/(ts.rtt/burst/2/len)));
+	}
+
+	return (DAT_SUCCESS);
+}
+
 DAT_RETURN do_rdma_read_with_msg(void)
 {
 	DAT_EVENT event;
@@ -2106,15 +2273,15 @@ DAT_RETURN do_ping_pong_msg()
 	rcv_buf = rbuf;
 
 	/* pre-post all buffers */
-	for (i = 0; i < burst; i++) {
+	for (i = 0; i < msg_burst; i++) {
 		burst_msg_posted++;
 		cookie.as_64 = i;
 		l_iov.lmr_context = lmr_context_recv;
 		l_iov.virtual_address = (DAT_VADDR) (uintptr_t) rcv_buf;
 		l_iov.segment_length = buf_len;
 
-		LOGPRINTF("%d Pre-posting Receive Message Buffers %p\n",
-			  getpid(), rcv_buf);
+		LOGPRINTF("%d Pre-posting Receive Message Buffer[%d] %p\n",
+			  getpid(), i, rcv_buf);
 
 		ret = dat_ep_post_recv(h_ep,
 				       1,
@@ -2146,14 +2313,14 @@ DAT_RETURN do_ping_pong_msg()
 
 	/* client ping 0x55, server pong 0xAA in first byte */
 	start = get_time();
-	for (i = 0; i < burst; i++) {
+	for (i = 0; i < msg_burst; i++) {
 		/* walk the send and recv buffers */
 		if (!server) {
 			*snd_buf = 0x55;
 
 			LOGPRINTF("%d %s SND buffer %p contains: 0x%x len=%d\n",
 				  getpid(), server ? "SERVER:" : "CLIENT:",
-				  snd_buf, *snd_buf, buf_len);
+				  snd_buf, *(unsigned char *)snd_buf, buf_len);
 
 			ret = send_msg(snd_buf,
 				       buf_len,
@@ -2170,6 +2337,7 @@ DAT_RETURN do_ping_pong_msg()
 		}
 
 		/* recv message, send completions suppressed */
+		event.event_number = 0;
 		if (collect_event(h_dto_rcv_evd, 
 				  &event, 
 				  DTO_TIMEOUT, 
@@ -2184,8 +2352,9 @@ DAT_RETURN do_ping_pong_msg()
 		/* validate event number and status */
 		LOGPRINTF("%d inbound message; message arrived!\n", getpid());
 		if (event.event_number != DAT_DTO_COMPLETION_EVENT) {
-			fprintf(stderr, "%d Error unexpected DTO event : %s\n",
-				getpid(), DT_EventToStr(event.event_number));
+			fprintf(stderr, "%d Error DTO event (0x%x): %s\n",
+				getpid(), event.event_number,
+				DT_EventToStr(event.event_number));
 			return (DAT_ABORT);
 		}
 		if ((event.event_data.dto_completion_event_data.
@@ -2204,9 +2373,9 @@ DAT_RETURN do_ping_pong_msg()
 			return (DAT_ABORT);
 		}
 
-		LOGPRINTF("%d %s RCV buffer %p contains: 0x%x len=%d\n",
+		LOGPRINTF("%d %s RCV buffer[%d] %p contains: 0x%x len=%d\n",
 			  getpid(), server ? "SERVER:" : "CLIENT:",
-			  rcv_buf, *rcv_buf, buf_len);
+			  i, rcv_buf, *(unsigned char *)rcv_buf, buf_len);
 
 		burst_msg_index++;
 
@@ -2214,9 +2383,9 @@ DAT_RETURN do_ping_pong_msg()
 		if (server) {
 			*snd_buf = 0xaa;
 
-			LOGPRINTF("%d %s SND buffer %p contains: 0x%x len=%d\n",
+			LOGPRINTF("%d %s SND buffer[%d] %p contains: 0x%x len=%d\n",
 				  getpid(), server ? "SERVER:" : "CLIENT:",
-				  snd_buf, *snd_buf, buf_len);
+				  i, snd_buf, *(unsigned char *)snd_buf, buf_len);
 
 			ret = send_msg(snd_buf,
 				       buf_len,
@@ -2253,7 +2422,7 @@ DAT_RETURN register_rdma_memory(void)
 	ret = dat_lmr_create(h_ia,
 			     DAT_MEM_TYPE_VIRTUAL,
 			     region,
-			     buf_len * (burst+1),
+			     buf_len * rq_cnt,
 			     h_pz,
 			     DAT_MEM_PRIV_ALL_FLAG,
 			     DAT_VA_TYPE_VA,
@@ -2280,7 +2449,7 @@ DAT_RETURN register_rdma_memory(void)
 	ret = dat_lmr_create(h_ia,
 			     DAT_MEM_TYPE_VIRTUAL,
 			     region,
-			     buf_len * (burst + 1),
+			     buf_len * rq_cnt,
 			     h_pz,
 			     DAT_MEM_PRIV_ALL_FLAG,
 			     DAT_VA_TYPE_VA,
@@ -2602,13 +2771,16 @@ void print_usage(void)
 	printf("\n DAPL USAGE \n\n");
 	printf("s: server\n");
 	printf("u: unidirectional bandwidth (default=bidirectional\n");
-	printf("w: rdma write only\n");
+	printf("w: rdma write only, streaming\n");
+	printf("W: rdma write only, ping pong\n");
 	printf("t: performance times\n");
 	printf("c: use cno\n");
+	printf("a: all data sizes with rdma write pingpong \n");
+	printf("i: increment size for all data size option\n");
 	printf("v: verbose\n");
 	printf("p: polling\n");
 	printf("d: delay before accept\n");
-	printf("b: buf length to allocate\n");
+	printf("b: buf length, upper bound for -W -a -i (WR_pp, all sizes, increment)\n");
 	printf("B: burst count, rdma and msgs \n");
 	printf("h: hostname/address of server, specified on client\n");
 	printf("P: provider name (default = ofa-v2-mlx4_0-1u)\n");

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ofed/dapl.git



More information about the Pkg-ofed-commits mailing list