[Pkg-ofed-commits] [dapl] 05/06: Imported Upstream version 2.1.8

Ana Beatriz Guerrero López ana at moszumanska.debian.org
Fri Mar 25 17:45:17 UTC 2016


This is an automated email from the git hooks/post-receive script.

ana pushed a commit to branch master
in repository dapl.

commit 3961331ed4db369eaf21cea5612b78470d26a5f8
Author: Ana Beatriz Guerrero Lopez <ana at debian.org>
Date:   Fri Mar 25 18:44:40 2016 +0100

    Imported Upstream version 2.1.8
---
 ChangeLog                            |  212 +++++++
 Makefile.am                          |    1 +
 Makefile.in                          |    1 +
 configure                            |   20 +-
 configure.in                         |    4 +-
 dapl.spec                            |    5 +-
 dapl.spec.in                         |    3 +
 dapl/openib_cma/device.c             |    3 -
 dapl/openib_common/dapl_ib_common.h  |   11 +-
 dapl/openib_common/dapl_mic_common.h |    4 +-
 dapl/openib_common/mem.c             |    1 -
 dapl/openib_common/qp.c              |   17 +-
 dapl/openib_common/util.c            |   79 ++-
 dapl/openib_mcm/cm.c                 |   30 +-
 dapl/openib_mcm/mix.c                |    7 +-
 dapl/openib_scm/cm.c                 |   78 ++-
 dapl/openib_scm/dapl_ib_util.h       |    1 -
 dapl/openib_scm/device.c             |    2 -
 dapl/openib_ucm/cm.c                 |   40 +-
 dapl/openib_ucm/device.c             |    2 -
 dapl/svc/mcm.c                       |   10 +-
 dapl/svc/mix.c                       |   90 ++-
 dapl/svc/mpxy_in.c                   |   59 +-
 dapl/svc/mpxy_out.c                  |   35 +-
 dapl/svc/mpxyd.c                     |   16 +-
 dapl/svc/mpxyd.h                     |    3 +
 doc/dat.conf                         |    8 +
 doc/mpxyd.conf                       |    2 +-
 test/dtest/dtest.c                   |  160 ++++-
 test/dtest/dtestx.c                  |   29 +-
 test/dtest/scripts/dtest_suite.sh    | 1117 ++++++++++++++++++++++++++++++++++
 31 files changed, 1899 insertions(+), 151 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 2851a0d..f59ee56 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,215 @@
+commit 58d757c07c8a3fcf81bdd0529f633bdc5251a06c
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Tue Feb 16 13:12:16 2016 -0800
+
+    mpxyd: fix segfault in proxy_out debug logging
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit b6a33ad0afe846749287f1636fa352e7609256e2
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Tue Feb 16 13:04:56 2016 -0800
+
+    mpxyd: fix debug memory buffer log function
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit dc4e874b9ff65533100c4797cabf29980a0b0bbe
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Tue Feb 16 12:53:53 2016 -0800
+
+    dtest: -D option is not valid with scif providers
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit d1b5d4836ad6e89f5ec739596dea502953a0bdcf
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Tue Feb 16 12:47:04 2016 -0800
+
+    dtest/dapltest: add new automated test suite for HOST to MIC testing
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+    Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit 722ffc3bf1045ff7113f08cbfce42cb8c84d6e67
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Tue Feb 16 12:15:08 2016 -0800
+
+    openib: update attributes correctly for iWARP transports
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit ab67173b8024e14009c266d76ab9ec0bdd0c5d1f
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Wed Feb 10 14:45:12 2016 -0800
+
+    openib_common: set providers mtu to active_mtu instead of 2048
+    
+    Better out of the box performance when setting mtu to active_mtu
+    instead of default settings of 2K. The new mtu settings are applied
+    on a per QP basis and negotiated via CM mtu 8-bit field. One of the
+    reserved 8 bit CM message fields is used to ensure compatibility
+    with older versions.
+    
+    If older endpoints are mixed with newer versions it will fallback to
+    the pre-existing 2K MTU settings, unless overridden by DAPL_IB_MTU.
+    
+    The change has been made across all providers including ucm, scm, mcm,
+    and cma (rdma_cm). The mcm provider on a MIC will notify the CCL Proxy
+    service of a DAPL_IB_MTU override via a new MIX_OP_FLAGS bit
+    MIX_OP_MTU during the open call.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit c399977f71d02e4c37d71d9b8ed5ba3e2b0ac33b
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Wed Feb 10 14:44:46 2016 -0800
+
+    mpxyd: set affinity default to 2 for best performance
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 0f2353e62df7dddc9a31323ffea97ac08d8b8cf6
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Tue Feb 9 09:37:47 2016 -0800
+
+    mcm: cleanup unused variable in dapls_ib_mr_register
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 11c6ae4a1abe1faf56f935cfd95f87cac2a17798
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Tue Jan 26 14:03:16 2016 -0800
+
+    dtest: enhancement to test, -D option for data check
+    
+    With -D option, dtest will run pingpong rdma write test
+    with data validation. Changes pattern during iterations.
+    Aborts and reports location/pattern with any miscompare.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+    Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit b9860007fc6800dead92303ed9fecccfb465a229
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Mon Jan 25 12:30:38 2016 -0800
+
+    mcm: add support for Intel Omni-Path driver (hfi) via mic MFO mode
+    
+    Set MIC based consumer to MFO (full offload) mode for both qib and new hfi devices.
+    Add to dat.conf entries for hfi verbs support. This can be run from mic or host
+    endpoints.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+    Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit 400550c8b4a4519ef4467f20cb23d4ac80dccd5e
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Mon Jan 25 11:51:33 2016 -0800
+
+    mpxyd: fix ordering issues with the CCL Proxy receive side forwarding mechanism
+    
+    scif_writeto doesn't guarantee ordering on DMA posting like IB rdma writes.
+    Since CCL Proxy is emulating IB semantics we must preserve order of
+    the rdma write request from MIC consumers via any proxy scif operations.
+    
+    Changes made to proxy-in to defer forwarding RR completed segments
+    unless they are middle segments of a larger write operation. On FS or LS
+    the previous scif_writeto DMA operations must be completed and signaled
+    before posting a first or last segment. Last segment scif_writeto
+    operation is ordered to ensure last byte is the last byte of
+    complete rdma write proxied operation.
+    
+    During scif_wt errors send WC error status for each pending segment
+    with rdma write operation for accurate proxy-out error processing.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit c266f94af627e395c0f060005078b8152c8afe99
+Author: Amir Hanania <amir.hanania at intel.com>
+Date:   Thu Dec 10 15:17:03 2015 -0800
+
+    dtest: report results only if one of the pingpong tests are run
+    
+    There are two diff ping pong test cases.
+    It was possible to run dtest with none of them.
+    
+    Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit efa6bed3e44f445c68b011662c75e59265805c74
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Thu Dec 10 14:48:05 2015 -0800
+
+    mpxyd: with abnormal CM termination a CM object can be referenced after QP destroy
+    
+    The proxy-in CQ is not flushed and processed properly during
+    mix_qp_destroy. Depending on the EP mode there can be 2 separate
+    connections with multiple CQs to process. Add new mix_cq_flush
+    function that will flush all pending work on TX and RX side of
+    proxy engine. CM object is destroyed and reset only after all
+    pending work is processed on ALL endpoint CQ associations.
+    Add error logging when WR resources are exhausted.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 2b294cd7dcdbccdc235c056791f36bd2821c2b9b
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Thu Dec 10 14:36:22 2015 -0800
+
+    mpxyd: proxy out WR resources exhausted with MFO mode endpoints
+    
+    WC status of IBV_WC_RETRY_EXC_ERR reported back to MIC client
+    
+    Operation processing thread doesn't yield properly
+    to enable tx thread to process completions and replenish
+    WR resources. Retries occur too quickly.
+    
+    add some new error logs for resource issues.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit a912661e87650e2313757e0b02cbbfbf35570bb7
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Wed Oct 21 09:49:45 2015 -0700
+
+    release note update for CCL Proxy and Platform BIOS recommendations
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit cb4cb9e300216a2c94082b9fe5df939c4972e1e9
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Fri Oct 16 13:08:11 2015 -0700
+
+    dtestx: add dat_ib_open_query only option with -q
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 1f4baf860cf2c17960885df7ff49cc0021fe317e
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Fri Oct 16 10:21:19 2015 -0700
+
+    scm: CONN_PENDING: SOCKOPT ERR Connection refused ->
+    
+    Error caused by cm_msg size compatibility issue with new v8
+    protocol and older socket cm providers (2.1.4 and older).
+    The ucm, cma, and mcm providers are not affected.
+    
+    Modify socket data sizes for SCM request/reply to interoperate
+    between new v8 with smaller private data and older protocols.
+    
+    Adjust SCM reply/rtu based on remote CM version and retry a failed
+    request with pre-v8 adjusted size in case of server side failure.
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 0494ec10176e07804c26b28484535252e47c3f99
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date:   Tue Sep 29 20:23:58 2015 -0700
+
+    Release 2.1.7
+    
+    Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
 commit 963e5d793867644c770c087f1ef443550779ca8c
 Author: Arlin Davis <arlin.r.davis at intel.com>
 Date:   Tue Sep 29 09:05:27 2015 -0700
diff --git a/Makefile.am b/Makefile.am
index 7adaf43..ea99e6e 100755
--- a/Makefile.am
+++ b/Makefile.am
@@ -723,6 +723,7 @@ EXTRA_DIST = dat/common/dat_dictionary.h \
 	     dapl.spec.in \
 	     mpxyd.init.in \
 	     $(man_MANS) \
+	     test/dtest/scripts/dtest_suite.sh \
 	     test/dapltest/scripts/cl.sh \
 	     test/dapltest/scripts/srv.sh \
 	     test/dapltest/scripts/regress.sh \
diff --git a/Makefile.in b/Makefile.in
index e2ed238..0acf971 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1862,6 +1862,7 @@ EXTRA_DIST = dat/common/dat_dictionary.h \
 	     dapl.spec.in \
 	     mpxyd.init.in \
 	     $(man_MANS) \
+	     test/dtest/scripts/dtest_suite.sh \
 	     test/dapltest/scripts/cl.sh \
 	     test/dapltest/scripts/srv.sh \
 	     test/dapltest/scripts/regress.sh \
diff --git a/configure b/configure
index a1f1a89..fca96a4 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.67 for dapl 2.1.7.
+# Generated by GNU Autoconf 2.67 for dapl 2.1.8.
 #
 # Report bugs to <linux-rdma at vger.kernel.org>.
 #
@@ -562,8 +562,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='dapl'
 PACKAGE_TARNAME='dapl'
-PACKAGE_VERSION='2.1.7'
-PACKAGE_STRING='dapl 2.1.7'
+PACKAGE_VERSION='2.1.8'
+PACKAGE_STRING='dapl 2.1.8'
 PACKAGE_BUGREPORT='linux-rdma at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -1318,7 +1318,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures dapl 2.1.7 to adapt to many kinds of systems.
+\`configure' configures dapl 2.1.8 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1388,7 +1388,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of dapl 2.1.7:";;
+     short | recursive ) echo "Configuration of dapl 2.1.8:";;
    esac
   cat <<\_ACEOF
 
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-dapl configure 2.1.7
+dapl configure 2.1.8
 generated by GNU Autoconf 2.67
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -1935,7 +1935,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by dapl $as_me 2.1.7, which was
+It was created by dapl $as_me 2.1.8, which was
 generated by GNU Autoconf 2.67.  Invocation command line was
 
   $ $0 $@
@@ -2803,7 +2803,7 @@ fi
 # Define the identity of the package.
 
  PACKAGE=dapl
- VERSION=2.1.7
+ VERSION=2.1.8
 
 
 cat >>confdefs.h <<_ACEOF
@@ -13281,7 +13281,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by dapl $as_me 2.1.7, which was
+This file was extended by dapl $as_me 2.1.8, which was
 generated by GNU Autoconf 2.67.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -13347,7 +13347,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-dapl config.status 2.1.7
+dapl config.status 2.1.8
 configured by $0, generated by GNU Autoconf 2.67,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.in b/configure.in
index 5fbbfe1..56c99e5 100755
--- a/configure.in
+++ b/configure.in
@@ -1,12 +1,12 @@
 dnl Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.57)
-AC_INIT(dapl, 2.1.7, linux-rdma at vger.kernel.org)
+AC_INIT(dapl, 2.1.8, linux-rdma at vger.kernel.org)
 AC_CONFIG_SRCDIR([dat/udat/udat.c])
 AC_CONFIG_AUX_DIR(config)
 AC_CONFIG_MACRO_DIR([m4])
 AM_CONFIG_HEADER(config.h)
-AM_INIT_AUTOMAKE(dapl, 2.1.7)
+AM_INIT_AUTOMAKE(dapl, 2.1.8)
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 AM_PROG_LIBTOOL
diff --git a/dapl.spec b/dapl.spec
index 4d26fd3..74761ca 100644
--- a/dapl.spec
+++ b/dapl.spec
@@ -37,7 +37,7 @@
 %{!?_CONF: %define _CONF ""}
 
 Name: dapl
-Version: 2.1.7
+Version: 2.1.8
 Release: 1%{?dist}
 Summary: A Library for userspace access to RDMA devices using OS Agnostic DAT APIs, proxy daemon for offloading RDMA 
 
@@ -153,6 +153,9 @@ fi
 mv /tmp/%{version}-dat.conf %{_sysconfdir}/dat.conf
 
 %changelog
+* Tue Feb 16 2016 Arlin Davis <ardavis at ichips.intel.com> - 2.1.8
+- DAT/DAPL Version 2.1.8 Release 1, OFED 3.18-2, MPSS 3.7 
+
 * Tue Sep 29 2015 Arlin Davis <ardavis at ichips.intel.com> - 2.1.7
 - DAT/DAPL Version 2.1.7 Release 1, OFED 3.18-1 GA
 
diff --git a/dapl.spec.in b/dapl.spec.in
index 2f16477..fbec001 100755
--- a/dapl.spec.in
+++ b/dapl.spec.in
@@ -153,6 +153,9 @@ fi
 mv /tmp/%{version}-dat.conf %{_sysconfdir}/dat.conf
 
 %changelog
+* Tue Feb 16 2016 Arlin Davis <ardavis at ichips.intel.com> - 2.1.8
+- DAT/DAPL Version 2.1.8 Release 1, OFED 3.18-2, MPSS 3.7 
+
 * Tue Sep 29 2015 Arlin Davis <ardavis at ichips.intel.com> - 2.1.7
 - DAT/DAPL Version 2.1.7 Release 1, OFED 3.18-1 GA
 
diff --git a/dapl/openib_cma/device.c b/dapl/openib_cma/device.c
index 9e87355..ff6c174 100755
--- a/dapl/openib_cma/device.c
+++ b/dapl/openib_cma/device.c
@@ -394,9 +394,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name,
 #endif
 
 done:
-	/* set default IB MTU */
-	hca_ptr->ib_trans.ib_cm.mtu = dapl_ib_mtu(2048);
-
 	return DAT_SUCCESS;
 }
 
diff --git a/dapl/openib_common/dapl_ib_common.h b/dapl/openib_common/dapl_ib_common.h
index 69ec31b..8ff898f 100755
--- a/dapl/openib_common/dapl_ib_common.h
+++ b/dapl/openib_common/dapl_ib_common.h
@@ -71,6 +71,7 @@ struct dcm_ib_qp {
 	char			 *wr_buf_rx; /* mcm_wr_rx_t entries, devices without inline data  */
 	struct ibv_mr		 *wr_buf_rx_mr;
 #endif
+	uint8_t			 mtu;      /* RC QP MTU, cm exchange, min(local,peer) */
 };
 
 #define DCM_CQ_TX 		0x1
@@ -150,7 +151,8 @@ typedef struct _ib_cm_msg
 	uint8_t			sportx; /* extend to 24 bits */
 	uint8_t			dportx; /* extend to 24 bits */
 	uint8_t			rtns; 	/* retransmissions */
-	uint8_t			resv[2];
+	uint8_t			mtu;	/* MTU */
+	uint8_t			resv[1];
 	union dcm_addr		saddr;
 	union dcm_addr		daddr;
 	union dcm_addr		saddr_alt;
@@ -168,6 +170,11 @@ typedef struct _ib_named_attr
 	 const char *mtu;
 	 const char *port;
 	 const char *port_num;
+	 const char *iw_ext;
+	 const char *ib_ext;
+	 const char *i_data;
+	 const char *f_add;
+	 const char *c_swap;
 
 } ib_named_attr_t;
 
@@ -243,7 +250,7 @@ typedef uint16_t		ib_hca_port_t;
 #define DCM_ACK_RETRY	7  /* 3 bits, 7 * 4.2  == 30 seconds */
 #define DCM_RNR_TIMER	12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
 #define DCM_RNR_RETRY	7  /* 3 bits, 7 == infinite */
-#define DCM_IB_MTU	2048
+#define DCM_IB_MTU 	4096	/* new default MTU size */
 
 /* Global routing defaults */
 #define DCM_GLOBAL	0       /* global routing is disabled */
diff --git a/dapl/openib_common/dapl_mic_common.h b/dapl/openib_common/dapl_mic_common.h
index 86a815e..0231013 100755
--- a/dapl/openib_common/dapl_mic_common.h
+++ b/dapl/openib_common/dapl_mic_common.h
@@ -234,7 +234,8 @@ typedef struct dat_mcm_msg
 	uint32_t		s_id;  /* src pid */
 	uint32_t		d_id;  /* dst pid */
 	uint8_t			rd_in; /* atomic_rd_in */
-	uint8_t			rsvd[4];
+	uint8_t			mtu;   /* mtu */
+	uint8_t			rsvd[3];
 	uint8_t			seg_sz; /* data segment size in power of 2 */
 	dat_mcm_addr_t		saddr1;	/* QPt local,  MPXY or MCM on non-MIC node */
 	dat_mcm_addr_t		saddr2; /* QPr local,  MIC  or MCM on non-MIC node or MPXY */
@@ -369,6 +370,7 @@ typedef enum dat_mix_op_flags
     MIX_OP_ASYNC  = 0x08,
     MIX_OP_INLINE = 0x10,
     MIX_OP_SET    = 0x20,
+    MIX_OP_MTU    = 0x40,
 
 } dat_mix_op_flags_t;
 
diff --git a/dapl/openib_common/mem.c b/dapl/openib_common/mem.c
index 7f5ea6a..34e4234 100755
--- a/dapl/openib_common/mem.c
+++ b/dapl/openib_common/mem.c
@@ -164,7 +164,6 @@ dapls_ib_mr_register(IN DAPL_IA * ia_ptr,
 		     IN DAT_MEM_PRIV_FLAGS privileges, IN DAT_VA_TYPE va_type)
 #ifdef _OPENIB_MCM_
 {
-	struct ibv_device *ibv_dev;
 	int ib_access = dapls_convert_privileges(privileges);
 
 	dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
diff --git a/dapl/openib_common/qp.c b/dapl/openib_common/qp.c
index 01f91ca..3d622ab 100755
--- a/dapl/openib_common/qp.c
+++ b/dapl/openib_common/qp.c
@@ -648,19 +648,22 @@ dapls_modify_qp_state(IN struct ibv_qp		*qp_handle,
 
 		qp_attr.dest_qp_num = ntohl(qpn);
 		qp_attr.rq_psn = 1;
-		qp_attr.path_mtu = ia_ptr->hca_ptr->ib_trans.ib_cm.mtu;
 		qp_attr.min_rnr_timer =	ia_ptr->hca_ptr->ib_trans.ib_cm.rnr_timer;
+		qp_attr.path_mtu = ep_ptr->qp_handle->mtu ?
+				   ep_ptr->qp_handle->mtu :
+				   ia_ptr->hca_ptr->ib_trans.ib_cm.mtu;
 #ifdef _OPENIB_MCM_
 		qp_attr.max_dest_rd_atomic = ia_ptr->hca_ptr->ib_trans.ib_cm.rd_atom_in;
 #else
 		qp_attr.max_dest_rd_atomic = ep_ptr->param.ep_attr.max_rdma_read_in;
 #endif
-		dapl_dbg_log(DAPL_DBG_TYPE_EP,
-				" QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x"
-				" port %d ep %p qp_state %d rd_atomic %d\n",
-				qp_handle->qp_type, qp_handle->qp_num,
-				ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num,
-				ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic);
+		dapl_log(DAPL_DBG_TYPE_EP,
+		  	 " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x"
+			 " port %d ep %p qp_state %d rd_atomic %d mtu %d lmtu %d\n",
+			 qp_handle->qp_type, qp_handle->qp_num,
+			 ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num,
+			 ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic,
+			 qp_attr.path_mtu, ia_ptr->hca_ptr->ib_trans.ib_cm.mtu);
 
 		/* address handle. RC and UD */
 		qp_attr.ah_attr.dlid = ntohs(lid);
diff --git a/dapl/openib_common/util.c b/dapl/openib_common/util.c
index 55bda3b..ec5d72f 100755
--- a/dapl/openib_common/util.c
+++ b/dapl/openib_common/util.c
@@ -285,7 +285,7 @@ enum ibv_mtu dapl_ib_mtu(int mtu)
 	case 4096:
 		return IBV_MTU_4096;
 	default:
-		return IBV_MTU_1024;
+		return IBV_MTU_4096;
 	}
 }
 
@@ -303,7 +303,7 @@ const char *dapl_ib_mtu_str(enum ibv_mtu mtu)
 	case IBV_MTU_4096:
 		return "4096";
 	default:
-		return "1024";
+		return "4096";
 	}
 }
 
@@ -424,6 +424,13 @@ DAT_RETURN dapls_ib_query_hca(IN DAPL_HCA * hca_ptr,
 	dev_attr.max_qp_wr = DAPL_MIN(dev_attr.max_qp_wr,
 				      dapl_os_get_env_val("DAPL_WR_MAX", dev_attr.max_qp_wr));
 
+	/* MTU to active by default, reset if env set and <= active_mtu */
+	if (getenv("DAPL_IB_MTU"))
+		tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu,
+					 dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU)));
+	else
+		tp->ib_cm.mtu = port_attr.active_mtu;
+
 #ifdef _OPENIB_MCM_
 	/* Adjust for CCL Proxy; limited sge's, no READ support, reduce QP and RDMA limits */
 	dev_attr.max_sge = DAPL_MIN(dev_attr.max_sge, DAT_MIX_SGE_MAX);
@@ -483,21 +490,32 @@ DAT_RETURN dapls_ib_query_hca(IN DAPL_HCA * hca_ptr,
 #ifdef _OPENIB_MCM_
 		if (!MFO_EP(&hca_ptr->ib_trans.addr))
 #endif
-			if (hca_ptr->ib_hca_handle->device->transport_type == IBV_TRANSPORT_IWARP)
-				ia_attr->max_iov_segments_per_rdma_read = 1;
+		if (hca_ptr->ib_hca_handle->device->transport_type == IBV_TRANSPORT_IWARP)
+			ia_attr->max_iov_segments_per_rdma_read = 1;
 		ia_attr->max_iov_segments_per_rdma_write = dev_attr.max_sge;
 		ia_attr->num_transport_attr = 0;
 		ia_attr->transport_attr = NULL;
 		ia_attr->num_vendor_attr = 0;
 		ia_attr->vendor_attr = NULL;
 #ifdef DAT_EXTENSIONS
-		ia_attr->extension_supported = DAT_EXTENSION_IB;
-		ia_attr->extension_version = DAT_IB_EXTENSION_VERSION;
+		if (hca_ptr->ib_hca_handle->device->transport_type == IBV_TRANSPORT_IWARP) {
+			ia_attr->extension_supported = DAT_EXTENSION_NONE;
+			tp->na.ib_ext = "FALSE";
+			tp->na.i_data = "FALSE";
+			tp->na.f_add  = "FALSE";
+			tp->na.c_swap = "FALSE";
+		} else {
+			ia_attr->extension_supported = DAT_EXTENSION_IB;
+			ia_attr->extension_version = DAT_IB_EXTENSION_VERSION;
+			tp->na.ib_ext = "TRUE";
+			tp->na.i_data = "TRUE";
+			tp->na.f_add  = "TRUE";
+			tp->na.c_swap = "TRUE";
+		}
 #endif
 		/* save key device attributes for CM exchange */
 		tp->ib_cm.rd_atom_in  = dev_attr.max_qp_rd_atom;
 		tp->ib_cm.rd_atom_out = dev_attr.max_qp_init_rd_atom;
-		tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu, tp->ib_cm.mtu);
 		tp->ib_cm.ack_timer = DAPL_MAX(dev_attr.local_ca_ack_delay, tp->ib_cm.ack_timer);
 
 		/* set provider/transport specific named attributes */
@@ -888,28 +906,47 @@ DAT_NAMED_ATTR ib_attrs[] = {
 void dapls_query_provider_specific_attr(IN DAPL_IA * ia_ptr,
 					IN DAT_PROVIDER_ATTR * attr_ptr)
 {
+	int i;
+
 	attr_ptr->num_provider_specific_attr = SPEC_ATTR_SIZE(ib_attrs);
 	attr_ptr->provider_specific_attr = ib_attrs;
 
 	dapl_log(DAPL_DBG_TYPE_UTIL,
 		 " prov_attr: %p sz %d\n", ib_attrs, SPEC_ATTR_SIZE(ib_attrs));
 
-	/* update common attributes from providers */
-	ib_attrs[1].value = ia_ptr->hca_ptr->ib_trans.na.dev;
-	ib_attrs[2].value = ia_ptr->hca_ptr->ib_trans.na.mode;
-	ib_attrs[3].value = ia_ptr->hca_ptr->ib_trans.na.read;
-	ib_attrs[4].value = ia_ptr->hca_ptr->ib_trans.guid_str;
-	ib_attrs[5].value = ia_ptr->hca_ptr->ib_trans.na.mtu;
-	ib_attrs[6].value = ia_ptr->hca_ptr->ib_trans.na.port;
-	ib_attrs[7].value = ia_ptr->hca_ptr->ib_trans.na.port_num;
+	for (i=0; i<SPEC_ATTR_SIZE(ib_attrs); i++) {
+		/* update attributes from IA query results */
+		if (!strcmp(ib_attrs[i].name, "DAT_IB_DEVICE_NAME"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.dev;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_CONNECTIVITY_MODE"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.mode;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_RDMA_READ"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.read;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_NODE_GUID"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.guid_str;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_TRANSPORT_MTU"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.mtu;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_PORT_STATUS"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.port;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_PORT_NUMBER"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.port_num;
+		else if (!strcmp(ib_attrs[i].name, "DAT_EXTENSION_INTERFACE"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.ib_ext;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_IMMED_DATA"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.i_data;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_FETCH_AND_ADD"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.f_add;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_CMP_AND_SWAP"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.c_swap;
 #ifdef _OPENIB_MCM_
-{
-	int i = attr_ptr->num_provider_specific_attr;
-	ib_attrs[i-3].value = ia_ptr->hca_ptr->ib_trans.fam_str;
-	ib_attrs[i-2].value = ia_ptr->hca_ptr->ib_trans.mod_str;
-	ib_attrs[i-1].value = ia_ptr->hca_ptr->ib_trans.ver_str;
-}
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_PROXY_CPU_FAMILY"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.fam_str;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_PROXY_CPU_MODEL"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.mod_str;
+		else if (!strcmp(ib_attrs[i].name, "DAT_IB_PROXY_VERSION"))
+			ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.ver_str;
 #endif
+	}
 }
 
 /*
diff --git a/dapl/openib_mcm/cm.c b/dapl/openib_mcm/cm.c
index f2a4b8d..48ff0b3 100755
--- a/dapl/openib_mcm/cm.c
+++ b/dapl/openib_mcm/cm.c
@@ -1104,6 +1104,11 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
 	if (msg->seg_sz) /* set po2 seg_sz, if provided */
 		cm->msg.seg_sz = msg->seg_sz;
 
+	/* Set QP MTU, if negotiated. 2K for compatibility */
+	ep->qp_handle->mtu = msg->mtu ?
+			     DAPL_MIN(msg->mtu, cm->hca->ib_trans.ib_cm.mtu):
+			     getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
 	cm->msg.d_id = msg->s_id;
 	dapl_os_memcpy(&ep->remote_ia_address, &msg->saddr2, sizeof(dat_mcm_addr_t));
 	dapl_os_memcpy(&cm->msg.daddr2, &msg->saddr2, sizeof(dat_mcm_addr_t));
@@ -1129,10 +1134,12 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
 	}
 		
 	dapl_dbg_log(DAPL_DBG_TYPE_CM,
-		     " CONN_RTU: DST lid=%x, QPr=%x, QPt=%x qp_type=%d, port=%x psize=%d\n",
+		     " CONN_RTU: DST lid=%x, QPr=%x, QPt=%x qp_type=%d,"
+		     " port=%x psize=%d mtu=%d,%d\n",
 		     ntohs(cm->msg.daddr1.lid), ntohl(cm->msg.daddr1.qpn),
 		     ntohl(cm->msg.daddr2.qpn), cm->msg.daddr1.qp_type,
-		     ntohs(msg->sport), ntohs(msg->p_size));
+		     ntohs(msg->sport), ntohs(msg->p_size),
+		     cm->tp->ib_cm.mtu, cm->msg.mtu);
 
 	if (ntohs(msg->op) == MCM_REP)
 		event = IB_CME_CONNECTED;
@@ -1227,6 +1234,7 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
 	
 	/* Send RTU, no private data */
 	cm->msg.op = htons(MCM_RTU);
+	cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
 	
 	dapl_os_lock(&cm->lock);
 	cm->state = MCM_CONNECTED;
@@ -1249,11 +1257,12 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
 				     cm->msg.p_data, ntohs(cm->msg.p_size), cm->ep);
 
 	dapl_log(DAPL_DBG_TYPE_CM_EST,
-		 " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s\n",
+		 " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s mtu %d\n",
 		 cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid),
 		 ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn),
 		 ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport),
-		 ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map));
+		 ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map),
+		 cm->ep->qp_handle->mtu);
 
 	mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 0);
 
@@ -1291,6 +1300,7 @@ static void mcm_accept(ib_cm_srvc_handle_t cm, dat_mcm_msg_t *msg)
 	acm->msg.p_size = msg->p_size;
 	acm->msg.d_id = msg->s_id;
 	acm->msg.rd_in = msg->rd_in;
+	acm->msg.mtu = msg->mtu; /* save peer MTU */
 	if (msg->seg_sz) /* set po2 seg_sz, if provided */
 		acm->msg.seg_sz = msg->seg_sz;
 
@@ -1359,11 +1369,12 @@ static void mcm_accept_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
 	dapls_cr_callback(cm, IB_CME_CONNECTED, NULL, 0, cm->sp);
 
 	dapl_log(DAPL_DBG_TYPE_CM_EST,
-		 " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s\n",
+		 " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s mtu %d\n",
 		 cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid),
 		 ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn),
 		 ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport),
-		 ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map));
+		 ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map),
+		 cm->ep->qp_handle->mtu);
 
 	mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 1);
 	return;
@@ -1489,6 +1500,11 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
 	ep->param.ep_attr.max_rdma_read_out =
 		DAPL_MIN(ep->param.ep_attr.max_rdma_read_out, cm->msg.rd_in);
 
+	/* Set QP MTU, if negotiated. 2K for compatibility */
+	ep->qp_handle->mtu = cm->msg.mtu ?
+			     DAPL_MIN(cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu):
+			     getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
 	/* modify QPr to RTR and then to RTS, QPr (qp) to remote QPt (daddr2), !xsocket */
 	dapl_os_lock(&ep->header.lock);
 	if (!MXF_EP(&cm->hca->ib_trans.addr)) {
@@ -1567,6 +1583,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
 	/* setup local QPr info (if !KR) and type from EP, copy pdata, for reply */
 	cm->msg.op = htons(MCM_REP);
 	cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+	cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
 
 	if (!MXF_EP(&cm->hca->ib_trans.addr)) {
 		cm->msg.saddr1.qpn = htonl(ep->qp_handle->qp->qp_num);
@@ -1680,6 +1697,7 @@ dapls_ib_connect(IN DAT_EP_HANDLE ep_handle,
 	
         /* set max rdma inbound requests */
         cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+        cm->msg.mtu = cm->tp->ib_cm.mtu; /* local MTU to peer */
 
 	if (p_size) {
 		cm->msg.p_size = htons(p_size);
diff --git a/dapl/openib_mcm/mix.c b/dapl/openib_mcm/mix.c
index 970b372..5d96eb5 100755
--- a/dapl/openib_mcm/mix.c
+++ b/dapl/openib_mcm/mix.c
@@ -62,8 +62,8 @@ int dapli_mix_mode(ib_hca_transport_t *tp, char *name)
 		return 0;
 	}
 
-	/*  MIC node: "qib" device requires full offload */
-	mfo_dev = !dapl_os_pstrcmp("qib", name);
+	/*  MIC node: "qib" and "hfi" devices requires full offload */
+	mfo_dev = !dapl_os_pstrcmp("qib", name) || !dapl_os_pstrcmp("hfi", name);
 	if (mfo_mode || mfo_dev) {
 		tp->addr.ep_map = MIC_FULL_DEV; /* MIC with full proxy offload, no direct verbs */
 	}
@@ -171,6 +171,9 @@ int dapli_mix_open(ib_hca_transport_t *tp, char *name, int port, int query_only)
 	msg.port = port;
 	strcpy((char*)&msg.name, name);
 
+	if (getenv("DAPL_IB_MTU"))
+		msg.hdr.flags |= MIX_OP_MTU;
+
 	/* send any overridden attributes to proxy */
 	msg.dev_attr.ack_timer = tp->ib_cm.ack_timer;
 	msg.dev_attr.ack_retry = tp->ib_cm.ack_retry;
diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index ecc5418..35164ef 100755
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -63,6 +63,8 @@
 
 extern char *gid_str;
 
+#define SCM_BC_DIFF 2 /* cm_msg adjust, pre v8 */
+
 /* forward declarations */
 static DAT_RETURN
 dapli_socket_connect(DAPL_EP * ep_ptr,
@@ -561,6 +563,11 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
 
 	/* send qp info and pdata to remote peer */
 	exp = sizeof(ib_cm_msg_t) - DCM_MAX_PDATA_SIZE;
+	if (cm_ptr->retry == SCM_CR_RETRY-1) {
+		exp += SCM_BC_DIFF; /* retry with pre-v8 req */
+		dapl_log(DAPL_DBG_TYPE_CM_WARN,
+			 " CONN_REQ: retry pre-v8 protocol; peer != v8\n");
+	}
 	iov[0].iov_base = (void *)&cm_ptr->msg;
 	iov[0].iov_len = exp;
 	if (cm_ptr->msg.p_size) {
@@ -583,16 +590,14 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
 		goto bail;
 	}
 
- 	dapl_dbg_log(DAPL_DBG_TYPE_CM,
-		     " CONN_PENDING: sending SRC lid=0x%x,"
-		     " qpn=0x%x, psize=%d\n",
-		     ntohs(cm_ptr->msg.saddr.ib.lid),
-		     ntohl(cm_ptr->msg.saddr.ib.qpn), 
-		     ntohs(cm_ptr->msg.p_size));
-	dapl_dbg_log(DAPL_DBG_TYPE_CM,
-		     " CONN_PENDING: SRC GID %s\n",
-		     inet_ntop(AF_INET6, &cm_ptr->hca->ib_trans.gid,
-			       gid_str, sizeof(gid_str)));
+	dapl_log(DAPL_DBG_TYPE_CM,
+		 " CONN_REQ: (%d) SRC lid=0x%x,"
+		 " qpn=0x%x, psize=%d GID %s\n",
+		 exp, ntohs(cm_ptr->msg.saddr.ib.lid),
+		 ntohl(cm_ptr->msg.saddr.ib.qpn),
+		 ntohs(cm_ptr->msg.p_size),
+		 inet_ntop(AF_INET6, &cm_ptr->hca->ib_trans.gid,
+		           gid_str, sizeof(gid_str)));
 
 	DAPL_CNTR(((DAPL_IA *)dapl_llist_peek_head(&cm_ptr->hca->ia_list_head)), DCNT_IA_CM_REQ_TX);
 	return;
@@ -666,6 +671,7 @@ dapli_socket_connect(DAPL_EP * ep_ptr,
 	/* REQ: QP info in msg.saddr, IA address in msg.daddr, and pdata */
 	cm_ptr->hca = ia_ptr->hca_ptr;
 	cm_ptr->msg.op = ntohs(DCM_REQ);
+	cm_ptr->msg.mtu = ia_ptr->hca_ptr->ib_trans.ib_cm.mtu; /* local MTU to peer */
 	cm_ptr->msg.saddr.ib.qpn = htonl(ep_ptr->qp_handle->qp->qp_num);
 	cm_ptr->msg.saddr.ib.qp_type = ep_ptr->qp_handle->qp->qp_type;
 	cm_ptr->msg.saddr.ib.lid = ia_ptr->hca_ptr->ib_trans.lid;
@@ -730,9 +736,16 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
 	socklen_t sl;
 
 	/* read DST information into cm_ptr, overwrite SRC info */
-	dapl_dbg_log(DAPL_DBG_TYPE_EP, " connect_rtu: recv peer QP data\n");
-
 	len = recv(cm_ptr->socket, (char *)&cm_ptr->msg, exp, 0);
+
+	if (ntohs(cm_ptr->msg.ver) < DCM_VER_XPS) {
+		len += recv(cm_ptr->socket, (char *)&cm_ptr->msg+len, SCM_BC_DIFF, 0);
+		exp += SCM_BC_DIFF;
+	}
+
+	dapl_log(DAPL_DBG_TYPE_CM, " CONN_REP_IN: ver %d cm_sz %d, p_sz %d\n",
+		ntohs(cm_ptr->msg.ver), exp, ntohs(cm_ptr->msg.p_size));
+
 	if (len != exp || ntohs(cm_ptr->msg.ver) < DCM_VER_MIN) {
 		int err = dapl_socket_errno();
 		dapl_log(DAPL_DBG_TYPE_CM_WARN,
@@ -745,9 +758,10 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
 			 cm_ptr->retry);
 
 		/* Retry; corner case where server tcp stack resets under load */
-		if (err == ECONNRESET && --cm_ptr->retry) {
+		if ((err == ECONNRESET && --cm_ptr->retry) || (--cm_ptr->retry == SCM_CR_RETRY-1)) {
 			closesocket(cm_ptr->socket);
 			cm_ptr->socket = DAPL_INVALID_SOCKET;
+			dapl_log(DAPL_DBG_TYPE_CM_WARN, "CONN_REQ: retry %d\n", cm_ptr->retry);
 			dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr, 
 					     ntohs(((struct sockaddr_in *)&cm_ptr->addr)->sin_port) - 1000,
 					     ntohs(cm_ptr->msg.p_size), &cm_ptr->msg.p_data, cm_ptr->retry);
@@ -800,9 +814,10 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
 		len = recv(cm_ptr->socket, cm_ptr->msg.p_data, exp, 0);
 		if (len != exp) {
 			int err = dapl_socket_errno();
-			dapl_log(DAPL_DBG_TYPE_ERR,
-				 " CONN_RTU read pdata: ERR 0x%x %s, rcnt=%d -> %s\n",
-				 err, strerror(err), len,
+			dapl_log(DAPL_DBG_TYPE_CM_WARN,
+				 " CONN_RTU read pdata: ERR 0x%x %s"
+				 " rcv %d !=  exp %d -> %s\n",
+				 err, strerror(err), len, exp,
 				 inet_ntoa(((struct sockaddr_in *)
 					    ep_ptr->param.
 					    remote_ia_address_ptr)->sin_addr));
@@ -834,6 +849,11 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
 				DAPL_MIN(ep_ptr->param.ep_attr.max_rdma_read_out,
 					 cm_ptr->msg.rd_in);
 
+	/* Set QP MTU, if negotiated. 2K for compatibility */
+	ep_ptr->qp_handle->mtu = cm_ptr->msg.mtu ?
+				 DAPL_MIN(cm_ptr->msg.mtu, cm_ptr->hca->ib_trans.ib_cm.mtu):
+				 getenv("DAPL_IB_MTU") ? cm_ptr->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
 	/* modify QP to RTR and then to RTS with remote info */
 	dapl_os_lock(&ep_ptr->header.lock);
 	if (dapls_modify_qp_state(ep_ptr->qp_handle->qp,
@@ -881,6 +901,7 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
 	dapl_os_unlock(&cm_ptr->lock);
 
 	cm_ptr->msg.op = ntohs(DCM_RTU);
+	cm_ptr->msg.mtu = ep_ptr->qp_handle->mtu; /* send negotiated MTU */
 	if (send(cm_ptr->socket, (char *)&cm_ptr->msg, 4, 0) == -1) {
 		int err = dapl_socket_errno();
 		dapl_log(DAPL_DBG_TYPE_ERR,
@@ -954,10 +975,11 @@ ud_bail:
 					     DCM_MAX_PDATA_SIZE, ep_ptr);
 	}
 	dapl_log(DAPL_DBG_TYPE_CM_EST,
-		 " SCM ACTIVE CONN: %x -> %s %x\n",
+		 " SCM ACTIVE CONN: %x -> %s %x mtu %d\n",
 		 ntohs(((struct sockaddr_in *) &cm_ptr->addr)->sin_port),
 		 inet_ntoa(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_addr),
-		 ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)-1000);
+		 ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)-1000,
+		 ep_ptr->qp_handle->mtu);
 	return;
 
 bail:
@@ -1121,6 +1143,12 @@ static void dapli_socket_accept_data(ib_cm_srvc_handle_t acm_ptr)
 
 	/* read in DST QP info, IA address. check for private data */
 	len = recv(acm_ptr->socket, (char *)&acm_ptr->msg, exp, 0);
+
+	if (ntohs(acm_ptr->msg.ver) < DCM_VER_XPS) {
+		len += recv(acm_ptr->socket, (char *)&acm_ptr->msg+len, SCM_BC_DIFF, 0);
+		exp += SCM_BC_DIFF;
+	}
+
 	if (len != exp || ntohs(acm_ptr->msg.ver) < DCM_VER_MIN) {
 		int err = dapl_socket_errno();
 		dapl_log(DAPL_DBG_TYPE_ERR,
@@ -1251,6 +1279,14 @@ dapli_socket_accept_usr(DAPL_EP * ep_ptr,
 				DAPL_MIN(ep_ptr->param.ep_attr.max_rdma_read_out,
 					 cm_ptr->msg.rd_in);
 
+	if (ntohs(cm_ptr->msg.ver) <  DCM_VER_XPS)
+		exp += SCM_BC_DIFF;
+
+	/* Set QP MTU, if negotiated. 2K for compatibility */
+	ep_ptr->qp_handle->mtu = cm_ptr->msg.mtu ?
+				 DAPL_MIN(cm_ptr->msg.mtu, cm_ptr->hca->ib_trans.ib_cm.mtu):
+				 getenv("DAPL_IB_MTU") ? cm_ptr->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
 	/* modify QP to RTR and then to RTS with remote info already read */
 	dapl_os_lock(&ep_ptr->header.lock);
 	if (dapls_modify_qp_state(ep_ptr->qp_handle->qp,
@@ -1290,6 +1326,7 @@ dapli_socket_accept_usr(DAPL_EP * ep_ptr,
 	local.ver = htons(DCM_VER);
 	local.op = htons(DCM_REP);
 	local.rd_in = ep_ptr->param.ep_attr.max_rdma_read_in;
+	local.mtu = ep_ptr->qp_handle->mtu; /* send negotiated MTU */
 	local.saddr.ib.qpn = htonl(ep_ptr->qp_handle->qp->qp_num);
 	local.saddr.ib.qp_type = ep_ptr->qp_handle->qp->qp_type;
 	local.saddr.ib.lid = ia_ptr->hca_ptr->ib_trans.lid;
@@ -1435,10 +1472,11 @@ ud_bail:
 		dapls_cr_callback(cm_ptr, event, NULL, 0, cm_ptr->sp);
 	}
 	dapl_log(DAPL_DBG_TYPE_CM_EST,
-		 " SCM PASSIVE CONN: %x <- %s %x\n",
+		 " SCM PASSIVE CONN: %x <- %s %x mtu %d\n",
 		 cm_ptr->sp->conn_qual,
 		 inet_ntoa(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_addr),
-		 ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port));
+		 ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port),
+		 cm_ptr->ep->qp_handle->mtu);
 	return;
       
 bail:
diff --git a/dapl/openib_scm/dapl_ib_util.h b/dapl/openib_scm/dapl_ib_util.h
index b03018b..ad5bc60 100755
--- a/dapl/openib_scm/dapl_ib_util.h
+++ b/dapl/openib_scm/dapl_ib_util.h
@@ -65,7 +65,6 @@ typedef dp_ib_cm_handle_t	ib_cm_srvc_handle_t;
 #define SCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
 #define SCM_RNR_RETRY 7  /* 3 bits, 7 == infinite */
 #define SCM_CR_RETRY  5  /* retries for busy server, connect refused */
-#define SCM_IB_MTU    2048
 
 /* Global routing defaults */
 #define SCM_GLOBAL	0	/* global routing is disabled */
diff --git a/dapl/openib_scm/device.c b/dapl/openib_scm/device.c
index 43f9eaf..b210a15 100755
--- a/dapl/openib_scm/device.c
+++ b/dapl/openib_scm/device.c
@@ -371,8 +371,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name,
 	    dapl_os_get_env_val("DAPL_HOP_LIMIT", SCM_HOP_LIMIT);
 	hca_ptr->ib_trans.ib_cm.tclass =
 	    dapl_os_get_env_val("DAPL_TCLASS", SCM_TCLASS);
-	hca_ptr->ib_trans.ib_cm.mtu =
-	    dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", SCM_IB_MTU));
 
 	if (flags & DAPL_OPEN_QUERY)
 		goto done;
diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index 3d06c82..88dd890 100755
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -622,14 +622,15 @@ dp_ib_cm_handle_t ucm_cm_find(ib_hca_transport_t *tp, ib_cm_msg_t *msg)
 	lock = &tp->lock;
 
 	dapl_log(DAPL_DBG_TYPE_CM,
-		" ucm_recv: %s %d %x %x i %x c %x < %d %x %x i %x c %x\n",
+		" ucm_recv: %s %d %x %x i %x c %x < %d %x %x i %x c %x (%d,%d)\n",
 		dapl_cm_op_str(msg_op),
 		ntohl(msg->d_id), ntohs(msg->daddr.ib.lid),
 		UCM_PORT_NTOH(msg->dportx, msg->dport),
 		ntohl(msg->daddr.ib.qpn), ntohl(msg->dqpn),
 		ntohl(msg->s_id), ntohs(msg->saddr.ib.lid),
 		UCM_PORT_NTOH(msg->sportx, msg->sport),
-		ntohl(msg->saddr.ib.qpn), ntohl(msg->sqpn));
+		ntohl(msg->saddr.ib.qpn), ntohl(msg->sqpn),
+		tp->ib_cm.mtu, msg->mtu);
 
 retry_listenq:
 	dapl_os_lock(lock);
@@ -1467,12 +1468,13 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
 {
 	dapl_log(DAPL_DBG_TYPE_EP, 
 		 " connect: lid %x i_qpn %x lport %x p_sz=%d -> "
-		 " lid %x c_qpn %x rport %x\n",
+		 " lid %x c_qpn %x rport %x l_mtu %d\n",
 		 htons(cm->msg.saddr.ib.lid), htonl(cm->msg.saddr.ib.qpn),
 		 UCM_PORT_NTOH(cm->msg.sportx,cm->msg.sport),
 		 htons(cm->msg.p_size),
 		 htons(cm->msg.daddr.ib.lid), htonl(cm->msg.dqpn),
-		 UCM_PORT_NTOH(cm->msg.dportx,cm->msg.dport));
+		 UCM_PORT_NTOH(cm->msg.dportx,cm->msg.dport),
+		 cm->hca->ib_trans.ib_cm.mtu);
 
 	dapl_os_lock(&cm->lock);
 	if (cm->state != DCM_INIT && cm->state != DCM_REP_PENDING) {
@@ -1513,6 +1515,8 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
 
 	cm->state = DCM_REP_PENDING;
 	cm->msg.op = htons(DCM_REQ);
+	cm->msg.mtu = cm->hca->ib_trans.ib_cm.mtu; /* local MTU to peer */
+
 	if (ucm_send(&cm->hca->ib_trans, &cm->msg, 
 		     &cm->msg.p_data, ntohs(cm->msg.p_size))) {
 		dapl_os_unlock(&cm->lock);
@@ -1638,6 +1642,10 @@ static void ucm_connect_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
 		cm->ep->param.ep_attr.max_rdma_read_out =
 				DAPL_MIN(cm->ep->param.ep_attr.max_rdma_read_out,
 					 cm->msg.rd_in);
+	/* Set QP MTU, if negotiated. 2K for compatibility */
+	ep->qp_handle->mtu = msg->mtu ?
+			     DAPL_MIN(msg->mtu, cm->hca->ib_trans.ib_cm.mtu):
+			     getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
 
 	/* modify QP to RTR and then to RTS with remote info */
 	dapl_os_lock(&cm->ep->header.lock);
@@ -1671,6 +1679,7 @@ static void ucm_connect_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
 	
 	/* Send RTU, no private data */
 	cm->msg.op = htons(DCM_RTU);
+	cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
 
 	dapl_os_lock(&cm->lock);
 	cm->state = DCM_CONNECTED;
@@ -1760,11 +1769,11 @@ ud_bail:
 	}
 
 	dapl_log(DAPL_DBG_TYPE_CM_EST,
-		 " UCM_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x xevent=%d\n",
+		 " UCM_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x mtu %d\n",
 		 cm->hca, cm->retries, ntohs(cm->msg.saddr.ib.lid),
 		 ntohs(cm->msg.sport), ntohl(cm->msg.saddr.ib.qpn),
 		 ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
-		 ntohl(cm->msg.dqpn), sizeof(DAT_IB_EXTENSION_EVENT_DATA));
+		 ntohl(cm->msg.dqpn), ep->qp_handle->mtu);
 	return;
 bail:
 	if (ntohs(msg->op) != DCM_REJ_USER) {
@@ -1812,6 +1821,7 @@ static void ucm_accept(ib_cm_srvc_handle_t cm, ib_cm_msg_t *msg)
 	acm->msg.p_size = msg->p_size;
 	acm->msg.d_id = msg->s_id;
 	acm->msg.rd_in = msg->rd_in;
+	acm->msg.mtu = msg->mtu; /* save peer MTU */
 
 	/* CR saddr is CM daddr info, need EP for local saddr */
 	dapl_os_memcpy(&acm->msg.daddr, &msg->saddr, sizeof(union dcm_addr));
@@ -1832,14 +1842,15 @@ static void ucm_accept(ib_cm_srvc_handle_t cm, ib_cm_msg_t *msg)
 
 	dapl_log(DAPL_DBG_TYPE_CM,
 		 " accepting: op %s [id lid, port, cqp, iqp]:"
-		 " %d %x %x %x %x <- %d %x %x %x %x\n",
+		 " %d %x %x %x %x <- %d %x %x %x %x mtu %d\n",
 		 dapl_cm_op_str(ntohs(msg->op)),
 		 ntohl(acm->msg.s_id), ntohs(msg->daddr.ib.lid),
 		 UCM_PORT_NTOH(msg->dportx, msg->dport),
 		 ntohl(msg->dqpn), ntohl(msg->daddr.ib.qpn),
 		 ntohl(msg->s_id), ntohs(msg->saddr.ib.lid),
 		 UCM_PORT_NTOH(msg->sportx, msg->sport),
-		 ntohl(msg->sqpn), ntohl(msg->saddr.ib.qpn));
+		 ntohl(msg->sqpn), ntohl(msg->saddr.ib.qpn),
+		 acm->msg.mtu);
 
 #ifdef DAT_EXTENSIONS
 	if (acm->msg.daddr.ib.qp_type == IBV_QPT_UD) {
@@ -1958,13 +1969,13 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
 	}
 
 	dapl_log(DAPL_DBG_TYPE_CM_EST,
-		 " UCM_PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x\n",
+		 " UCM_PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x mtu %d\n",
 		 cm->hca, cm->retries, ntohs(cm->msg.saddr.ib.lid),
 		 UCM_PORT_NTOH(cm->msg.sportx, cm->msg.sport),
 		 ntohl(cm->msg.saddr.ib.qpn),
 		 ntohs(cm->msg.daddr.ib.lid),
 		 UCM_PORT_NTOH(cm->msg.dportx, cm->msg.dport),
-		 ntohl(cm->msg.dqpn));
+		 ntohl(cm->msg.dqpn), cm->ep->qp_handle->mtu);
 	return;
 bail:
 	dapl_log(DAPL_DBG_TYPE_CM_WARN,
@@ -2090,11 +2101,11 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
 
 	dapl_dbg_log(DAPL_DBG_TYPE_CM,
 		     " ACCEPT_USR: s_id %d r_id %d lid=%x"
-		     " iqp=%x qp_type %d, psize=%d\n",
+		     " iqp=%x qp_type %d, psize=%d r_mtu %d l_mtu %d\n",
 		     ntohl(cm->msg.s_id), ntohl(cm->msg.d_id),
 		     ntohs(cm->msg.daddr.ib.lid),
 		     ntohl(cm->msg.daddr.ib.qpn), cm->msg.daddr.ib.qp_type, 
-		     p_size);
+		     p_size, cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu);
 
 #ifdef DAT_EXTENSIONS
 	if (cm->msg.daddr.ib.qp_type == IBV_QPT_UD &&
@@ -2110,6 +2121,10 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
 		ep->param.ep_attr.max_rdma_read_out =
 				DAPL_MIN(ep->param.ep_attr.max_rdma_read_out,
 					 cm->msg.rd_in);
+	/* Set QP MTU, if negotiated. 2K for compatibility */
+	ep->qp_handle->mtu = cm->msg.mtu ?
+			     DAPL_MIN(cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu):
+			     getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
 
 	/* modify QP to RTR and then to RTS with remote info already read */
 	dapl_os_lock(&ep->header.lock);
@@ -2146,6 +2161,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
 	/* setup local QP info and type from EP, copy pdata, for reply */
 	cm->msg.op = htons(DCM_REP);
 	cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+	cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
 	cm->msg.saddr.ib.qpn = htonl(ep->qp_handle->qp->qp_num);
 	cm->msg.saddr.ib.qp_type = ep->qp_handle->qp->qp_type;
 	cm->msg.saddr.ib.lid = cm->hca->ib_trans.addr.ib.lid; 
diff --git a/dapl/openib_ucm/device.c b/dapl/openib_ucm/device.c
index f23c77b..71fee5f 100755
--- a/dapl/openib_ucm/device.c
+++ b/dapl/openib_ucm/device.c
@@ -292,8 +292,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name,
 	    dapl_os_get_env_val("DAPL_HOP_LIMIT", DCM_HOP_LIMIT);
 	hca_ptr->ib_trans.ib_cm.tclass =
 	    dapl_os_get_env_val("DAPL_TCLASS", DCM_TCLASS);
-	hca_ptr->ib_trans.ib_cm.mtu =
-	    dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU));
 
 	if (flags & DAPL_OPEN_QUERY)
 		goto done;
diff --git a/dapl/svc/mcm.c b/dapl/svc/mcm.c
index 4b91090..7be40b8 100755
--- a/dapl/svc/mcm.c
+++ b/dapl/svc/mcm.c
@@ -346,7 +346,9 @@ int mcm_modify_qp(struct ibv_qp	*qp_handle,
 
 		qp_attr.dest_qp_num = ntohl(qpn);
 		qp_attr.rq_psn = 1;
-		qp_attr.path_mtu = m_qp->smd->md->dev_attr.mtu;
+		qp_attr.path_mtu = m_qp->mtu ?
+				   min(m_qp->mtu, m_qp->smd->md->dev_attr.mtu):
+				   m_qp->smd->md->dev_attr.mtu;
 		qp_attr.max_dest_rd_atomic = 16;
 		qp_attr.min_rnr_timer = m_qp->smd->md->dev_attr.rnr_timer;
 		qp_attr.ah_attr.dlid = ntohs(lid);
@@ -1491,6 +1493,7 @@ int mcm_cm_req_out(mcm_cm_t *m_cm)
 	m_cm->state = MCM_REP_PENDING;
 	m_cm->msg.op = htons(MCM_REQ);
 	m_cm->timer = mcm_time_us(); /* reset reply timer */
+	m_cm->msg.mtu = m_cm->smd->md->dev_attr.mtu; /* local MTU to peer */
 
 	if (mcm_send(m_cm->md, &m_cm->msg, &m_cm->msg.p_data, ntohs(m_cm->msg.p_size)))
 		return -1;
@@ -1508,7 +1511,7 @@ int mcm_cm_rtu_out(mcm_cm_t *m_cm)
 
 	MCNTR(m_cm->md, MCM_CM_RTU_OUT);
 
-	mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s -> 0x%x %x 0x%x %Lx %s\n",
+	mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s -> 0x%x %x 0x%x %Lx %s mtu %d\n",
 		m_cm->md->mc->scif_id, m_cm->smd->entry.tid,
 		m_cm->md->cntrs ? (uint32_t)((uint64_t *)m_cm->md->cntrs)[MCM_CM_RTU_OUT]:0,
 		m_cm, htons(m_cm->msg.saddr2.lid), htonl(m_cm->msg.saddr2.qpn),
@@ -1516,7 +1519,8 @@ int mcm_cm_rtu_out(mcm_cm_t *m_cm)
 		htons(m_cm->msg.daddr1.lid),
 		MXF_EP(&m_cm->msg.saddr1) && MXF_EP(&m_cm->msg.daddr1) ?
 				htonl(m_cm->msg.daddr2.qpn):htonl(m_cm->msg.daddr1.qpn),
-		htons(m_cm->msg.dport), ntohll(r_guid), mcm_map_str(m_cm->msg.daddr1.ep_map));
+		htons(m_cm->msg.dport), ntohll(r_guid), mcm_map_str(m_cm->msg.daddr1.ep_map),
+		m_cm->m_qp->mtu);
 
 	mpxy_lock(&m_cm->lock);
 	if (m_cm->state != MCM_REP_RCV) {
diff --git a/dapl/svc/mix.c b/dapl/svc/mix.c
index cb82499..8e4e622 100755
--- a/dapl/svc/mix.c
+++ b/dapl/svc/mix.c
@@ -186,8 +186,6 @@ void mix_scif_accept(scif_epd_t listen_ep)
 	mlog(8, " SCIF client: device open client_pid 0x%x - mlen %d - ep %d\n",
 		ntohl(msg.hdr.req_id), len, op_ep);
 
-	msg.hdr.flags = MIX_OP_RSP;
-
 	if (msg.hdr.ver < MIX_MIN || msg.hdr.ver > MIX_MAX || msg.hdr.op != MIX_IA_OPEN) {
 		mlog(0, " ERR: MIC client incompatible with MPXYD (exp %d,rcvd %d) or OP (exp %d,rcvd %d)\n",
 			DAT_MIX_VER, msg.hdr.ver, msg.hdr.op, MIX_IA_OPEN);
@@ -525,6 +523,27 @@ void m_cq_free(struct mcm_cq *m_cq)
 	free(m_cq);
 }
 
+/* Drain m_cq: consume and ack one CQ event, then poll off and discard every completion; smd->cqlock/cqrlock held */
+void m_cq_flush(struct mcm_cq *m_cq)
+{
+	struct ibv_cq *ib_cq = NULL; /* CQ reported back by ibv_get_cq_event(), otherwise unused */
+	void *cq_ctx; /* CQ context reported back by ibv_get_cq_event(), otherwise unused */
+	int ret, cnt=0; /* ret: last verbs return code, cnt: running sum of ibv_poll_cq() returns */
+	struct ibv_wc wc; /* scratch work completion, contents are discarded */
+
+	mlog(8, " m_cq %p enter:\n", m_cq);
+	ret = ibv_get_cq_event(m_cq->ib_ch, &ib_cq, (void *)&cq_ctx); /* NOTE(review): blocks unless ib_ch is non-blocking - confirm */
+	if (ret == 0)
+		ibv_ack_cq_events(m_cq->ib_cq, 1); /* NOTE(review): acks m_cq->ib_cq, not the returned ib_cq - presumably the same CQ */
+
+	do { /* pull at most one completion per call */
+		ret = ibv_poll_cq(m_cq->ib_cq, 1, &wc);
+		cnt += ret; /* NOTE(review): a final ret < 0 (poll failure) is also added into cnt */
+	}
+	while (ret > 0); /* stop once a poll returns no completion (0) or fails (< 0) */
+	mlog(8, " m_cq %p exit: %d events flushed\n", m_cq, cnt);
+}
+
 /* destroy proxy CQ, fits in header */
 static int mix_cq_destroy(mcm_scif_dev_t *smd, dat_mix_hdr_t *pmsg)
 {
@@ -778,8 +797,26 @@ void m_qp_free(struct mcm_qp *m_qp)
 		mpxy_unlock(&m_qp->smd->qprlock);
 	}
 
-	mlog(8, " m_qp %p m_cm %p cm_id %d\n",
-		m_qp, m_qp->cm, m_qp->cm ? m_qp->cm->entry.tid:0);
+	mlog(8, " m_qp %p m_cm %p cm_id %d cm_state %d\n",
+		m_qp, m_qp->cm, m_qp->cm ? m_qp->cm->entry.tid:0,
+		m_qp->cm ? m_qp->cm->state:0);
+
+	if (m_qp->cm)
+		m_qp->cm->state = MCM_DISCONNECTED;
+
+	mcm_flush_qp(m_qp); /* QP to error, flush consumer messages */
+
+	if (m_qp->m_cq_tx) { /* flush pending PO WRs on cq_tx */
+		mpxy_lock(&m_qp->smd->cqlock);
+		m_cq_flush(m_qp->m_cq_tx);
+		mpxy_unlock(&m_qp->smd->cqlock);
+	}
+
+	if (m_qp->m_cq_rx) { /* flush pending PI WRs on cq_rx */
+		mpxy_lock(&m_qp->smd->cqrlock);
+		m_cq_flush(m_qp->m_cq_rx);
+		mpxy_unlock(&m_qp->smd->cqrlock);
+	}
 
 	if (m_qp->cm) { /* unlink CM, serialized */
 		struct mcm_cm *cm = m_qp->cm;
@@ -791,7 +828,6 @@ void m_qp_free(struct mcm_qp *m_qp)
 		mpxy_unlock(&cm->lock);
 		mcm_dqconn_free(m_qp->smd, cm);
 	}
-	mcm_flush_qp(m_qp); /* move QP to error, flush */
 
 	if (m_qp->ib_qp1) {
 		ibv_destroy_qp(m_qp->ib_qp1);
@@ -812,6 +848,7 @@ void m_qp_free(struct mcm_qp *m_qp)
 	if (m_qp->m_cq_rx) {
 		mpxy_lock(&m_qp->smd->cqrlock);
 		m_cq_free(m_qp->m_cq_rx);
+		m_qp->m_cq_rx = NULL;
 		mpxy_unlock(&m_qp->smd->cqrlock);
 	}
 	mpxy_lock_destroy(&m_qp->txlock); /* proxy out */
@@ -1266,12 +1303,16 @@ void mix_dto_event(struct mcm_cq *m_cq, struct dat_mix_wc *wc, int nc)
 
 		if (msg.wc[i].status != IBV_WC_SUCCESS) {
 			if (msg.wc[i].status  != IBV_WC_WR_FLUSH_ERR) {
-				mlog(0, " [%d:%d] ERROR (ep=%d): cq %p id %d ctx %p stat %d"
-					"  op 0x%x ln %d wr_id %p wc's %d verr 0x%x errno=%d,%s\n",
-					m_cq->smd->md->mc->scif_id, m_cq->smd->entry.tid,
-					m_cq->smd->scif_op_ep, m_cq, msg.cq_id, msg.cq_ctx,
-					msg.wc[i].status, msg.wc[i].opcode, msg.wc[i].byte_len,
-					msg.wc[i].wr_id, msg.wc_cnt, msg.wc[i].vendor_err,
+				mlog(0, " [%d:%d] ERROR (ep=%d): id %d stat %d"
+					" op %x flg %x ln %d wr_id %p wc's %d"
+					" verr 0x%x errno=%d,%s\n",
+					m_cq->smd->md->mc->scif_id,
+					m_cq->smd->entry.tid,
+					m_cq->smd->scif_op_ep, msg.cq_id,
+					msg.wc[i].status, msg.wc[i].opcode,
+					msg.wc[i].wc_flags, msg.wc[i].byte_len,
+					msg.wc[i].wr_id, msg.wc_cnt,
+					msg.wc[i].vendor_err,
 					errno, strerror(errno));
 			}
 		} else {
@@ -1494,6 +1535,7 @@ static int mix_cm_rtu_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
 		 ntohs(m_cm->msg.daddr1.lid), ntohll(m_cm->msg.sys_guid));
 
 	/* send RTU on wire */
+	m_cm->msg.mtu = m_cm->m_qp->mtu; /* send negotiated MTU */
 	mcm_cm_rtu_out(m_cm);
 
 	return 0;
@@ -1598,6 +1640,12 @@ int mix_cm_rep_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
 	else
 		m_cm->m_qp->p2p_data = 0;
 
+	/* Set QP MTU, if negotiated. 2K for compatibility */
+	m_cm->m_qp->mtu = pkt->mtu ?
+			  min(pkt->mtu, m_cm->md->dev_attr.mtu):
+			  m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048;
+	m_cm->msg.mtu = m_cm->m_qp->mtu; /* forward negotiated MTU */
+
 	mlog(2, " WRC: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
 		" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n",
 	     m_cm->m_qp, m_cm->m_qp->wrc.wr_addr, m_cm->m_qp->wrc.wr_rkey,
@@ -1754,6 +1802,7 @@ int mix_cm_req_in(mcm_cm_t *cm, dat_mcm_msg_t *pkt, int pkt_len)
 	acm->msg.p_size = pkt->p_size;
 	acm->msg.d_id = pkt->s_id;
 	acm->msg.rd_in = pkt->rd_in;
+	acm->msg.mtu = pkt->mtu;
 #ifdef MPXYD_LOCAL_SUPPORT
 	acm->msg.sys_guid = pkt->sys_guid; /* remote system guid */;
 #else
@@ -1765,13 +1814,14 @@ int mix_cm_req_in(mcm_cm_t *cm, dat_mcm_msg_t *pkt, int pkt_len)
 	memcpy(&acm->msg.daddr1, &pkt->saddr1, sizeof(dat_mcm_addr_t));
 	memcpy(&acm->msg.daddr2, &pkt->saddr2, sizeof(dat_mcm_addr_t));
 
-	mlog(2, " [%d:%d] cm %p ep %d sPORT %x %s <- dPORT %x lid=%x psz=%d %s %s %Lx (msg %p %d)\n",
+	mlog(2, " [%d:%d] cm %p ep %d: %x %s <- %x lid=%x psz=%d %s %s %Lx (%p %d) lmtu %d rmtu %d\n",
 		 cm->md->mc->scif_id, cm->smd->entry.tid, acm, acm->smd->scif_ev_ep,
 		 ntohs(acm->msg.sport), mcm_map_str(acm->md->addr.ep_map),
 		 ntohs(acm->msg.dport), ntohs(acm->msg.daddr1.lid), htons(acm->msg.p_size),
 		 mcm_map_str(acm->msg.daddr2.ep_map),
 		 acm->md->addr.lid == acm->msg.daddr1.lid ? "platform":"fabric",
-		 ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t));
+		 ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t),
+		 cm->md->dev_attr.mtu, pkt->mtu);
 
 	if (pkt->p_size)
 		memcpy(acm->msg.p_data, pkt->p_data, ntohs(pkt->p_size));
@@ -1806,7 +1856,7 @@ int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
 	dat_mix_cm_t msg;
 	int len;
 
-	mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s <- 0x%x %x 0x%x %Lx %s\n",
+	mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s <- 0x%x %x 0x%x %Lx %s mtu %d\n",
 		m_cm->md->mc->scif_id, m_cm->smd->entry.tid,
 		m_cm->md->cntrs ? (uint32_t)((uint64_t *)m_cm->md->cntrs)[MCM_CM_RTU_IN]:0,
 		m_cm, htons(pkt->daddr1.lid),
@@ -1814,7 +1864,8 @@ int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
 			htonl(m_cm->msg.daddr2.qpn):htonl(m_cm->msg.daddr1.qpn),
 		htons(pkt->dport), system_guid, mcm_map_str(pkt->daddr1.ep_map),
 		htons(pkt->saddr2.lid), htonl(pkt->saddr2.qpn),
-		htons(pkt->sport), ntohll(pkt->sys_guid), mcm_map_str(pkt->saddr2.ep_map));
+		htons(pkt->sport), ntohll(pkt->sys_guid), mcm_map_str(pkt->saddr2.ep_map),
+		m_cm->m_qp->mtu);
 
 	/* MXF_EP <- HST_EP, host sends WC on RTU, save WRC info */
 	if (MXF_EP(&pkt->daddr1) && HST_EP(&pkt->saddr2)) {
@@ -2056,6 +2107,11 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
 	m_cm->msg.sys_guid = rand();
 #endif
 
+	/* Set QP MTU, if negotiated. 2K for compatibility */
+	m_cm->m_qp->mtu = m_cm->msg.mtu ?
+			  min(m_cm->msg.mtu, m_cm->md->dev_attr.mtu):
+			  m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048;
+
 	if (qp) {
 		if (mcm_modify_qp(qp, IBV_QPS_RTR, dqpn, dlid, dgid))
 			goto err;
@@ -2071,8 +2127,9 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
 			goto err;
 	}
 
-	/* send RTU on wire, monitor for retries */
+	/* send REP on wire, monitor for retries */
 	m_cm->state = MCM_RTU_PENDING;
+	m_cm->msg.mtu = m_cm->m_qp->mtu; /* send negotiated MTU */
 	mpxy_unlock(&m_cm->lock);
 	mcm_cm_rep_out(m_cm);
 	return 0;
@@ -2183,6 +2240,7 @@ static int mix_proxy_out(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, mcm_qp_t *m_qp
 	mpxy_lock(&m_qp->txlock);
 	if (((m_qp->wr_hd + 1) & m_qp->wr_end) == m_qp->wr_tl) { /* full */
 		ret = ENOMEM;
+		mlog(0, " ERR: WR full hd %d tl %d\n", m_qp->wr_hd, m_qp->wr_tl);
 		goto bail;
 	}
 	m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wr_end; /* move hd */
diff --git a/dapl/svc/mpxy_in.c b/dapl/svc/mpxy_in.c
index 54cc62a..bb48c69 100755
--- a/dapl/svc/mpxy_in.c
+++ b/dapl/svc/mpxy_in.c
@@ -476,11 +476,10 @@ static int m_pi_send_wc(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx, int status
 	int wc_idx, ret;
 
 	mlog(0x10,"[%d:%d:%d] WC_rem: wr_rx[%d] %p wc_hd %d flgs %x WR_r tl %d-%d"
-		  " wt %d hd %d wr_id %Lx org_id %Lx\n",
+		  " wt %d hd %d oid %Lx st %d\n",
 		m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
 		wr_rx->w_idx, wr_rx, m_qp->wc_hd_rem, wr_rx->flags, m_qp->wr_tl_r,
-		wr_rx->w_idx, m_qp->wr_tl_r_wt, m_qp->wr_hd_r, wr_rx->wr.wr_id,
-		wr_rx->org_id);
+		wr_rx->w_idx, m_qp->wr_tl_r_wt, m_qp->wr_hd_r, wr_rx->org_id, status);
 
 	/* local WR and remote WR are serialized, should never reach tail of remote WR */
 	if (((m_qp->wc_hd_rem + 1) & m_qp->wrc.wc_end) == m_qp->wc_tl_rem) {
@@ -576,6 +575,13 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig)
 	while (m_qp->pi_rr_cnt) { /* RR's pending */
 		wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));
 
+		/* SCIF sync required on IB RW, multiple SCIF writes are not ordered */
+		if (m_qp->post_cnt_wt &&
+		    (wr_rx->flags & (M_SEND_FS|M_SEND_LS)) &&
+		    (!(wr_rx->flags & (M_READ_WRITE_TO_DONE|M_READ_WRITE_TO)))) {
+			break;
+		}
+
 		if (!(wr_rx->flags & M_READ_DONE)) {
 			/* reached head pointer */
 			if (wr_idx == m_qp->wr_hd_r)
@@ -675,8 +681,18 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig)
 			ret = scif_writeto(smd->scif_tx_ep, l_off, w_len, r_off, wt_flag);
 
 			if (ret) {
-				mlog(0, " ERR: scif_sendto, ret %d err: %d %s\n",
+				mlog(0," [%d:%d:%d] ERR: scif_sendto, ret %d err: %d %s\n",
+					smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid,
 					ret, errno, strerror(errno));
+				mlog(0," PI: wc %d rr %d stall %d wt %d\n",
+				     m_qp->pi_rw_cnt, m_qp->pi_rr_cnt,
+				     m_qp->stall_cnt_rr, m_qp->post_cnt_wt);
+				mlog(0," PO: wr %d wr_rem %d pst_sig %d cmp_sig %d\n",
+				     m_qp->wr_pp, m_qp->wr_pp_rem,
+				     m_qp->post_sig_cnt, m_qp->comp_cnt);
+				mlog(0, " WR_rx[%d] %p l_o %Lx r_o %Lx rb 0x%x-0x%x ln %d id %Lx tl %d hd %d\n",
+					wr_rx->w_idx, wr_rx, l_off, r_off, l_start, l_end, w_len, wr_rx->org_id,
+					m_qp->wr_tl_r, m_qp->wr_hd_r);
 				goto bail;
 			}
 			MCNTR(smd->md, MCM_SCIF_WRITE_TO);
@@ -719,14 +735,22 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig)
 	}
 	return;
 bail:
-	/* report error via WC back to proxy-out */
-	mlog(0, " ERR: writeto: wr_rx[%d] %p -> IB raddr %Lx rkey %x"
-		" SCIF r_off %Lx, len %d wr_flags %x wt_pend %d\n",
-		wr_rx->w_idx, wr_rx, wr_rx->wr.wr.rdma.remote_addr,
-		wr_rx->wr.wr.rdma.rkey, r_off, sg_len, wr_rx->flags,
-		m_qp->post_cnt_wt);
-
-	m_pi_send_wc(m_qp, wr_rx, IBV_WC_REM_ACCESS_ERR);
+	/* report error via WC back to proxy-out, all pending WRs */
+	wr_idx = m_qp->wr_tl_r_wt;
+	do {
+		wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));
+
+		mlog(0, " ERR: wr_rx[%d] %p -> IB raddr %Lx %x"
+			" SCIF r_o %Lx, ln %d fl %x wt_pnd %d\n",
+			wr_rx->w_idx, wr_rx, wr_rx->wr.wr.rdma.remote_addr,
+			wr_rx->wr.wr.rdma.rkey, r_off, sg_len, wr_rx->flags,
+			m_qp->post_cnt_wt);
+
+		m_pi_send_wc(m_qp, wr_rx, IBV_WC_REM_ACCESS_ERR);
+		wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
+
+	} while (wr_idx != m_qp->wr_hd_r);
+
 	return;
 }
 
@@ -1164,6 +1188,17 @@ retry:
 	goto retry;
 }
 
+/* Re-drive scif_writeto DMAs that were held back, for ordering, behind earlier WT completions; takes m_qp->rxlock */
+void m_pi_pending_wt(struct mcm_qp *m_qp)
+{
+	struct mcm_wr_rx *wr_rx; /* entry at index wr_hd_r in the wrc.wr_addr array (wr_sz bytes per entry) */
+
+	mpxy_lock(&m_qp->rxlock); /* rxlock is held across the whole post call */
+	wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * m_qp->wr_hd_r));
+	m_pi_post_writeto(m_qp, wr_rx); /* wr_rx is passed as that function's wr_sig argument */
+	mpxy_unlock(&m_qp->rxlock);
+}
+
 /*
  * Pending Proxy-in services for RDMA Writes from remote peer
  *
diff --git a/dapl/svc/mpxy_out.c b/dapl/svc/mpxy_out.c
index d015dc3..5c82703 100755
--- a/dapl/svc/mpxy_out.c
+++ b/dapl/svc/mpxy_out.c
@@ -487,6 +487,23 @@ void m_po_pending_wr(struct mcm_qp *m_qp, int *data)
 				else
 					wc.wc_flags = 0;
 				wc.vendor_err = ret;
+				if (ret) {
+					mlog(0, "[%d:%d:%d] ERR %s_RW_post: WR[%d] wr_id %p flgs 0x%x,"
+						" pcnt %d sg_rate %d hd %d tl %d sz %d m_idx %x\n",
+						m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid,
+						m_qp->r_entry.tid,
+						(MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
+						m_wr->w_idx, m_wr->wr.wr_id, m_wr->wr.send_flags,
+						m_qp->post_cnt,	mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
+						m_wr->wr.sg_list->length, m_wr->m_idx);
+					mlog(0, "[%d:%d:%d] ERR wr_id %Lx next %p sglist %p sge %d op %d flgs"
+						" %d idata 0x%x raddr %p rkey %x \n",
+						m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid,
+						m_qp->r_entry.tid, m_wr->wr.wr_id, m_wr->wr.next,
+						m_wr->wr.sg_list, m_wr->wr.num_sge, m_wr->wr.opcode,
+						m_wr->wr.send_flags, m_wr->wr.imm_data,
+						m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey);
+				}
 				mix_dto_event(m_qp->ib_qp2->send_cq->cq_context, &wc, 1);
 			}
 
@@ -565,7 +582,7 @@ int m_po_proxy_data(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, struct mcm_qp *m_qp
 	off_t l_off, r_off;
 	uint64_t total_offset;
 	int  l_start, l_end, l_len, cacheln_off, seg_len;
-	struct mcm_wr *m_wr;
+	struct mcm_wr *m_wr = NULL;
 	struct ibv_sge *m_sge;
 
 	mlog(4, " q_id %d, q_ctx %p, len %d, wr_id %p, sge %d, op %x flgs %x wr_idx %d\n",
@@ -609,7 +626,7 @@ int m_po_proxy_data(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, struct mcm_qp *m_qp
 		}
 		write(smd->md->mc->tx_pipe[1], "w", sizeof("w"));
 		mpxy_unlock(&m_qp->txlock);
-		sched_yield();
+		sleep_usec(1000);
 		mpxy_lock(&m_qp->txlock);
 	}
 	if (retries) {
@@ -803,8 +820,6 @@ retry_mr:
 					mlog(0x10, "[%d:%d:%d] %s_RF_post_sig: WR[%d] qp %p wr_id %p flgs 0x%x,"
 						" sg_rate %d hd %d tl %d sz %d m_idx %x\n",
 						m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
-						m_qp, m_wr, pmsg->wr.wr_id, m_wr->wr.send_flags,
-						m_qp->post_cnt,	mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
 						(MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
 						m_wr->w_idx, m_qp, pmsg->wr.wr_id, m_wr->wr.send_flags,
 						mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
@@ -919,6 +934,18 @@ bail:
 		else
 			wc.wc_flags = 0;
 		wc.vendor_err = ret;
+
+		mlog(0, "[%d:%d:%d] ERR %s_RF_post: WR[%d] qp %p wr_id %p, "
+			" post %d hd %d tl %d sz %d \n",
+			m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
+			(MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
+			m_wr ? m_wr->w_idx:0, m_qp, m_wr, pmsg->wr.wr_id,
+			m_qp->post_cnt,	m_qp->wr_hd, m_qp->wr_tl, wc.byte_len);
+		mlog(0, "[%d:%d:%d] ERR m_wr: raddr %Lx rkey 0x%x, ib_wr: raddr %Lx rkey 0x%x\n",
+			m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
+			pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.rkey,
+			m_wr ? m_wr->wr.wr.rdma.remote_addr:0, m_wr ? m_wr->wr.wr.rdma.rkey:0);
+
 		mix_dto_event(m_qp->ib_qp2->send_cq->cq_context, &wc, 1);
 	}
 
diff --git a/dapl/svc/mpxyd.c b/dapl/svc/mpxyd.c
index 922eeae..668efa5 100755
--- a/dapl/svc/mpxyd.c
+++ b/dapl/svc/mpxyd.c
@@ -799,6 +799,10 @@ found:
 		msg->dev_addr.lid = md->m_lid;
 		memcpy(msg->dev_addr.gid, md->m_gid, 16);
 	}
+
+	/* MTU changed via DAPL_IB_MTU */
+	if (msg->hdr.flags & MIX_OP_MTU)
+		md->mtu_env = md->dev_attr.mtu;
 err:
 	if (!smd) {
 		mlog(1, " WARN: open failed for %s - %d\n", msg->name, msg->port);
@@ -806,6 +810,7 @@ err:
 	}
 
 	/* send back response */
+	msg->hdr.flags = MIX_OP_RSP;
 	ret = scif_send_msg(op_ep, (void*)msg, sizeof(dat_mix_open_t));
 	if (ret) {
 		mlog(0, " ERR: scif_send dev_id %d op_ep %d, closing device %p\n",
@@ -817,9 +822,10 @@ err:
 		goto bail;
 	}
 
-	mlog(1, " MIC client: mdev[%d] %p smd %p mic%d[%d] -> %s[%d] port %d lid %x %s\n",
+	mlog(1, " MIC client: mdev[%d] %p->%p mic%d[%d] -> %s[%d] port %d lid %x %s mtu %d (%d)\n",
 		md->smd_list.tid, md, smd, mc->scif_id-1, mc->numa_node, msg->name,
-		md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map));
+		md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map),
+		md->dev_attr.mtu, md->mtu_env);
 bail:
 	mpxy_unlock(&mc->oplock);
 	mpxy_unlock(&mc->cmlock);
@@ -1187,6 +1193,7 @@ void mpxy_rx_thread(void *mic_client)
 				m_qp = get_head_entry(&smd->qprlist);
 				while (m_qp) {
 					m_pi_pending_wr(m_qp, &data); /* RR's and scif_sendto */
+					m_pi_pending_wt(m_qp); /* WT's pending */
 					m_qp = get_next_entry(&m_qp->r_entry, &smd->qprlist);
 				}
 				mpxy_unlock(&smd->qprlock);
@@ -1445,8 +1452,7 @@ void mcm_dat_dev_log(struct mcm_scif_dev *smd)
 
 	/* show PO mbuf_wc busy slots */
 	idx = smd->m_buf_tl;
-	while ((smd->m_buf_tl != smd->m_buf_hd) &&
-	       (smd->m_buf_hd - smd->m_buf_tl)) {
+	while (idx != smd->m_buf_hd) {
 		if ((smd->m_buf_wc[idx].m_idx && !smd->m_buf_wc[idx].done) || 1) {
 			struct mcm_wr *m_wr = NULL;
 			struct mcm_qp *m_qp = NULL;
@@ -1468,8 +1474,6 @@ void mcm_dat_dev_log(struct mcm_scif_dev *smd)
 				smd->m_buf_wc[idx].tl, smd->m_buf_wc[idx].hd);
 		}
 		idx = (idx + 1) & smd->m_buf_end;
-		if (idx == (smd->m_buf_hd+2))
-			break;
 	}
 
 	/* show PI mbuf_wc busy slots, start from tail */
diff --git a/dapl/svc/mpxyd.h b/dapl/svc/mpxyd.h
index c733157..ec31cc0 100755
--- a/dapl/svc/mpxyd.h
+++ b/dapl/svc/mpxyd.h
@@ -132,6 +132,7 @@ typedef struct mcm_ib_dev {
 	int 			numa_node;
 	int			indata;
 	void			*cntrs;
+	uint8_t			mtu_env;
 
 } mcm_ib_dev_t;
 
@@ -244,6 +245,7 @@ typedef struct mcm_qp {
 	int 			sr_len;		/* SR WR buffer pool len */
 	int			sr_sz;		/* SR WR entry size */
 	int			post_sr;
+	uint8_t			mtu;		/* negotiated QP MTU */
 #ifdef MCM_PROFILE
 	mcm_qp_prof_t		ts;
 	uint32_t		last_wr_sig;
@@ -613,6 +615,7 @@ int m_pi_create_sr_q(struct mcm_qp *m_qp, int entries);
 int m_pi_create_bpool(struct mcm_qp *m_qp, int max_recv_wr);
 void m_qp_destroy_pi(struct mcm_qp *m_qp);
 int m_qp_create_pi(mcm_scif_dev_t *smd, struct mcm_qp *m_qp);
+void m_pi_pending_wt(struct mcm_qp *m_qp);
 void m_pi_pending_wr(struct mcm_qp *m_qp, int *data);
 void m_pi_pending_wc(struct mcm_qp *m_qp, int *events);
 void m_pi_req_event(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx, struct ibv_wc *wc, int type);
diff --git a/doc/dat.conf b/doc/dat.conf
index c3794e7..c868814 100755
--- a/doc/dat.conf
+++ b/doc/dat.conf
@@ -70,3 +70,11 @@ ofa-v2-qib0-1m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "qib0 1" ""
 ofa-v2-qib0-2m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "qib0 2" ""
 ofa-v2-qib1-1m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "qib1 1" ""
 ofa-v2-qib1-2m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "qib1 2" ""
+ofa-v2-hfi1_0-1s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "hfi1_0 1" ""
+ofa-v2-hfi1_0-2s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "hfi1_0 2" ""
+ofa-v2-hfi1_1-1s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "hfi1_1 1" ""
+ofa-v2-hfi1_1-2s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "hfi1_1 2" ""
+ofa-v2-hfi1_0-1m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "hfi1_0 1" ""
+ofa-v2-hfi1_0-2m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "hfi1_0 2" ""
+ofa-v2-hfi1_1-1m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "hfi1_1 1" ""
+ofa-v2-hfi1_1-2m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "hfi1_1 2" ""
\ No newline at end of file
diff --git a/doc/mpxyd.conf b/doc/mpxyd.conf
index e5d6d5b..f3fd722 100755
--- a/doc/mpxyd.conf
+++ b/doc/mpxyd.conf
@@ -58,7 +58,7 @@ scif_listen_qlen 240
 # 
 # The default is 1 
 
-mcm_affinity 1
+mcm_affinity 2
 
 # mcm_affinity_base_mic:
 # Specifies a hard binding for CPU id base value used for affinity support of
diff --git a/test/dtest/dtest.c b/test/dtest/dtest.c
index 6894a2c..e61e000 100755
--- a/test/dtest/dtest.c
+++ b/test/dtest/dtest.c
@@ -197,7 +197,7 @@ struct dt_time {
 struct dt_time ts;
 
 /* defaults */
-static int all_data = 0;
+static int all_data_sizes = 0;
 static int increment = 0;
 static int failed = 0;
 static int uni_direction = 0;
@@ -228,6 +228,7 @@ static int burst_msg_index = 0;
 static int ucm = 0;
 static int rq_cnt, sq_cnt;
 static DAT_SOCK_ADDR6 remote;
+static int data_check = 0;
 
 /* forward prototypes */
 const char *DT_RetToStr(DAT_RETURN ret_value);
@@ -566,12 +567,12 @@ int main(int argc, char **argv)
 	DAT_PROVIDER_ATTR pr_attr;
 
 	/* parse arguments */
-	while ((c = getopt(argc, argv, "auwWtscvpb:d:B:h:P:S:i:")) != -1) {
+	while ((c = getopt(argc, argv, "UDauwWtscvpb:d:B:h:P:S:i:")) != -1) {
 		switch (c) {
 		case 'i':
 			increment = atoi(optarg);
 		case 'a':
-			all_data = 1;
+			all_data_sizes = 1;
 			fflush(stdout);
 			break;
 		case 'u':
@@ -582,6 +583,10 @@ int main(int argc, char **argv)
 			write_only = 1;
 			fflush(stdout);
 			break;
+		case 'D':
+			data_check = 1;
+			printf("%d Running DATA CHECK mode\n", getpid());
+			/* fall through */
 		case 'W':
 			write_only_pp = 1;
 			uni_direction = 1;
@@ -631,12 +636,24 @@ int main(int argc, char **argv)
 		case 'S':
 			signal_rate = atoi(optarg);
 			break;
+		case 'U':
+			/* fall through */
 		default:
 			print_usage();
 			exit(-12);
 		}
 	}
 
+	if (all_data_sizes && !write_only_pp) {
+		printf("\n\t -a option only valid with -W option\n\n");
+		exit(-12);
+	}
+
+	if (data_check && strstr(provider, "scif")) {
+		printf("\n\t -D option is not valid with scif provider\n\n");
+		exit(-12);
+	}
+
 #if defined(_WIN32) || defined(_WIN64)
 	{
 		WSADATA wsaData;
@@ -680,8 +697,9 @@ int main(int argc, char **argv)
 
 	if (write_only_pp) {
 		/* rdma write pingpong, default == 1 byte */
-		if (!all_data) {
-			buf_len = 1;
+		if (!all_data_sizes) {
+			if (!data_check)
+				buf_len = 1;
 		} else if (!increment) { /* power of 2 */
 			buf_len_p2 = 1;
 			i = 0;
@@ -885,7 +903,7 @@ int main(int argc, char **argv)
 	if (write_only_pp) {
 		int max, inc;
 
-		if (all_data) {
+		if (all_data_sizes) {
 			if (increment) {
 				i = 1;
 				inc = increment;
@@ -896,14 +914,44 @@ int main(int argc, char **argv)
 				max = buf_len_p2;
 			}
 		} else {
-			i = buf_len;
-			max = buf_len;
-			inc = buf_len;
+			if (data_check) {
+				i = buf_len;
+				max = buf_len;
+				inc = 1;
+			}
+			else
+			{
+				i = buf_len;
+				max = buf_len;
+				inc = buf_len;
+			}
 		}
-		printf("\n %d RDMA WRITE PINGPONG\n\n", getpid());
+		printf("\n %d RDMA WRITE PINGPONG %s\n\n", getpid(),
+			data_check ? "with DATA CHECK":"");
+
 		for (; i <= max; i++) {
-			if (do_rdma_write_ping_pong(i, i*inc))
-				break;
+			if (all_data_sizes) {
+				int l_len = (i*inc) ? (i*inc) : 1 << i;
+
+				if ( l_len > 4 && do_rdma_write_ping_pong(i, l_len - 1)) {
+					fprintf(stderr, "%d Error do_rdma_write_ping_pong\n", getpid());
+					goto cleanup;
+				}
+			}
+
+			if (do_rdma_write_ping_pong(i, i*inc)) {
+				fprintf(stderr, "%d Error do_rdma_write_ping_pong\n", getpid());
+				goto cleanup;
+			}
+
+			if (all_data_sizes) {
+				int l_len = (i*inc) ? (i*inc) : 1 << i;
+
+				if ( l_len > 1 && l_len < buf_len && do_rdma_write_ping_pong(i, l_len + 1)) {
+					fprintf(stderr, "%d Error do_rdma_write_ping_pong\n", getpid());
+					goto cleanup;
+				}
+			}
 		}
 	}
 	else if (write_immed && write_only) {
@@ -1021,7 +1069,7 @@ complete:
 	free(rbuf);
 	free(sbuf);
 
-	if (!all_data) {
+	if (ts.rtt && !all_data_sizes) {
 		printf( "%d: %s PingPong: (%d x %d) Total %6.2lf us:"
 			" latency %3.2lf us, BW %4.2lf MB/s\n",
 			getpid(), write_only_pp ? "RDMA write":"Message",
@@ -1992,6 +2040,54 @@ acked:
 	return (DAT_SUCCESS);
 }
 
+#define PAT_NUM 5
+unsigned char pat[PAT_NUM] = { 0, 0xff, 0x55, 0xaa, 0 };
+
+void set_pat(unsigned int len, unsigned int pat_num)
+{
+	if (len <= 1)
+		return;
+
+	if (pat_num >= PAT_NUM) {
+		printf("\n\tpat_num = %d. max valid number is %d.\n\n", pat_num, PAT_NUM - 1);
+		exit(1);
+	}
+
+	if (server) {
+		/* server */
+		if (pat_num == PAT_NUM - 1) {
+			/* future: random data, add checksum */
+			;
+		} else {
+			/* check first byte only for some speed */
+			if ((unsigned char)rbuf[0] != (unsigned char)pat[pat_num]) {
+				fprintf(stderr,"%d: ERR: message len is %d,"
+						" location 0. Rx 0x%x expected"
+						" 0x%x, pat %d\n",
+						getpid(), len, (unsigned char)rbuf[0],
+						(unsigned char)pat[pat_num], pat_num);
+			}
+		}
+		memcpy(sbuf, rbuf, len - 1);
+
+	} else {
+		/* client */
+		int i;
+
+		if (pat_num == PAT_NUM - 1) { /* set random values */
+			struct timeval tv;
+
+			gettimeofday(&tv, NULL);
+			srand((unsigned int)tv.tv_usec);
+			for (i = 0; i < len - 1; i++)
+				sbuf[i] = (unsigned char)rand();
+		} else {
+			memset(sbuf, (unsigned char)pat[pat_num], len - 1);
+		}
+	}
+}
+
+
 /* always uni-direction */
 DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
 {
@@ -2006,6 +2102,7 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
 	volatile char *tx_buf, *rx_buf;
 	uint32_t rx_cnt = 0;
 	uint32_t tx_cnt = 0;
+	unsigned char rx_idx = 0;
 
 	len = bytes ? bytes : 1 << p2;
 
@@ -2030,6 +2127,34 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
 		if (rx_cnt < burst && !(!server && !tx_cnt)) {
 			rx_cnt++;
 			while (*rx_buf != (char)rx_cnt);
+			rx_idx = (unsigned char)*rx_buf;
+
+			if (data_check && !server && memcmp(sbuf, rbuf, len)) {
+				int l=0, ll;
+				fprintf(stderr, "%d: ERR: Tx data from server wrong\n", getpid());
+
+				while (sbuf[l] == rbuf[l] && l < len)
+					l++;
+
+				fprintf(stderr,"%d: len %d, 1st error at %d. Tx 0x%x Rx 0x%x\n",
+						getpid(), len, l, (unsigned char)sbuf[l],
+						(unsigned char)rbuf[l]);
+				fprintf(stderr,"%d: rcnt %d (char = %d), tcnt %d, *rbuf %d\n",
+						getpid(), rx_cnt, (char)rx_cnt, tx_cnt,
+						(unsigned char)*rx_buf);
+				fprintf(stderr, "Send:");
+
+				for (ll=l; ll < len && ll < 1 + 64; ll++)
+					fprintf(stderr,"%02x", (unsigned char)sbuf[ll]);
+
+				fprintf(stderr, "\nRecv:");
+
+				for (ll=l; ll < len && ll < 1 + 64; ll++)
+					fprintf(stderr,"%02x", (unsigned char)rbuf[ll]);
+
+				fprintf(stderr, "\n");
+				return (DAT_ABORT);
+			}
 		}
 
 		if (!((i+1) % signal_rate))
@@ -2040,6 +2165,9 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
 		if (tx_cnt == burst)
 			break;
 
+		if (data_check)
+			set_pat(len, tx_cnt % PAT_NUM);
+
 		*tx_buf = (char)++tx_cnt;
 		cookie.as_64 = tx_cnt;
 		ret = dat_ep_post_rdma_write(h_ep, MSG_IOV_COUNT,
@@ -2069,7 +2197,7 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
 	stop = get_time();
 	ts.rtt = ((stop - start) * 1.0e6);
 
-	if ((unsigned char)*rx_buf != (unsigned char)rx_cnt) {
+	if (rx_idx != (unsigned char)rx_cnt) {
 		printf( "%d %s RW pingpong: %p, last *buf %d != cnt %d\n",
 			getpid(), server ? "SERVER:" : "CLIENT:",
 			rx_buf, (unsigned char)*rx_buf,
@@ -2077,7 +2205,7 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
 		return (DAT_ABORT);
 	}
 
-	if (all_data) {
+	if (all_data_sizes) {
 		printf( "%d: RDMA write PingPong: (%d x %d) Total %6.2lf us:"
 			" latency %3.2lf us, BW %4.2lf MB/s\n",
 			getpid(), burst, len, ts.rtt, ts.rtt/burst/2,
@@ -2773,6 +2901,7 @@ void print_usage(void)
 	printf("u: unidirectional bandwidth (default=bidirectional\n");
 	printf("w: rdma write only, streaming\n");
 	printf("W: rdma write only, ping pong\n");
+	printf("D: validate data in ping pong test\n");
 	printf("t: performance times\n");
 	printf("c: use cno\n");
 	printf("a: all data sizes with rdma write pingpong \n");
@@ -2785,6 +2914,7 @@ void print_usage(void)
 	printf("h: hostname/address of server, specified on client\n");
 	printf("P: provider name (default = ofa-v2-mlx4_0-1u)\n");
 	printf("S: signal_rate (default=10, completion every 10 iterations\n");
+	printf("U: print this Usage page\n");
 	printf("\n");
 }
 
diff --git a/test/dtest/dtestx.c b/test/dtest/dtestx.c
index 931c860..a5693d8 100755
--- a/test/dtest/dtestx.c
+++ b/test/dtest/dtestx.c
@@ -180,6 +180,7 @@ int eps = 1;
 int verbose = 0;
 int counters = 0;
 int counters_ok = 0;
+int query_only = 0;
 static int ucm = 0;
 static DAT_SOCK_ADDR6 remote;
 static DAT_IA_ATTR ia_attr;
@@ -1549,7 +1550,7 @@ int main(int argc, char **argv)
 	DAT_RETURN status;
 
 	/* parse arguments */
-	while ((rc = getopt(argc, argv, "csvumpU:h:b:P:")) != -1) {
+	while ((rc = getopt(argc, argv, "qcsvumpU:h:b:P:")) != -1) {
 		switch (rc) {
 		case 'u':
 			ud_test = 1;
@@ -1584,6 +1585,9 @@ int main(int argc, char **argv)
 		case 'v':
 			verbose = 1;
 			break;
+		case 'q':
+			query_only = 1;
+			break;
 		default:
 			print_usage();
 			exit(-12);
@@ -1603,6 +1607,29 @@ int main(int argc, char **argv)
 		}
 	}
 #endif
+	if (query_only) {
+		memset(&ia_attr, 0, sizeof(ia_attr));
+		memset(&prov_attrs, 0, sizeof(prov_attrs));
+		status = dat_ib_open_query(provider, &ia,
+					   DAT_IA_FIELD_ALL, &ia_attr,
+					   DAT_PROVIDER_FIELD_ALL, &prov_attrs);
+		_OK(status, "dat_ib_open_query");
+
+		print_ia_address(ia_attr.ia_address_ptr);
+		printf(" Open_Query: %s num_attrs = %d\n",
+			provider, prov_attrs.num_provider_specific_attr);
+		/* Print provider specific attributes */
+		for (i = 0; i < prov_attrs.num_provider_specific_attr; i++) {
+			printf(" Open_Query: Provider Specific Attribute[%d] %s=%s\n",
+				  i, prov_attrs.provider_specific_attr[i].name,
+				  prov_attrs.provider_specific_attr[i].value);
+		}
+
+		status = dat_ib_close_query(ia);
+		_OK(status, "dat_ib_close_query");
+		exit(0);
+	}
+
 	status = dat_ia_open(provider, 8, &async_evd, &ia);
 	_OK(status, "dat_ia_open");
 
diff --git a/test/dtest/scripts/dtest_suite.sh b/test/dtest/scripts/dtest_suite.sh
new file mode 100755
index 0000000..d6a6713
--- /dev/null
+++ b/test/dtest/scripts/dtest_suite.sh
@@ -0,0 +1,1117 @@
+#!/bin/bash
+#
+# Copyright (c) 2016 Intel Corporation.  All rights reserved.
+#
+# This Software is licensed under one of the following licenses:
+#
+# 1) under the terms of the "Common Public License 1.0" a copy of which is
+#    in the file LICENSE.txt in the root directory. The license is also
+#    available from the Open Source Initiative, see
+#    http://www.opensource.org/licenses/cpl.php.
+#
+# 2) under the terms of the "The BSD License" a copy of which is in the file
+#    LICENSE2.txt in the root directory. The license is also available from
+#    the Open Source Initiative, see
+#    http://www.opensource.org/licenses/bsd-license.php.
+#
+# 3) under the terms of the "GNU General Public License (GPL) Version 2" a 
+#    copy of which is in the file LICENSE3.txt in the root directory. The 
+#    license is also available from the Open Source Initiative, see
+#    http://www.opensource.org/licenses/gpl-license.php.
+#
+# Licensee has the right to choose one of the above licenses.
+#
+# Redistributions of source code must retain the above copyright
+# notice and one of the license notices.
+#
+# Redistributions in binary form must reproduce both the above copyright
+# notice, one of the license notices in the documentation
+# and/or other materials provided with the distribution.
+#
+# Test Suite to test uDAPL Providers and CCL Proxy on MICs and Hosts
+#
+# Sample Usage, all providers, one loop, fast: 
+#
+#    ./dtest_suite.sh -P ALL -l 1 -f
+#
+
+### --- user input section --- ###
+server_list="cst-kc1 cst-kc1-mic0 cst-kc1-mic1"
+client_list="cst-kc2 cst-kc2-mic0 cst-kc2-mic1 cst-kc1 cst-kc1-mic0 cst-kc1-mic1"
+### ---  dtest test cases fine tune zone --- ###
+# Note: a value of zero indicates dtest will use the test default value
+b_options="0 1 4096"
+u_options="0 1"
+w_options="0 1"
+S_options="0 9"
+B_options="0 1"
+D_options="0 1"
+W_options="0"
+# test defaults
+def_provider="ofa-v2-mlx4_0-1u"
+dat_conf="/etc/dat.conf"
+### --- End of user input section  --- ###
+
+script_version="1.05"
+
+# History log
+# 1.05 - Disable data validation mode when using scif provider
+#        From: Amir Hanania <amir.hanania at intel.com>
+# 1.04 - Add data validation for dtest ping pong
+#        Add option not to use CPU mask in performance test
+#        From: Amir Hanania <amir.hanania at intel.com>
+# 1.03 - Add dapl tests
+#        From: Amir Hanania <amir.hanania at intel.com>
+# 1.02 - Change performance test to use dtest -W case for latency.
+#        Note: You must have a dtest version that supports -W to run the performance test.
+#        From: Amir Hanania <amir.hanania at intel.com>
+# 1.01 - Add multi provider test
+#        From: Amir Hanania <amir.hanania at intel.com>
+# 1.00 - Initial Version 
+#        From: Amir Hanania <amir.hanania at intel.com>
+#        Test script to test dapl.
+#	 Run dtest test in multiple options.
+# Notes:
+# 1. For performance test. Same dtest configuration is used twice.
+#    Once with -W for latency and once without for BW.
+#
+
+user_provider=$def_provider
+server_client_list=$server_list" "$client_list
+host_list=`for i in $server_client_list; do echo $i | awk -F "-mic" '{ print $1 }'; done | sort | uniq`
+provider_search_debug=0
+dapl_test_user_input="y"
+ran_one_dapltest=0
+dapl_test_rep_max=100
+dapl_test_rep=$dapl_test_rep_max
+mfo_test=0
+fast_test=0
+fast_test_str=""
+perf_test=0
+no_inline_data=0
+debug_info=0
+v_for_test=""
+user_srting=""
+ctrl_c=0
+runs=0
+max_run_time=0
+dapl_mtu=0
+loops=0
+log_file_dir="dtest_perf_logs"
+log_file="$log_file_dir/dtest_performance_"
+unidirection_test=0
+cpu_mask="no_cpu_mask"
+user_b_options="none"
+dog_file=/tmp/dog.log
+dog_ser=/tmp/dog.ser
+dog_cli=/tmp/dog.cli
+i=1
+while [ $i -lt 5000000 ]; do
+  b_options_for_perf_test+=" $i"
+  i=$(( $i*2 ))
+done 
+mkdir -p $log_file_dir
+
+control_c()
+# run if user hits control-c
+{
+        echo -en "\n*** ^c ***\n"
+        if [ $ctrl_c -ne 0 ]; then
+                echo -ne "\n*** Forced EXIT! ***\n\n"
+                for s in $server_list; do
+                  ssh root@$s "killall dtest" > /dev/null 2>&1
+                  ssh root@$s "killall dapltest" > /dev/null 2>&1
+                done
+                for c in $client_list; do
+                  ssh root@$c "killall dtest" > /dev/null 2>&1
+                  ssh root@$c "killall dapltest" > /dev/null 2>&1
+                done
+                exit 1
+        fi
+        let "ctrl_c+=1"
+        echo -en "\n*** Will break after this test case ***\n\n"
+}
+
+# trap keyboard interrupt (control-c)
+trap control_c SIGINT
+
+exit_control()
+{
+  # EXIT trap handler: the watchdog may have killed us, so stop any dtest still running remotely.
+  for s in $server_list; do
+    ssh root@$s "killall dtest" > /dev/null 2>&1
+  done
+  for c in $client_list; do
+    ssh root@$c "killall dtest" > /dev/null 2>&1
+  done
+
+  # Writing 2 to the dog file makes dog() exit on its next poll (it sleeps 1s per loop).
+  echo "2" > $dog_file
+  sleep 2
+  # Alternative way to stop the dog (disabled): jobs -p | xargs kill
+}
+# trap exit to kill dog when script exit
+#trap 'jobs -p | xargs kill' EXIT
+trap exit_control EXIT
+
+function dog(){
+  while true; do
+    val=`cat $dog_file`
+    if [ $val -eq 2 ]; then
+      exit
+    fi
+    if [ $val -eq 1 ]; then
+      server=`cat $dog_ser`
+      client=`cat $dog_cli`
+      server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"`
+      client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"`
+      server_fail=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c FAIL"`
+      client_fail=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c FAIL"`
+      if [ $server_err -gt 0 ] || [ $client_err -gt 0 ] || [ $server_fail -gt 0 ] || [ $client_fail -gt 0 ]; then
+        sleep 2
+        echo -e "\n\n\twatchdog bark - validation test failed\n\n"
+        killall ${0##*/}
+      fi
+      echo -n "." 
+    fi
+    sleep 1
+  done
+}
+
+function wait_for_server_to_be_ready(){
+  i=99
+  echo -ne "Waiting to servers to come up... $i                                     \r"
+  until [ $i -eq 0 ]; do
+    up=0
+    file_found="NOT found"
+    ssh root@$server [ -f /tmp/dtest_ser_run.log ] && file_found="file found"
+    if [ "$file_found" == "file found" ]; then
+      up=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c waiting"`
+    fi
+    if [ $up -eq 1 ]; then
+           break;
+    fi
+    let "i = i - 1"
+    echo -ne "Waiting to servers to come up... $i                                   \r"
+    sleep 0.1
+  done
+}
+
+
+u=0
+w=0
+B=0
+b=0
+S=0
+D=0
+
+function testcase(){
+  # Setting the dtest options
+  if [ $u -ne 0 ]; then
+    u_for_test="-u"
+  else
+    u_for_test=""
+  fi
+  if [ $w -ne 0 ]; then
+    w_for_test="-w"
+  else
+    w_for_test=""
+  fi
+  if [ $B -ne 0 ]; then
+    B_for_test="-B $B"
+  else
+    B_for_test=""
+  fi
+  if [ $b -ne 0 ]; then
+    b_for_test="-b $b"
+  else
+    b_for_test=""
+  fi
+  if [ $S -ne 0 ]; then
+    S_for_test="-S $S"
+  else
+    S_for_test=""
+  fi
+  if [ $W -ne 0 ]; then
+    W_for_test="-W"
+  else
+    W_for_test=""
+  fi
+  if [ $D -ne 0 ]; then
+    if [ $do_not_validate_data_with_scif -eq 1 ]; then
+      return 0
+    fi
+    D_for_test="-D -a -B 10"
+  else
+    D_for_test=""
+  fi
+
+  if [ $ctrl_c -ne 0 ]; then
+    echo -ne "\n*** Stop test due to ctrl c ***\n\n"
+    exit 1
+  fi
+
+  # in case the prev test failed. The files will be still there for debug. Delete them for the new run.
+  ssh root@$server "rm /tmp/dtest_ser_run.log" > /dev/null 2>&1
+  ssh root@$client "rm /tmp/dtest_cli_run.log" > /dev/null 2>&1
+
+  if [ $D -eq 1 ]; then
+    support_data_validation
+    if [ $dtest_support_data_val -ne 1 ]; then
+      return
+    fi
+  fi
+
+  #Start the server
+  echo "----------------------------------------------------------"
+  echo "Test case: $W_for_test $D_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting"
+  echo -ne "Start $taskset_4_server dtest -P $provider server $server\r"
+  ssh root@$server "$export_str $taskset_4_server dtest -P $provider $W_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting $D_for_test >& /tmp/dtest_ser_run.log" &
+  ser_pid=$!
+
+  # Wait for server to be ready
+  wait_for_server_to_be_ready
+
+  if [ $i -eq 0 ]; then
+    echo $server dtest failed - did not start
+    ssh root@$server "killall dtest"
+    ssh root@$client "killall dtest"
+    exit 1
+  fi
+
+  # Start client
+  echo -ne "Start $taskset_4_client dtest -P $provider client                                                                \r"
+  ssh root@$client "$export_str $taskset_4_client dtest -P $provider -h $server $W_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting $D_for_test >& /tmp/dtest_cli_run.log" &
+  cli_pid=$!
+
+  if [ $D -eq 1 ]; then
+    echo $server > $dog_ser
+    echo $client > $dog_cli
+    echo "1" > $dog_file
+  fi
+
+  # Wait for Server and Client to be done
+  wait $ser_pid $cli_pid
+
+  if [ $D -eq 1 ]; then
+    echo "0" > $dog_file
+  fi
+
+  # Check results from log files
+  server_pass=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c PASSED"`
+  client_pass=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c PASSED"`
+  server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"`
+  client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"`
+  do_exit=0
+  if [ $ctrl_c -ne 0 ]; then
+    ssh root@$server "killall -9 dtest" > /dev/null 2>&1
+    ssh root@$client "killall -9 dtest" > /dev/null 2>&1
+    do_exit=1
+  fi
+
+  if [ $server_pass -ne 1 ] || [ $server_err -ne 0 ]; then
+    echo "****** ERROR - $server server failed (with $client client) *******"
+    echo "               log file:  /tmp/dtest_ser_run.log on $server"
+    do_exit=1
+  fi
+
+  if [ $client_pass -ne 1 ] || [ $client_err -ne 0 ]; then
+    echo "****** ERROR - $client client failed (with $server server) *******"
+    echo "               log file: /tmp/dtest_cli_run.log on $client"
+    do_exit=1
+  fi
+
+  if [ $do_exit -eq 1 ]; then
+    echo
+    exit 1
+  fi
+
+  # Print to screen or file the results if needed
+  if [ $perf_test -eq 1 ]; then
+    echo -ne "                                                                                                             \r"
+    if [ $fast_test -eq 1 ]; then
+      if [ $W -ne 0 ]; then
+        # second run is latency test called with -W
+        lat=`ssh root@$client cat /tmp/dtest_cli_run.log | grep PingPong | awk -F "latency " '{print $2}' | awk -F " us" '{ print $1 }'`
+        res="$lat, Tx size=$res"
+        echo "latency: $res"
+        echo $res >> $log_file
+      else
+        # First test for BW
+        res=`ssh root@$client cat /tmp/dtest_cli_run.log | grep direction | awk -F "00 x " '{ print $2 }'`
+      fi
+    else
+      if [ $W -ne 0 ]; then
+        # second run is latency test called with -W
+        lat=`ssh root@$client cat /tmp/dtest_cli_run.log | grep PingPong | awk -F "latency " '{print $2}' | awk -F " us" '{ print $1 }'`
+        echo -e "Byte size: $b\t\tlatency: $lat\t\tBW: $res"
+        res=`echo $res | awk -F " MB" '{ print $1 }'`
+        res=$(printf "%15s" $res)
+        lat=$(printf "%10s" $lat)
+        echo -e "$b\t\t$lat\t\t$res" >> $log_file
+      else
+        # First test for BW
+        res=`ssh root@$client cat /tmp/dtest_cli_run.log | grep direction | awk -F "00 x $b, " '{ print $2 }'`
+      fi
+    fi
+  fi
+  ssh root@$server "rm /tmp/dtest_ser_run.log"
+  ssh root@$client "rm /tmp/dtest_cli_run.log"
+
+  echo "Test case passed                               "
+
+  read  -t 0.01 -n 1 -s u_input
+  ret=$?
+  if [ $ret -eq 0 ] && [ "$u_input" == "i" ]; then
+    print_round_info
+  fi
+
+  return 0
+}
+
+
+function wait_for_it(){  # poll $wait_for_it_file on $wait_for_it_machine until $wait_for_it_string shows up
+  max_wait=900
+  i=$max_wait
+  sleep_for=0.1
+  test_start_time=`date +%s`
+  until [ $i -eq 0 ]; do
+    echo -n "."
+    sleep $sleep_for
+    up=`ssh root@$wait_for_it_machine cat $wait_for_it_file | grep -c "$wait_for_it_string"`
+    if [ $up -eq 1 ]; then
+      break;
+    fi
+    let "i = i - 1"
+    if [ $ctrl_c -ne 0 ]; then  # user hit ctrl-c: force the loop to end
+      i=0
+    fi
+    if [ $i -eq $(( $max_wait - 20 )) ]; then  # after 20 polls, back off to 1s
+      sleep_for=1
+    fi
+    if [ $i -eq $(( $max_wait - 40 )) ]; then  # after 40 polls, back off to 3s
+      sleep_for=3
+    fi
+  done
+
+  if [ $i -eq 0 ]; then
+    if [ $ctrl_c -ne 0 ]; then
+      echo -ne "\n\t*** Stop test due to ctrl c ***\n\n"
+    else
+      echo " failed"
+      echo -e "\n\n\tDid not find $wait_for_it_string string on machine: $wait_for_it_machine at file $wait_for_it_file - EXIT\n\n"
+    fi
+    ssh root@$server killall dapltest > /dev/null 2>&1
+    ssh root@$client killall dapltest > /dev/null 2>&1
+    exit 1  # explicit failure status; a bare exit would return the status of the killall above
+  fi
+  test_end_time=`date +%s`
+  test_run_time=$(($test_end_time-$test_start_time))
+  echo " done in $test_run_time sec"
+}
+
+
+function print_round_info(){
+  now=`date +%s`
+  run_time=$(($now-$start_time))
+  ss=$(($run_time%60))
+  mm=$(($run_time/60))
+  mm=$(($mm%60))
+  hh=$(($run_time/3600))
+  echo "**************************************************************"
+  echo -e "\tin round $runs - $hh h $mm m $ss s"
+  echo "**************************************************************"
+}
+
+
+# Check if client and server dtest support data validation 
+function support_data_validation() {
+  dtest_support_data_val=0
+  
+  ssh root@$server "dtest -U >& /tmp/dtest_ser_run.log"
+  ssh root@$client "dtest -U >& /tmp/dtest_cli_run.log"
+  sleep .1
+  ser_is_valid=`ssh root@$server cat /tmp/dtest_ser_run.log | grep -c "validate data"`
+  if [ $ser_is_valid -ne 1 ]; then
+    return 0
+  fi
+  cli_is_valid=`ssh root@$client cat /tmp/dtest_cli_run.log | grep -c "validate data"`
+  if [ $cli_is_valid -ne 1 ]; then
+    return 0
+  fi
+  dtest_support_data_val=1
+}
+
+
+# Run dtest in all data size ping pong test with data validation mode between client and server
+function server_client_data_validation_test(){
+
+  echo -e "\n\n\n\t**** dtest data validation test\t\tprovider: $provider\t\tserver: $server $taskset_4_server\t\tclient: $client $taskset_4_client ****\n"
+  support_data_validation
+  if [ $dtest_support_data_val -ne 1 ]; then
+    echo -e "\t**** $client or $server dtest does not support data validation - skipping ****"
+    return
+  fi
+
+  echo -e "        Start $taskset_4_server dtest -P $provider -D -a on server $server"
+  ssh root@$server "$export_str $taskset_4_server dtest -P $provider -D -a -B 100 >& /tmp/dtest_ser_run.log" &
+  ser_pid=$!
+  wait_for_server_to_be_ready
+
+  echo -e "        Start $taskset_4_client dtest -P $provider -D -a on client $client"
+  ssh root@$client "$export_str $taskset_4_client dtest -P $provider -h $server -D -a -B 100 >& /tmp/dtest_cli_run.log" &
+  cli_pid=$!
+  # just wait a bit for the files on server and client to be ready before waking up the dog
+  sleep 1
+
+  echo $server > $dog_ser
+  echo $client > $dog_cli
+  echo "1" > $dog_file
+
+  # Wait for Server and Client to be done
+  wait $ser_pid $cli_pid
+
+  echo "0" > $dog_file
+  echo
+  # Check results from log files
+  server_pass=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c PASSED"`
+  client_pass=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c PASSED"`
+  server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"`
+  client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"`
+  do_exit=0
+  if [ $ctrl_c -ne 0 ]; then
+    ssh root@$server "killall -9 dtest" > /dev/null 2>&1
+    ssh root@$client "killall -9 dtest" > /dev/null 2>&1
+    do_exit=1
+  fi
+
+  if [ $server_pass -ne 1 ] || [ $server_err -ne 0 ]; then
+    echo "****** ERROR - $server server failed (with $client client) *******"
+    echo "               log file:  /tmp/dtest_ser_run.log on $server"
+    do_exit=1
+  fi
+
+  if [ $client_pass -ne 1 ] || [ $client_err -ne 0 ]; then
+    echo "****** ERROR - $client client failed (with $server server) *******"
+    echo "               log file: /tmp/dtest_cli_run.log on $client"
+    do_exit=1
+  fi
+
+  if [ $do_exit -eq 1 ]; then
+    echo
+    exit 1
+  fi
+
+  echo -e "\n\tdtest data validation test\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTEST PASSED\n\n"
+
+}
+
+
+# Run dapltest between client and server
+function server_client_dapl_test(){
+  ofa_post=""
+  dapl_test_rep=$dapl_test_rep_max
+  if [ $ctrl_c -ne 0 ]; then
+    echo -ne "\n*** Stop test due to ctrl c ***\n\n"
+    exit 1
+  fi
+
+  echo "----------------------------------------------------------"
+  echo -ne "\t**** dapltest\t\tprovider: $provider\t\tserver: $server\t\tclient: $client "
+
+  # in case the prev test failed. The files will be still there for debug. Delete them for the new run.
+  ssh root@$server "rm /tmp/dapltest_ser_run.log" > /dev/null 2>&1
+  ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+
+  # 1. skip if roe (provider name contains "roe")
+  # 2. check that provider is ofa or scm
+  is_roe=`echo $provider | grep -c roe`
+  if [ $is_roe -eq 1 ]; then
+    good_provider_for_dapltest=0
+    echo -e " - provider $provider not supported - skipping ****"
+    echo "----------------------------------------------------------"
+    return 0
+  fi
+  is_ofa=`ssh root@$server cat $dat_conf | grep $provider | grep -c libdaplofa`
+  is_scm=`ssh root@$server cat $dat_conf | grep $provider | grep -c libdaploscm`
+  if [ $is_ofa -eq 0 ] && [ $is_scm -eq 0 ]; then
+    good_provider_for_dapltest=0
+    echo -e " - provider $provider not supported - skipping ****"
+    echo "----------------------------------------------------------"
+    return 0
+  fi
+  if [ $is_ofa -eq 1 ]; then
+    dat_line=`ssh root@$server cat $dat_conf | grep $provider`
+    ofa_post=`echo $dat_line | grep lofa | awk '{ print $1 }' | awk -F "ofa-v2" '{ print $2 }'`
+  fi
+  ran_one_dapltest=1
+
+  # start server
+  wait_for_it_machine=$server
+  wait_for_it_file="/tmp/dapltest_ser_run.log"
+  wait_for_it_string="Dapltest: Service Point Ready"
+  echo -e " ****\n----------------------------------------------------------"
+  echo -e "dapltest\tprovider: $provider\tserver: $server\tclient: $client"
+  echo -ne "start dapltest server..."
+  ssh root@$server "dapltest -T S -D $provider >& /tmp/dapltest_ser_run.log" &
+  wait_for_it
+
+  # tests
+  wait_for_it_machine=$client
+  wait_for_it_file="/tmp/dapltest_cli_run.log"
+  wait_for_it_string="Total WQE"
+  # test 1
+  echo -ne "start dapltest client test 1 ..."
+  ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server SR 256 >& /tmp/dapltest_cli_run.log" &
+  wait_for_it
+
+  if [ $fast_test -eq 0 ]; then
+    # test 2
+    if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 4 ]; then
+      dapl_test_rep=$(($dapl_test_rep/$test_run_time/8))
+      if [ $dapl_test_rep -eq 0 ]; then
+        dapl_test_rep=1
+      fi
+      echo Reduce rep to $dapl_test_rep
+    fi
+    echo -ne "start dapltest client test 2 ..."
+    ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+    ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RW 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+    wait_for_it
+
+    # test 3
+    echo -ne "start dapltest client test 3 ..."
+    ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+    ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+    wait_for_it
+
+    # test 4
+    echo -ne "start dapltest client test 4 ..."
+    ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+    ssh root@$client "dapltest -T T -s  $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+    wait_for_it
+
+    # test 5
+    if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 2 ]; then
+      dapl_test_rep=$(($dapl_test_rep/8))
+      if [ $dapl_test_rep -eq 0 ]; then
+        dapl_test_rep=1
+      fi
+      echo Reduce rep to $dapl_test_rep
+    fi
+    echo -ne "start dapltest client test 5 ..."
+    ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+    ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 8 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+    wait_for_it
+
+    if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 2 ]; then
+      dapl_test_rep=$(($dapl_test_rep/4))
+      if [ $dapl_test_rep -eq 0 ]; then
+        dapl_test_rep=1
+      fi
+      echo Reduce rep to $dapl_test_rep
+    fi
+    # test 6
+    echo -ne "start dapltest client test 6 ..."
+    ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+    ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 4 -w 8 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+    wait_for_it
+  fi
+
+  # stop server
+  echo -n "stop dapltest server..."
+  ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+  ssh root@$client "dapltest -T Q -s $server$ofa_post -D $provider >& /tmp/dapltest_cli_run.log" &
+  cli_pid=$!
+
+  wait_for_it_machine=$server
+  wait_for_it_file="/tmp/dapltest_ser_run.log"
+  wait_for_it_string="Exiting"
+  echo -n "wait for dapltest server to stop..."
+  wait_for_it
+
+  # Wait for Server and Client to be done
+  wait $cli_pid
+
+  # clean up
+  ssh root@$server "rm /tmp/dapltest_ser_run.log" > /dev/null 2>&1
+  ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+
+  echo -e "\tdapltest\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTESTS PASSED"
+  echo -e "----------------------------------------------------------\n"
+}
+
+
+# Run all the test cases between two machines.
+#
+# Works on the current $server / $client / $provider triple:
+#   - in perf mode, builds the taskset prefixes and writes the section header
+#     to $log_file;
+#   - unless only dapltest was requested (-d o), calls testcase once per
+#     combination of the u/w/b/S/B/D/W option lists and then, when allowed,
+#     the data validation test;
+#   - finally runs dapltest when it was not disabled and the provider is
+#     still marked good for it.
+# Exits the script with status 1 as soon as one testcase fails.
+function server_host_test(){
+  taskset_4_client=""
+  taskset_4_server=""
+  if [ $perf_test -eq 1 ]; then
+    # A side gets a taskset prefix only when its name has no "mic" in it and
+    # the user supplied a real CPU mask.
+    is_mic=$(echo $server | grep -c mic)
+    if [ "$cpu_mask" != "no_cpu_mask" ] && [ $is_mic -eq 0 ]; then
+      taskset_4_server="taskset $cpu_mask "
+    fi
+    is_mic=$(echo $client | grep -c mic)
+    if [ "$cpu_mask" != "no_cpu_mask" ] && [ $is_mic -eq 0 ]; then
+      taskset_4_client="taskset $cpu_mask "
+    fi
+
+    {
+      echo -e "\n**** dtest: provider: $provider      \tserver: $server \tclient: $client ****\n"
+      if [ $fast_test -eq 0 ]; then
+        echo -e "\nBytes\t\t   Latency\t\t\t MB/s"
+      fi
+    } >> $log_file
+  fi
+
+  if [ "$dapl_test_user_input" != "o" ]; then
+    echo -e "\n\n\n\t**** dtest\t\tprovider: $provider\t\tserver: $server $taskset_4_server\t\tclient: $client $taskset_4_client ****"
+
+    #set var value to zero in order to use dtest default value for that option.
+    for u in $u_options; do
+     for w in $w_options; do
+      for b in $b_options; do
+       for S in $S_options; do
+        for B in $B_options; do
+         for D in $D_options; do
+          for W in $W_options; do # Always keep last. See Note 1.
+            # Run one test case between Client and Server.
+            testcase
+            ret=$?
+            if [ $ret -ne 0 ]; then
+              echo "TEST FAILED"
+              exit 1
+            fi
+            sleep 1
+          done
+         done
+        done
+       done
+      done
+     done
+    done
+
+    echo -e "\n\tdtest\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTEST PASSED\n\n"
+
+    # Data validation is skipped for perf runs, scif providers and fast tests.
+    if [ $perf_test -ne 1 ] && [ $do_not_validate_data_with_scif -eq 0 ] && [ $fast_test -ne 1 ]; then
+      server_client_data_validation_test
+    fi
+  fi
+
+  if [ "$dapl_test_user_input" != "n" ] && [ $good_provider_for_dapltest -eq 1 ] && [ $fast_test -ne 1 ]; then
+    server_client_dapl_test
+  fi
+
+}
+
+# Print the usage text (including the configured server and client lists and
+# the default provider) and exit the script with status 1.
+function help(){
+  # One printf call: the '%b\n' format is re-applied to every argument, so
+  # each string below becomes one escape-interpreted output line.
+  printf '%b\n' \
+    "\n\tRun dtest and dapltest accross cluster - from each client to each server\n" \
+    "\t\tServer list: $server_list" \
+    "\t\tClient list: $client_list\n" \
+    "\t-P <PROVIDER NAME> : Provider name or 'ALL' for all prividers (default $def_provider)" \
+    "\t-f: Fast test" \
+    "\t-l <NUM> : How many test loops to run. Def forever" \
+    "\t-t <NUM> : How many minutes to run. Def forever" \
+    "\t-p <CPUs mask> or \"no_cpu_mask\": Performance test" \
+    "\t\tMask in 0xHEX format. should match host's /sys/class/mic/mic0/device/local_cpus" \
+    "\t\tFor no CPU mask enter \"no_cpu_mask\"" \
+    "\t\tConsider also: taskset mpxyd, set mcm_affinity to 2 in /etc/mpxyd.conf, performance mode at the host scaling_governor" \
+    "\t\tConsider also to change DAPL MTU (-M optoin)" \
+    "\t-w: Write only test" \
+    "\t-u: uni-direction only test" \
+    "\t-d <n|y|o> : dapl test options. \"n\": No dapl tests. \"y\": Run dapl tests. \"o\": Run Only dapl tests (no dtest). Def: Run dapl_test" \
+    "\t-M <NUM> : DAPL MTU" \
+    "\t-b <NUM> : data size. Can be: one size, many sizes as a string or type \`all\` for all sizes power of 2" \
+    "\t-U: \"user string\". user dtest option string ( -w -b -u and -S dtest options )" \
+    "\t-z: use zero for -w -b -u and -S dtest options (zero mean test default value)" \
+    "\t-i: No inline data test" \
+    "\t-m: Force MFO test" \
+    "\t-D: DAPL debug print in log files" \
+    "\t-v: dtest verbose mode" \
+    "\t-q: qib test over mlx4 (same as -m and -i options)" \
+    "\t-V: Print the script version" \
+    "\t-h: help" \
+    "\n\tWhile test is running:" \
+    "\t^c: Exit gracefully" \
+    "\t^c^c: Forced exit" \
+    "\ti: Print round number and time duration" \
+    "\n\n"
+  exit 1
+}
+
+
+# Progress output for providers_search: with provider_search_debug set to 1
+# the arguments are echoed (escapes interpreted, a leading -n honoured);
+# otherwise a single "." is printed as a progress tick.
+function log(){
+  if ! [ $provider_search_debug -eq 1 ]; then
+    printf '.'
+    return
+  fi
+  echo -e "$@"
+}
+
+
+# Build hosts_providers_list: the DAT providers that every host can use.
+#
+# For each host in $host_list (over ssh as root):
+#   - require $dat_conf to exist (exit 1 otherwise);
+#   - for every device printed by ibv_devices and every port of it that
+#     ibv_devinfo reports as PORT_ACTIVE, collect the $dat_conf entries
+#     bound to "<device> <port>";
+#   - for every net device from netstat -i that "ip addr show" reports as
+#     infiniband with exactly one non-inet6 "inet" line, collect the
+#     $dat_conf entries bound to "<netdev> 0".
+# The per-host lists are then intersected.
+# Globals read:    host_list dat_conf
+# Globals written: hosts_providers_list (plus scratch variables)
+# Exits with status 1 when no provider is left.
+function providers_search(){
+  echo -e "\nSearching for devices"
+  first_host=1
+  # Start from a clean list even if the variable was set before the call.
+  providers=""
+  for host in $host_list; do
+    # make sure host dat file exist
+    dat_conf_found="NOT found"
+    ssh root@$host "[ -f $dat_conf ]" && dat_conf_found="dat_conf_found"
+    if [ "$dat_conf_found" == "dat_conf_found" ]; then
+      log "$dat_conf found on $host"
+    else
+      echo -e "\n\t$dat_conf was not found on $host.\n\n"
+      exit 1
+    fi
+
+    #ib devices list
+    dev_list=`ssh root@$host ibv_devices | tail -n +3 | awk '{ print $1 }'`
+    for dev in $dev_list; do
+      # for each device
+      log Found $dev device
+      port_cnt=`ssh root@$host ibv_devinfo -d $dev | grep phys_port_cnt | awk '{print$2 }'`
+      log "  $dev phys_port_cnt: $port_cnt"
+      for port in $(seq 1 $port_cnt); do
+        # for each port
+        log "    checking $dev port $port status"
+        up=`ssh root@$host ibv_devinfo -d $dev -i $port | grep state | grep -c PORT_ACTIVE`
+        if [ $up -ne 1 ]; then
+          # Report the port being checked (the original printed $i here).
+          log "    $dev port $port is not active"
+          continue
+        fi
+        log "    $dev port $port is active"
+        log "    add it to the list"
+        # get a list of providers that this device can use
+        providers+=`ssh root@$host cat $dat_conf | grep "$dev $port" | awk '{ print $1 }'`
+        providers+=" "
+      done
+    done
+
+    #add network ib devices
+    net_dev_list=` ssh root@$host netstat -i | grep -v "no statistics available" | tail -n +3 | awk '{ print $1 }'`
+    for net_dev in $net_dev_list; do
+      # for each net device
+      log Found $net_dev net device
+      is_ib=`ssh root@$host ip addr show $net_dev | grep -c infiniband`
+      if [ $is_ib -ne 1 ]; then
+        log "  $net_dev net device is not ib device"
+        continue
+      fi
+      log "    $net_dev is infiniband device"
+      has_ip_addr=`ssh root@$host ip addr show $net_dev | grep inet | grep -vc inet6`
+      if [ $has_ip_addr -ne 1 ]; then
+        log "  $net_dev net device has no ip addr"
+        continue
+      fi
+      log "    $net_dev net device has IP address"
+      log "    add it to the list"
+      # get a list of providers that this device can use
+      providers+=`ssh root@$host cat $dat_conf | grep "$net_dev 0" | awk '{ print $1 }'`
+      providers+=" "
+    done
+
+    log; log -n "$host povider list: "; for i in $providers; do log -n "$i "; done; log
+
+    # The merge below keeps every name that occurs more than once (uniq -d),
+    # so each host may contribute a provider only once. Drop repeats here,
+    # keeping discovery order.
+    providers=$(for p in $providers; do echo $p; done | awk '!seen[$0]++')
+
+    if [ $first_host -eq 1 ]; then
+      # just save providers from first host
+      hosts_providers_list=$providers
+      first_host=0
+    else
+      # Merge providers from prev hosts with the one from the new host
+      # Keep only the providers that are on both lists
+      log hosts p from prev hosts: $hosts_providers_list
+      # The explicit separator matters: the merged list has no trailing
+      # blank, so a bare += would glue its last name to the first new one.
+      hosts_providers_list+=" $providers"
+      hosts_providers_list=`for p in $hosts_providers_list; do echo $p; done | sort | uniq -d`
+      log hosts p after merge: $hosts_providers_list
+    fi
+    providers=""
+  done
+  cnt=0
+  echo -e "\nPovider list:"
+  for i in $hosts_providers_list; do
+    echo $i
+    cnt=$((cnt + 1))
+  done
+  if [ $cnt -eq 0 ]; then
+    echo -e "no devices where found\n\n"
+    # Nothing to test is a failure; report it through the exit status.
+    exit 1
+  fi
+  echo -e "Total $cnt providers\n\n"
+}
+
+
+# check if the "server-client-provider" combination is OK
+# Set server_client_provider_is_not_valid_combo to one if not a valid combo
+#
+# Globals read:    provider server client
+# Globals written: server_client_provider_is_not_valid_combo (0 = usable,
+#                  1 = skip this combination) plus scratch variables
+# The first rule that applies decides the result; rule 3 contacts the MIC
+# side(s) over ssh.
+function check_provider_server_client_combo(){
+  server_client_provider_is_not_valid_combo=0
+  #check the following:
+  # 1. scif providers can only run on the same machine.
+  is_scif=`echo $provider | grep -c scif`
+  if [ $is_scif -eq 1 ]; then
+    # Everything before "-mic" names the physical host.
+    server_host=`echo $server | awk -F "-mic" '{ print $1 }'`
+    client_host=`echo $client | awk -F "-mic" '{ print $1 }'`
+    if [ "$server_host" != "$client_host" ]; then
+      server_client_provider_is_not_valid_combo=1
+    fi
+    return
+  fi
+  # 2. MIC qib can only run mcm provider
+  is_ser_mic=`echo $server | grep -c mic`
+  is_cli_mic=`echo $client | grep -c mic`
+  if [ $is_ser_mic -eq 1 ] || [ $is_cli_mic -eq 1 ]; then
+    # MIC Server or Client
+    is_qib_provider=`echo $provider | grep -c qib`
+    if [ $is_qib_provider -eq 1 ]; then
+      # Server or Client is MIC AND qib provider - make sure provider is MCM
+      # NOTE(review): this matches any "m" in the provider name; presumably
+      # the MCM entries are the only qib names containing one - confirm.
+      is_mcm=`echo $provider | grep -c m`
+      if [ $is_mcm -ne 1 ]; then
+        server_client_provider_is_not_valid_combo=1
+      fi
+      return
+    fi
+  fi
+  # 3. check if MICs ib interface is UP
+  is_ib_provider=`echo $provider | grep -ce -ib`
+  if [ $is_ib_provider -eq 1 ]; then
+    interface=`echo $provider | awk -F "ofa-v2-" '{ print $2 }'`
+    # Every MIC end must show the interface. Return early only on failure,
+    # so that a MIC client is still checked when the server is a MIC too.
+    if [ $is_ser_mic -eq 1 ]; then
+      up=`ssh root@$server ifconfig | grep -c "$interface"`
+      if ! [ "$up" -eq 1 ]; then
+        server_client_provider_is_not_valid_combo=1
+        return
+      fi
+    fi
+    if [ $is_cli_mic -eq 1 ]; then
+      up=`ssh root@$client ifconfig | grep -c "$interface"`
+      if ! [ "$up" -eq 1 ]; then
+        server_client_provider_is_not_valid_combo=1
+        return
+      fi
+    fi
+  fi
+}
+
+
+
+
+
+
+while getopts uviVzDmfwhiql:t:P:U:p:d:M:b: option
+do
+  case "${option}"
+  in
+  P) user_provider=${OPTARG};;
+  m) no_inline_data=1 ; mfo_test=1;;
+  f) fast_test=1; loops=1; fast_test_str=" fast test";;
+  p) cpu_mask=${OPTARG}; perf_test=1; W_options="0 1";;
+  U) user_srting=${OPTARG}; b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0";;
+  z) b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0";;
+  w) w_options="1";;
+  u) unidirection_test=1; u_options="1";;
+  D) debug_info=1;;
+  d) dapl_test_user_input=${OPTARG};;
+  v) v_for_test=" -v ";;
+  i) no_inline_data=1;;
+  q) no_inline_data=1 ; mfo_test=1;;
+  t) max_run_time=${OPTARG};;
+  M) dapl_mtu=${OPTARG};;
+  l) loops=${OPTARG};;
+  b) user_b_options=${OPTARG};;
+  V) echo -e "\n\t${0##*/} version $script_version\n\n"; exit;;
+  h) help;;
+  esac
+done
+
+if [ $fast_test -eq 1 ]; then
+  b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0"; D_options="0";
+fi
+
+if [ $perf_test -eq 1 ]; then
+  b_options=$b_options_for_perf_test; u_options="0"; S_options="0"; loops=1; w_options="1"; B_options="0"; user_srting="$user_string -p";dapl_test_user_input="n"; D_options="0";
+  legit_input=`echo $cpu_mask | grep -ci 0x`
+  if [ $legit_input -ne 1 ] && [ "$cpu_mask" != "no_cpu_mask" ]; then
+    echo -e "\n\t< 0xCPUs_mask > or \"no_cpu_mask\" in option -p is missing - input=$cpu_mask - Exit\n\n"
+    exit
+  fi
+fi
+
+if [ $fast_test -eq 1 ] && [ $perf_test -eq 1 ]; then
+  b_options="0"
+fi
+
+if [ $unidirection_test -eq 1 ]; then
+  u_options="1"
+fi
+
+if [ "$user_b_options" != "none" ]; then
+  if [ "$user_b_options" == "all" ]; then
+      b_options=$b_options_for_perf_test
+  else
+    b_options="$user_b_options"
+  fi
+fi
+
+if [ "$dapl_test_user_input" != "n" ] && [ "$dapl_test_user_input" != "y" ] && [ "$dapl_test_user_input" != "o" ]; then
+  echo -e "\n\tdapl test option must be n/y/o - Exit\n\n"
+  exit
+fi
+
+# check mpxyd is running on host machines.
+for host in $host_list; do
+  up=`ssh root@$host "ps ax | grep -c mpxyd"`
+  if [ $up -ne 3 ]; then
+    echo -e "\n\tERROR - mpxyd is not running on $host\n\n"
+         exit
+  fi
+  if [ $no_inline_data -eq 1 ]; then
+    up=`ssh root@$host cat /var/log/mpxyd.log | grep -c "RDMA IB inline threshold 0"`
+    if [ $up -ne 1 ]; then
+      echo on host $host you need to run mpxyd with mcm_ib_inline 0 in /etc/mpxyd.conf file for no inline data test
+      exit 1
+    fi
+  fi
+done
+
+# Pick the providers to test: discover them on the hosts for ALL/all,
+# otherwise use exactly what the user named.
+if [ $user_provider == "ALL" ] || [ $user_provider == "all" ]; then
+  providers_search
+else
+  hosts_providers_list=$user_provider
+fi
+
+# Show the machines taking part in the run.
+echo -e "\nServer list: $server_list"
+echo -e "Client list: $client_list"
+echo -e "Host list:"
+for i in $host_list; do
+  echo $i
+done
+echo
+
+# Build export_str from the selected modes; the banners show the value
+# accumulated so far.
+export_str=""
+if [ $mfo_test -eq 1 ]; then
+  export_str="export DAPL_MAX_INLINE=0 ; export DAPL_MCM_MFO=1 ; "
+  echo -ne "\n\t\t**** Running MFO test case \t\t$export_str ****\n\n"
+elif [ $no_inline_data -eq 1 ]; then
+  export_str="export DAPL_MAX_INLINE=0 ; "
+  echo -ne "\n\t\t**** Running no inline data test case \t\t$export_str ****\n\n"
+fi
+
+if [ $debug_info -eq 1 ]; then
+  export_str+=" export DAPL_DBG_TYPE=0xffffffff ; "
+  echo -ne "\n\t\t**** Running in debug mode\t\texport value: $export_str ****\n\n"
+fi
+if [ $dapl_mtu -ne 0 ]; then
+  export_str+=" export DAPL_IB_MTU=$dapl_mtu ; "
+  echo -ne "\n\t\t**** Setting DAPL_IB_MTU to $dapl_mtu \t\texport value: $export_str ****\n\n"
+fi
+
+# Announce how long the run will last.
+if [ $loops -ne 0 ]; then
+  echo -e "\n\tRunning$fast_test_str for $loops iterations"
+fi
+
+if [ $max_run_time -ne 0 ]; then
+  echo -e "\n\tRunning$fast_test_str for $max_run_time minutes"
+fi
+
+if [ $loops -eq 0 ] && [ $max_run_time -eq 0 ]; then
+  echo -ne "\n\tRunning$fast_test_str forever\n\n"
+fi
+
+# Performance run: finish the log file name (direction + timestamp) and write
+# the run description at the top of a fresh log file.
+if [ $perf_test -eq 1 ]; then
+  if [ $unidirection_test -eq 1 ]; then
+    log_file+="unidirection_test-"
+  else
+    log_file+="bidirection_test-"
+  fi
+  log_file+=$(date +%F-%H-%M-%S)
+  echo -e "\n\tRunning performance test with cpu mask: $cpu_mask\n\tOutput file: $log_file"
+  {
+    echo "Server list: $server_list"
+    echo "Client list: $client_list"
+    echo "CPU mask: $cpu_mask"
+    if [ $dapl_mtu -ne 0 ]; then
+      echo "DAPL_IB_MTU: $dapl_mtu"
+    else
+      echo "DAPL_IB_MTU: Default value"
+    fi
+    if [ $unidirection_test -eq 1 ]; then
+      echo "Test type: unidirection test"
+    else
+      echo "Test type:bidirection test"
+    fi
+  } > $log_file
+  # Record, per host, whether the mpxyd log mentions OP thread polling.
+  for host in $host_list; do
+    op_poll=$(ssh root@$host cat /var/log/mpxyd.log | grep -c "OP thread polling enabled")
+    if [ $op_poll -ne 1 ]; then
+      echo "OP thread polling on $host: disabled" >> $log_file
+      echo -e "\tOP thread polling on $host: disabled"
+    else
+      echo "OP thread polling on $host: enabled" >> $log_file
+      echo -e "\tOP thread polling on $host: enabled"
+    fi
+  done
+  echo -e "\n\n"
+fi
+echo "0" > $dog_file
+dog &
+
+sleep 1
+start_time=`date +%s`
+
+while [ 1 ]; do
+  now=`date +%s`
+  run_time=$(($now-$start_time))
+  ss=$(($run_time%60))
+  mm=$(($run_time/60))
+  total_run_time_in_min=$mm
+  mm=$(($mm%60))
+  hh=$(($run_time/3600))
+  dd=$(($hh/24))
+  hh=$(($hh%24))
+  pp=$(printf "%d days %d hours %d min and %d sec" $dd $hh $mm $ss)
+
+  echo
+  echo
+  echo "**************************************************************"
+  echo "**************************************************************"
+  echo Run time: $pp
+  if [ $max_run_time -ne 0  ] && [ $total_run_time_in_min -ge $max_run_time ]; then
+    echo -e "Ran for the $max_run_time minute requested by the user - Exiting\n\n"
+    break;
+  fi
+  if [ $loops -ne 0  ] && [ $runs -eq $loops ]; then
+    echo -e "Ran for the $loops iterations requested by the user - Exiting\n\n"
+    break;
+  fi
+  runs=$(( $runs + 1 ))
+  echo Starting round $runs
+  date
+  echo "**************************************************************"
+  echo "**************************************************************"
+  echo
+
+  # Runinng
+  for provider in $hosts_providers_list; do
+    do_not_validate_data_with_scif=`echo $provider | grep -c scif`
+    good_provider_for_dapltest=1
+    for server in $server_list; do
+      for client in $client_list; do
+        check_provider_server_client_combo
+        if [ $server_client_provider_is_not_valid_combo -ne 0 ]; then
+          #echo -e "***** ***** skipping test case: Server:$server Client:$client provider:$provider ***** *****"
+          continue
+        fi
+        # Run all test cases between Client and Server.
+        server_host_test
+      done
+    done
+
+  if [ "$dapl_test_user_input" == "o" ] && [ $ran_one_dapltest -eq 0 ]; then
+    echo -e "\n\n\n\n\t\t***** ***** WARNING: only dapltest was set up but no dapltest was done with $provider provider $export_str ***** *****\n\n"
+  else
+    echo -e "\n\n\n\n\t\t***** ***** server client tests with $provider provider $export_str - TEST PASSED ***** *****\n\n"
+  fi
+  done
+done

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ofed/dapl.git



More information about the Pkg-ofed-commits mailing list