[Pkg-ofed-commits] [dapl] 05/06: Imported Upstream version 2.1.8
Ana Beatriz Guerrero López
ana at moszumanska.debian.org
Fri Mar 25 17:45:17 UTC 2016
This is an automated email from the git hooks/post-receive script.
ana pushed a commit to branch master
in repository dapl.
commit 3961331ed4db369eaf21cea5612b78470d26a5f8
Author: Ana Beatriz Guerrero Lopez <ana at debian.org>
Date: Fri Mar 25 18:44:40 2016 +0100
Imported Upstream version 2.1.8
---
ChangeLog | 212 +++++++
Makefile.am | 1 +
Makefile.in | 1 +
configure | 20 +-
configure.in | 4 +-
dapl.spec | 5 +-
dapl.spec.in | 3 +
dapl/openib_cma/device.c | 3 -
dapl/openib_common/dapl_ib_common.h | 11 +-
dapl/openib_common/dapl_mic_common.h | 4 +-
dapl/openib_common/mem.c | 1 -
dapl/openib_common/qp.c | 17 +-
dapl/openib_common/util.c | 79 ++-
dapl/openib_mcm/cm.c | 30 +-
dapl/openib_mcm/mix.c | 7 +-
dapl/openib_scm/cm.c | 78 ++-
dapl/openib_scm/dapl_ib_util.h | 1 -
dapl/openib_scm/device.c | 2 -
dapl/openib_ucm/cm.c | 40 +-
dapl/openib_ucm/device.c | 2 -
dapl/svc/mcm.c | 10 +-
dapl/svc/mix.c | 90 ++-
dapl/svc/mpxy_in.c | 59 +-
dapl/svc/mpxy_out.c | 35 +-
dapl/svc/mpxyd.c | 16 +-
dapl/svc/mpxyd.h | 3 +
doc/dat.conf | 8 +
doc/mpxyd.conf | 2 +-
test/dtest/dtest.c | 160 ++++-
test/dtest/dtestx.c | 29 +-
test/dtest/scripts/dtest_suite.sh | 1117 ++++++++++++++++++++++++++++++++++
31 files changed, 1899 insertions(+), 151 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 2851a0d..f59ee56 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,215 @@
+commit 58d757c07c8a3fcf81bdd0529f633bdc5251a06c
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Tue Feb 16 13:12:16 2016 -0800
+
+ mpxyd: fix segfault in proxy_out debug logging
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit b6a33ad0afe846749287f1636fa352e7609256e2
+Author: Amir Hanania <amir.hanania at intel.com>
+Date: Tue Feb 16 13:04:56 2016 -0800
+
+ mpxyd: fix debug memory buffer log function
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit dc4e874b9ff65533100c4797cabf29980a0b0bbe
+Author: Amir Hanania <amir.hanania at intel.com>
+Date: Tue Feb 16 12:53:53 2016 -0800
+
+ dtest: -D option is not valid with scif providers
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit d1b5d4836ad6e89f5ec739596dea502953a0bdcf
+Author: Amir Hanania <amir.hanania at intel.com>
+Date: Tue Feb 16 12:47:04 2016 -0800
+
+ dtest/dapltest: add new automated test suite for HOST to MIC testing
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+ Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit 722ffc3bf1045ff7113f08cbfce42cb8c84d6e67
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Tue Feb 16 12:15:08 2016 -0800
+
+ openib: update attributes correctly for iWARP transports
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit ab67173b8024e14009c266d76ab9ec0bdd0c5d1f
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Wed Feb 10 14:45:12 2016 -0800
+
+ openib_common: set providers mtu to active_mtu instead of 2048
+
+ Better out of the box performance when setting mtu to active_mtu
+ instead of default settings of 2K. The new mtu settings are applied
+ on a per QP basis and negotiated via CM mtu 8-bit field. One of the
+ reserved 8 bit CM message fields is used to ensure compatibility
+ with older versions.
+
+ If older endpoints are mixed with newer versions it will fallback to
+ the pre-existing 2K MTU settings, unless overridden by DAPL_IB_MTU.
+
+ The change has been made across all providers including ucm, scm, mcm,
+ and cma (rdma_cm). The mcm provider on a MIC will notify the CCL Proxy
+ service of a DAPL_IB_MTU override via a new MIX_OP_FLAGS bit
+ MIX_OP_MTU during the open call.
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit c399977f71d02e4c37d71d9b8ed5ba3e2b0ac33b
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Wed Feb 10 14:44:46 2016 -0800
+
+ mpxyd: set affinity default to 2 for best performance
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 0f2353e62df7dddc9a31323ffea97ac08d8b8cf6
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Tue Feb 9 09:37:47 2016 -0800
+
+ mcm: cleanup unused variable in dapls_ib_mr_register
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 11c6ae4a1abe1faf56f935cfd95f87cac2a17798
+Author: Amir Hanania <amir.hanania at intel.com>
+Date: Tue Jan 26 14:03:16 2016 -0800
+
+ dtest: enhancement to test, -D option for data check
+
+ With -D option, dtest will run pingpong rdma write test
+ with data validation. Changes pattern during iterations.
+ Aborts and reports location/pattern with any miscompare.
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+ Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit b9860007fc6800dead92303ed9fecccfb465a229
+Author: Amir Hanania <amir.hanania at intel.com>
+Date: Mon Jan 25 12:30:38 2016 -0800
+
+ mcm: add support for Intel Omni-Path driver (hfi) via mic MFO mode
+
+ Set MIC based consumer to MFO (full offload) mode for both qib and new hfi devices.
+ Add to dat.conf entries for hfi verbs support. This can be run from mic or host
+ endpoints.
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+ Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit 400550c8b4a4519ef4467f20cb23d4ac80dccd5e
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Mon Jan 25 11:51:33 2016 -0800
+
+ mpxyd: fix ordering issues with the CCL Proxy receive side forwarding mechanism
+
+ scif_writeto doesn't guarantee ordering on DMA posting like IB rdma writes.
+ Since CCL Proxy is emulating IB semantics we must preserve order of
+ the rdma write request from MIC consumers via any proxy scif operations.
+
+ Changes made to proxy-in to defer forwarding RR completed segments
+ unless they are middle segments of a larger write operation. On FS or LS
+ the previous scif_writeto DMA operations must be completed and signaled
+ before posting a first or last segment. Last segment scif_writeto
+ operation is ordered to ensure last byte is the last byte of
+ complete rdma write proxied operation.
+
+ During scif_wt errors send WC error status for each pending segment
+ with rdma write operation for accurate proxy-out error processing.
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit c266f94af627e395c0f060005078b8152c8afe99
+Author: Amir Hanania <amir.hanania at intel.com>
+Date: Thu Dec 10 15:17:03 2015 -0800
+
+ dtest: report results only if one of the pingpong tests are run
+
+ There are two diff ping pong test cases.
+ It was possible to run dtest with none of them.
+
+ Signed-off-by: Amir Hanania <amir.hanania at intel.com>
+
+commit efa6bed3e44f445c68b011662c75e59265805c74
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Thu Dec 10 14:48:05 2015 -0800
+
+ mpxyd: with abnormal CM termination a CM object can be referenced after QP destroy
+
+ The proxy-in CQ is not flushed and processed properly during
+ mix_qp_destroy. Depending on the EP mode there can be 2 separate
+ connections with multiple CQs to process. Add new mix_cq_flush
+ function that will flush all pending work on TX and RX side of
+ proxy engine. CM object is destroyed and reset only after all
+ pending work is processed on ALL endpoint CQ associations.
+ Add error logging when WR resources are exhausted.
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 2b294cd7dcdbccdc235c056791f36bd2821c2b9b
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Thu Dec 10 14:36:22 2015 -0800
+
+ mpxyd: proxy out WR resources exhausted with MFO mode endpoints
+
+ WC status of IBV_WC_RETRY_EXC_ERR reported back to MIC client
+
+ Operation processing thread doesn't yield properly
+ to enable tx thread to process completions and replenish
+ WR resources. Retries occur too quickly.
+
+ add some new error logs for resource issues.
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit a912661e87650e2313757e0b02cbbfbf35570bb7
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Wed Oct 21 09:49:45 2015 -0700
+
+ release note update for CCL Proxy and Platform BIOS recommendations
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit cb4cb9e300216a2c94082b9fe5df939c4972e1e9
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Fri Oct 16 13:08:11 2015 -0700
+
+ dtestx: add dat_ib_open_query only option with -q
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 1f4baf860cf2c17960885df7ff49cc0021fe317e
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Fri Oct 16 10:21:19 2015 -0700
+
+ scm: CONN_PENDING: SOCKOPT ERR Connection refused ->
+
+ Error caused by cm_msg size compatibility issue with new v8
+ protocol and older socket cm providers (2.1.4 and older).
+ The ucm, cma, and mcm providers are not affected.
+
+ Modify socket data sizes for SCM request/reply to interoperate
+ between new v8 with smaller private data and older protocols.
+
+ Adjust SCM reply/rtu based on remote CM version and retry a failed
+ request with pre-v8 adjusted size in case of server side failure.
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
+commit 0494ec10176e07804c26b28484535252e47c3f99
+Author: Arlin Davis <arlin.r.davis at intel.com>
+Date: Tue Sep 29 20:23:58 2015 -0700
+
+ Release 2.1.7
+
+ Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
+
commit 963e5d793867644c770c087f1ef443550779ca8c
Author: Arlin Davis <arlin.r.davis at intel.com>
Date: Tue Sep 29 09:05:27 2015 -0700
diff --git a/Makefile.am b/Makefile.am
index 7adaf43..ea99e6e 100755
--- a/Makefile.am
+++ b/Makefile.am
@@ -723,6 +723,7 @@ EXTRA_DIST = dat/common/dat_dictionary.h \
dapl.spec.in \
mpxyd.init.in \
$(man_MANS) \
+ test/dtest/scripts/dtest_suite.sh \
test/dapltest/scripts/cl.sh \
test/dapltest/scripts/srv.sh \
test/dapltest/scripts/regress.sh \
diff --git a/Makefile.in b/Makefile.in
index e2ed238..0acf971 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1862,6 +1862,7 @@ EXTRA_DIST = dat/common/dat_dictionary.h \
dapl.spec.in \
mpxyd.init.in \
$(man_MANS) \
+ test/dtest/scripts/dtest_suite.sh \
test/dapltest/scripts/cl.sh \
test/dapltest/scripts/srv.sh \
test/dapltest/scripts/regress.sh \
diff --git a/configure b/configure
index a1f1a89..fca96a4 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.67 for dapl 2.1.7.
+# Generated by GNU Autoconf 2.67 for dapl 2.1.8.
#
# Report bugs to <linux-rdma at vger.kernel.org>.
#
@@ -562,8 +562,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='dapl'
PACKAGE_TARNAME='dapl'
-PACKAGE_VERSION='2.1.7'
-PACKAGE_STRING='dapl 2.1.7'
+PACKAGE_VERSION='2.1.8'
+PACKAGE_STRING='dapl 2.1.8'
PACKAGE_BUGREPORT='linux-rdma at vger.kernel.org'
PACKAGE_URL=''
@@ -1318,7 +1318,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures dapl 2.1.7 to adapt to many kinds of systems.
+\`configure' configures dapl 2.1.8 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1388,7 +1388,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of dapl 2.1.7:";;
+ short | recursive ) echo "Configuration of dapl 2.1.8:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-dapl configure 2.1.7
+dapl configure 2.1.8
generated by GNU Autoconf 2.67
Copyright (C) 2010 Free Software Foundation, Inc.
@@ -1935,7 +1935,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by dapl $as_me 2.1.7, which was
+It was created by dapl $as_me 2.1.8, which was
generated by GNU Autoconf 2.67. Invocation command line was
$ $0 $@
@@ -2803,7 +2803,7 @@ fi
# Define the identity of the package.
PACKAGE=dapl
- VERSION=2.1.7
+ VERSION=2.1.8
cat >>confdefs.h <<_ACEOF
@@ -13281,7 +13281,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by dapl $as_me 2.1.7, which was
+This file was extended by dapl $as_me 2.1.8, which was
generated by GNU Autoconf 2.67. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -13347,7 +13347,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-dapl config.status 2.1.7
+dapl config.status 2.1.8
configured by $0, generated by GNU Autoconf 2.67,
with options \\"\$ac_cs_config\\"
diff --git a/configure.in b/configure.in
index 5fbbfe1..56c99e5 100755
--- a/configure.in
+++ b/configure.in
@@ -1,12 +1,12 @@
dnl Process this file with autoconf to produce a configure script.
AC_PREREQ(2.57)
-AC_INIT(dapl, 2.1.7, linux-rdma at vger.kernel.org)
+AC_INIT(dapl, 2.1.8, linux-rdma at vger.kernel.org)
AC_CONFIG_SRCDIR([dat/udat/udat.c])
AC_CONFIG_AUX_DIR(config)
AC_CONFIG_MACRO_DIR([m4])
AM_CONFIG_HEADER(config.h)
-AM_INIT_AUTOMAKE(dapl, 2.1.7)
+AM_INIT_AUTOMAKE(dapl, 2.1.8)
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AM_PROG_LIBTOOL
diff --git a/dapl.spec b/dapl.spec
index 4d26fd3..74761ca 100644
--- a/dapl.spec
+++ b/dapl.spec
@@ -37,7 +37,7 @@
%{!?_CONF: %define _CONF ""}
Name: dapl
-Version: 2.1.7
+Version: 2.1.8
Release: 1%{?dist}
Summary: A Library for userspace access to RDMA devices using OS Agnostic DAT APIs, proxy daemon for offloading RDMA
@@ -153,6 +153,9 @@ fi
mv /tmp/%{version}-dat.conf %{_sysconfdir}/dat.conf
%changelog
+* Tue Feb 16 2016 Arlin Davis <ardavis at ichips.intel.com> - 2.1.8
+- DAT/DAPL Version 2.1.8 Release 1, OFED 3.18-2, MPSS 3.7
+
* Tue Sep 29 2015 Arlin Davis <ardavis at ichips.intel.com> - 2.1.7
- DAT/DAPL Version 2.1.7 Release 1, OFED 3.18-1 GA
diff --git a/dapl.spec.in b/dapl.spec.in
index 2f16477..fbec001 100755
--- a/dapl.spec.in
+++ b/dapl.spec.in
@@ -153,6 +153,9 @@ fi
mv /tmp/%{version}-dat.conf %{_sysconfdir}/dat.conf
%changelog
+* Tue Feb 16 2016 Arlin Davis <ardavis at ichips.intel.com> - 2.1.8
+- DAT/DAPL Version 2.1.8 Release 1, OFED 3.18-2, MPSS 3.7
+
* Tue Sep 29 2015 Arlin Davis <ardavis at ichips.intel.com> - 2.1.7
- DAT/DAPL Version 2.1.7 Release 1, OFED 3.18-1 GA
diff --git a/dapl/openib_cma/device.c b/dapl/openib_cma/device.c
index 9e87355..ff6c174 100755
--- a/dapl/openib_cma/device.c
+++ b/dapl/openib_cma/device.c
@@ -394,9 +394,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name,
#endif
done:
- /* set default IB MTU */
- hca_ptr->ib_trans.ib_cm.mtu = dapl_ib_mtu(2048);
-
return DAT_SUCCESS;
}
diff --git a/dapl/openib_common/dapl_ib_common.h b/dapl/openib_common/dapl_ib_common.h
index 69ec31b..8ff898f 100755
--- a/dapl/openib_common/dapl_ib_common.h
+++ b/dapl/openib_common/dapl_ib_common.h
@@ -71,6 +71,7 @@ struct dcm_ib_qp {
char *wr_buf_rx; /* mcm_wr_rx_t entries, devices without inline data */
struct ibv_mr *wr_buf_rx_mr;
#endif
+ uint8_t mtu; /* RC QP MTU, cm exchange, min(local,peer) */
};
#define DCM_CQ_TX 0x1
@@ -150,7 +151,8 @@ typedef struct _ib_cm_msg
uint8_t sportx; /* extend to 24 bits */
uint8_t dportx; /* extend to 24 bits */
uint8_t rtns; /* retransmissions */
- uint8_t resv[2];
+ uint8_t mtu; /* MTU */
+ uint8_t resv[1];
union dcm_addr saddr;
union dcm_addr daddr;
union dcm_addr saddr_alt;
@@ -168,6 +170,11 @@ typedef struct _ib_named_attr
const char *mtu;
const char *port;
const char *port_num;
+ const char *iw_ext;
+ const char *ib_ext;
+ const char *i_data;
+ const char *f_add;
+ const char *c_swap;
} ib_named_attr_t;
@@ -243,7 +250,7 @@ typedef uint16_t ib_hca_port_t;
#define DCM_ACK_RETRY 7 /* 3 bits, 7 * 4.2 == 30 seconds */
#define DCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
#define DCM_RNR_RETRY 7 /* 3 bits, 7 == infinite */
-#define DCM_IB_MTU 2048
+#define DCM_IB_MTU 4096 /* new default MTU size */
/* Global routing defaults */
#define DCM_GLOBAL 0 /* global routing is disabled */
diff --git a/dapl/openib_common/dapl_mic_common.h b/dapl/openib_common/dapl_mic_common.h
index 86a815e..0231013 100755
--- a/dapl/openib_common/dapl_mic_common.h
+++ b/dapl/openib_common/dapl_mic_common.h
@@ -234,7 +234,8 @@ typedef struct dat_mcm_msg
uint32_t s_id; /* src pid */
uint32_t d_id; /* dst pid */
uint8_t rd_in; /* atomic_rd_in */
- uint8_t rsvd[4];
+ uint8_t mtu; /* mtu */
+ uint8_t rsvd[3];
uint8_t seg_sz; /* data segment size in power of 2 */
dat_mcm_addr_t saddr1; /* QPt local, MPXY or MCM on non-MIC node */
dat_mcm_addr_t saddr2; /* QPr local, MIC or MCM on non-MIC node or MPXY */
@@ -369,6 +370,7 @@ typedef enum dat_mix_op_flags
MIX_OP_ASYNC = 0x08,
MIX_OP_INLINE = 0x10,
MIX_OP_SET = 0x20,
+ MIX_OP_MTU = 0x40,
} dat_mix_op_flags_t;
diff --git a/dapl/openib_common/mem.c b/dapl/openib_common/mem.c
index 7f5ea6a..34e4234 100755
--- a/dapl/openib_common/mem.c
+++ b/dapl/openib_common/mem.c
@@ -164,7 +164,6 @@ dapls_ib_mr_register(IN DAPL_IA * ia_ptr,
IN DAT_MEM_PRIV_FLAGS privileges, IN DAT_VA_TYPE va_type)
#ifdef _OPENIB_MCM_
{
- struct ibv_device *ibv_dev;
int ib_access = dapls_convert_privileges(privileges);
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
diff --git a/dapl/openib_common/qp.c b/dapl/openib_common/qp.c
index 01f91ca..3d622ab 100755
--- a/dapl/openib_common/qp.c
+++ b/dapl/openib_common/qp.c
@@ -648,19 +648,22 @@ dapls_modify_qp_state(IN struct ibv_qp *qp_handle,
qp_attr.dest_qp_num = ntohl(qpn);
qp_attr.rq_psn = 1;
- qp_attr.path_mtu = ia_ptr->hca_ptr->ib_trans.ib_cm.mtu;
qp_attr.min_rnr_timer = ia_ptr->hca_ptr->ib_trans.ib_cm.rnr_timer;
+ qp_attr.path_mtu = ep_ptr->qp_handle->mtu ?
+ ep_ptr->qp_handle->mtu :
+ ia_ptr->hca_ptr->ib_trans.ib_cm.mtu;
#ifdef _OPENIB_MCM_
qp_attr.max_dest_rd_atomic = ia_ptr->hca_ptr->ib_trans.ib_cm.rd_atom_in;
#else
qp_attr.max_dest_rd_atomic = ep_ptr->param.ep_attr.max_rdma_read_in;
#endif
- dapl_dbg_log(DAPL_DBG_TYPE_EP,
- " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x"
- " port %d ep %p qp_state %d rd_atomic %d\n",
- qp_handle->qp_type, qp_handle->qp_num,
- ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num,
- ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic);
+ dapl_log(DAPL_DBG_TYPE_EP,
+ " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x"
+ " port %d ep %p qp_state %d rd_atomic %d mtu %d lmtu %d\n",
+ qp_handle->qp_type, qp_handle->qp_num,
+ ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num,
+ ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic,
+ qp_attr.path_mtu, ia_ptr->hca_ptr->ib_trans.ib_cm.mtu);
/* address handle. RC and UD */
qp_attr.ah_attr.dlid = ntohs(lid);
diff --git a/dapl/openib_common/util.c b/dapl/openib_common/util.c
index 55bda3b..ec5d72f 100755
--- a/dapl/openib_common/util.c
+++ b/dapl/openib_common/util.c
@@ -285,7 +285,7 @@ enum ibv_mtu dapl_ib_mtu(int mtu)
case 4096:
return IBV_MTU_4096;
default:
- return IBV_MTU_1024;
+ return IBV_MTU_4096;
}
}
@@ -303,7 +303,7 @@ const char *dapl_ib_mtu_str(enum ibv_mtu mtu)
case IBV_MTU_4096:
return "4096";
default:
- return "1024";
+ return "4096";
}
}
@@ -424,6 +424,13 @@ DAT_RETURN dapls_ib_query_hca(IN DAPL_HCA * hca_ptr,
dev_attr.max_qp_wr = DAPL_MIN(dev_attr.max_qp_wr,
dapl_os_get_env_val("DAPL_WR_MAX", dev_attr.max_qp_wr));
+ /* MTU to active by default, reset if env set and <= active_mtu */
+ if (getenv("DAPL_IB_MTU"))
+ tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu,
+ dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU)));
+ else
+ tp->ib_cm.mtu = port_attr.active_mtu;
+
#ifdef _OPENIB_MCM_
/* Adjust for CCL Proxy; limited sge's, no READ support, reduce QP and RDMA limits */
dev_attr.max_sge = DAPL_MIN(dev_attr.max_sge, DAT_MIX_SGE_MAX);
@@ -483,21 +490,32 @@ DAT_RETURN dapls_ib_query_hca(IN DAPL_HCA * hca_ptr,
#ifdef _OPENIB_MCM_
if (!MFO_EP(&hca_ptr->ib_trans.addr))
#endif
- if (hca_ptr->ib_hca_handle->device->transport_type == IBV_TRANSPORT_IWARP)
- ia_attr->max_iov_segments_per_rdma_read = 1;
+ if (hca_ptr->ib_hca_handle->device->transport_type == IBV_TRANSPORT_IWARP)
+ ia_attr->max_iov_segments_per_rdma_read = 1;
ia_attr->max_iov_segments_per_rdma_write = dev_attr.max_sge;
ia_attr->num_transport_attr = 0;
ia_attr->transport_attr = NULL;
ia_attr->num_vendor_attr = 0;
ia_attr->vendor_attr = NULL;
#ifdef DAT_EXTENSIONS
- ia_attr->extension_supported = DAT_EXTENSION_IB;
- ia_attr->extension_version = DAT_IB_EXTENSION_VERSION;
+ if (hca_ptr->ib_hca_handle->device->transport_type == IBV_TRANSPORT_IWARP) {
+ ia_attr->extension_supported = DAT_EXTENSION_NONE;
+ tp->na.ib_ext = "FALSE";
+ tp->na.i_data = "FALSE";
+ tp->na.f_add = "FALSE";
+ tp->na.c_swap = "FALSE";
+ } else {
+ ia_attr->extension_supported = DAT_EXTENSION_IB;
+ ia_attr->extension_version = DAT_IB_EXTENSION_VERSION;
+ tp->na.ib_ext = "TRUE";
+ tp->na.i_data = "TRUE";
+ tp->na.f_add = "TRUE";
+ tp->na.c_swap = "TRUE";
+ }
#endif
/* save key device attributes for CM exchange */
tp->ib_cm.rd_atom_in = dev_attr.max_qp_rd_atom;
tp->ib_cm.rd_atom_out = dev_attr.max_qp_init_rd_atom;
- tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu, tp->ib_cm.mtu);
tp->ib_cm.ack_timer = DAPL_MAX(dev_attr.local_ca_ack_delay, tp->ib_cm.ack_timer);
/* set provider/transport specific named attributes */
@@ -888,28 +906,47 @@ DAT_NAMED_ATTR ib_attrs[] = {
void dapls_query_provider_specific_attr(IN DAPL_IA * ia_ptr,
IN DAT_PROVIDER_ATTR * attr_ptr)
{
+ int i;
+
attr_ptr->num_provider_specific_attr = SPEC_ATTR_SIZE(ib_attrs);
attr_ptr->provider_specific_attr = ib_attrs;
dapl_log(DAPL_DBG_TYPE_UTIL,
" prov_attr: %p sz %d\n", ib_attrs, SPEC_ATTR_SIZE(ib_attrs));
- /* update common attributes from providers */
- ib_attrs[1].value = ia_ptr->hca_ptr->ib_trans.na.dev;
- ib_attrs[2].value = ia_ptr->hca_ptr->ib_trans.na.mode;
- ib_attrs[3].value = ia_ptr->hca_ptr->ib_trans.na.read;
- ib_attrs[4].value = ia_ptr->hca_ptr->ib_trans.guid_str;
- ib_attrs[5].value = ia_ptr->hca_ptr->ib_trans.na.mtu;
- ib_attrs[6].value = ia_ptr->hca_ptr->ib_trans.na.port;
- ib_attrs[7].value = ia_ptr->hca_ptr->ib_trans.na.port_num;
+ for (i=0; i<SPEC_ATTR_SIZE(ib_attrs); i++) {
+ /* update attributes from IA query results */
+ if (!strcmp(ib_attrs[i].name, "DAT_IB_DEVICE_NAME"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.dev;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_CONNECTIVITY_MODE"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.mode;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_RDMA_READ"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.read;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_NODE_GUID"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.guid_str;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_TRANSPORT_MTU"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.mtu;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_PORT_STATUS"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.port;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_PORT_NUMBER"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.port_num;
+ else if (!strcmp(ib_attrs[i].name, "DAT_EXTENSION_INTERFACE"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.ib_ext;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_IMMED_DATA"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.i_data;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_FETCH_AND_ADD"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.f_add;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_CMP_AND_SWAP"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.na.c_swap;
#ifdef _OPENIB_MCM_
-{
- int i = attr_ptr->num_provider_specific_attr;
- ib_attrs[i-3].value = ia_ptr->hca_ptr->ib_trans.fam_str;
- ib_attrs[i-2].value = ia_ptr->hca_ptr->ib_trans.mod_str;
- ib_attrs[i-1].value = ia_ptr->hca_ptr->ib_trans.ver_str;
-}
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_PROXY_CPU_FAMILY"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.fam_str;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_PROXY_CPU_MODEL"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.mod_str;
+ else if (!strcmp(ib_attrs[i].name, "DAT_IB_PROXY_VERSION"))
+ ib_attrs[i].value = ia_ptr->hca_ptr->ib_trans.ver_str;
#endif
+ }
}
/*
diff --git a/dapl/openib_mcm/cm.c b/dapl/openib_mcm/cm.c
index f2a4b8d..48ff0b3 100755
--- a/dapl/openib_mcm/cm.c
+++ b/dapl/openib_mcm/cm.c
@@ -1104,6 +1104,11 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
if (msg->seg_sz) /* set po2 seg_sz, if provided */
cm->msg.seg_sz = msg->seg_sz;
+ /* Set QP MTU, if negotiated. 2K for compatibility */
+ ep->qp_handle->mtu = msg->mtu ?
+ DAPL_MIN(msg->mtu, cm->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
cm->msg.d_id = msg->s_id;
dapl_os_memcpy(&ep->remote_ia_address, &msg->saddr2, sizeof(dat_mcm_addr_t));
dapl_os_memcpy(&cm->msg.daddr2, &msg->saddr2, sizeof(dat_mcm_addr_t));
@@ -1129,10 +1134,12 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
}
dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " CONN_RTU: DST lid=%x, QPr=%x, QPt=%x qp_type=%d, port=%x psize=%d\n",
+ " CONN_RTU: DST lid=%x, QPr=%x, QPt=%x qp_type=%d,"
+ " port=%x psize=%d mtu=%d,%d\n",
ntohs(cm->msg.daddr1.lid), ntohl(cm->msg.daddr1.qpn),
ntohl(cm->msg.daddr2.qpn), cm->msg.daddr1.qp_type,
- ntohs(msg->sport), ntohs(msg->p_size));
+ ntohs(msg->sport), ntohs(msg->p_size),
+ cm->tp->ib_cm.mtu, cm->msg.mtu);
if (ntohs(msg->op) == MCM_REP)
event = IB_CME_CONNECTED;
@@ -1227,6 +1234,7 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
/* Send RTU, no private data */
cm->msg.op = htons(MCM_RTU);
+ cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
dapl_os_lock(&cm->lock);
cm->state = MCM_CONNECTED;
@@ -1249,11 +1257,12 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
cm->msg.p_data, ntohs(cm->msg.p_size), cm->ep);
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s\n",
+ " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s mtu %d\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid),
ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn),
ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map));
+ ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map),
+ cm->ep->qp_handle->mtu);
mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 0);
@@ -1291,6 +1300,7 @@ static void mcm_accept(ib_cm_srvc_handle_t cm, dat_mcm_msg_t *msg)
acm->msg.p_size = msg->p_size;
acm->msg.d_id = msg->s_id;
acm->msg.rd_in = msg->rd_in;
+ acm->msg.mtu = msg->mtu; /* save peer MTU */
if (msg->seg_sz) /* set po2 seg_sz, if provided */
acm->msg.seg_sz = msg->seg_sz;
@@ -1359,11 +1369,12 @@ static void mcm_accept_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
dapls_cr_callback(cm, IB_CME_CONNECTED, NULL, 0, cm->sp);
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s\n",
+ " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s mtu %d\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid),
ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn),
ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map));
+ ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map),
+ cm->ep->qp_handle->mtu);
mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 1);
return;
@@ -1489,6 +1500,11 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
ep->param.ep_attr.max_rdma_read_out =
DAPL_MIN(ep->param.ep_attr.max_rdma_read_out, cm->msg.rd_in);
+ /* Set QP MTU, if negotiated. 2K for compatibility */
+ ep->qp_handle->mtu = cm->msg.mtu ?
+ DAPL_MIN(cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
/* modify QPr to RTR and then to RTS, QPr (qp) to remote QPt (daddr2), !xsocket */
dapl_os_lock(&ep->header.lock);
if (!MXF_EP(&cm->hca->ib_trans.addr)) {
@@ -1567,6 +1583,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
/* setup local QPr info (if !KR) and type from EP, copy pdata, for reply */
cm->msg.op = htons(MCM_REP);
cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+ cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
if (!MXF_EP(&cm->hca->ib_trans.addr)) {
cm->msg.saddr1.qpn = htonl(ep->qp_handle->qp->qp_num);
@@ -1680,6 +1697,7 @@ dapls_ib_connect(IN DAT_EP_HANDLE ep_handle,
/* set max rdma inbound requests */
cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+ cm->msg.mtu = cm->tp->ib_cm.mtu; /* local MTU to peer */
if (p_size) {
cm->msg.p_size = htons(p_size);
diff --git a/dapl/openib_mcm/mix.c b/dapl/openib_mcm/mix.c
index 970b372..5d96eb5 100755
--- a/dapl/openib_mcm/mix.c
+++ b/dapl/openib_mcm/mix.c
@@ -62,8 +62,8 @@ int dapli_mix_mode(ib_hca_transport_t *tp, char *name)
return 0;
}
- /* MIC node: "qib" device requires full offload */
- mfo_dev = !dapl_os_pstrcmp("qib", name);
+ /* MIC node: "qib" and "hfi" devices requires full offload */
+ mfo_dev = !dapl_os_pstrcmp("qib", name) || !dapl_os_pstrcmp("hfi", name);
if (mfo_mode || mfo_dev) {
tp->addr.ep_map = MIC_FULL_DEV; /* MIC with full proxy offload, no direct verbs */
}
@@ -171,6 +171,9 @@ int dapli_mix_open(ib_hca_transport_t *tp, char *name, int port, int query_only)
msg.port = port;
strcpy((char*)&msg.name, name);
+ if (getenv("DAPL_IB_MTU"))
+ msg.hdr.flags |= MIX_OP_MTU;
+
/* send any overridden attributes to proxy */
msg.dev_attr.ack_timer = tp->ib_cm.ack_timer;
msg.dev_attr.ack_retry = tp->ib_cm.ack_retry;
diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index ecc5418..35164ef 100755
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -63,6 +63,8 @@
extern char *gid_str;
+#define SCM_BC_DIFF 2 /* cm_msg adjust, pre v8 */
+
/* forward declarations */
static DAT_RETURN
dapli_socket_connect(DAPL_EP * ep_ptr,
@@ -561,6 +563,11 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
/* send qp info and pdata to remote peer */
exp = sizeof(ib_cm_msg_t) - DCM_MAX_PDATA_SIZE;
+ if (cm_ptr->retry == SCM_CR_RETRY-1) {
+ exp += SCM_BC_DIFF; /* retry with pre-v8 req */
+ dapl_log(DAPL_DBG_TYPE_CM_WARN,
+ " CONN_REQ: retry pre-v8 protocol; peer != v8\n");
+ }
iov[0].iov_base = (void *)&cm_ptr->msg;
iov[0].iov_len = exp;
if (cm_ptr->msg.p_size) {
@@ -583,16 +590,14 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
goto bail;
}
- dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " CONN_PENDING: sending SRC lid=0x%x,"
- " qpn=0x%x, psize=%d\n",
- ntohs(cm_ptr->msg.saddr.ib.lid),
- ntohl(cm_ptr->msg.saddr.ib.qpn),
- ntohs(cm_ptr->msg.p_size));
- dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " CONN_PENDING: SRC GID %s\n",
- inet_ntop(AF_INET6, &cm_ptr->hca->ib_trans.gid,
- gid_str, sizeof(gid_str)));
+ dapl_log(DAPL_DBG_TYPE_CM,
+ " CONN_REQ: (%d) SRC lid=0x%x,"
+ " qpn=0x%x, psize=%d GID %s\n",
+ exp, ntohs(cm_ptr->msg.saddr.ib.lid),
+ ntohl(cm_ptr->msg.saddr.ib.qpn),
+ ntohs(cm_ptr->msg.p_size),
+ inet_ntop(AF_INET6, &cm_ptr->hca->ib_trans.gid,
+ gid_str, sizeof(gid_str)));
DAPL_CNTR(((DAPL_IA *)dapl_llist_peek_head(&cm_ptr->hca->ia_list_head)), DCNT_IA_CM_REQ_TX);
return;
@@ -666,6 +671,7 @@ dapli_socket_connect(DAPL_EP * ep_ptr,
/* REQ: QP info in msg.saddr, IA address in msg.daddr, and pdata */
cm_ptr->hca = ia_ptr->hca_ptr;
cm_ptr->msg.op = ntohs(DCM_REQ);
+ cm_ptr->msg.mtu = ia_ptr->hca_ptr->ib_trans.ib_cm.mtu; /* local MTU to peer */
cm_ptr->msg.saddr.ib.qpn = htonl(ep_ptr->qp_handle->qp->qp_num);
cm_ptr->msg.saddr.ib.qp_type = ep_ptr->qp_handle->qp->qp_type;
cm_ptr->msg.saddr.ib.lid = ia_ptr->hca_ptr->ib_trans.lid;
@@ -730,9 +736,16 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
socklen_t sl;
/* read DST information into cm_ptr, overwrite SRC info */
- dapl_dbg_log(DAPL_DBG_TYPE_EP, " connect_rtu: recv peer QP data\n");
-
len = recv(cm_ptr->socket, (char *)&cm_ptr->msg, exp, 0);
+
+ if (ntohs(cm_ptr->msg.ver) < DCM_VER_XPS) {
+ len += recv(cm_ptr->socket, (char *)&cm_ptr->msg+len, SCM_BC_DIFF, 0);
+ exp += SCM_BC_DIFF;
+ }
+
+ dapl_log(DAPL_DBG_TYPE_CM, " CONN_REP_IN: ver %d cm_sz %d, p_sz %d\n",
+ ntohs(cm_ptr->msg.ver), exp, ntohs(cm_ptr->msg.p_size));
+
if (len != exp || ntohs(cm_ptr->msg.ver) < DCM_VER_MIN) {
int err = dapl_socket_errno();
dapl_log(DAPL_DBG_TYPE_CM_WARN,
@@ -745,9 +758,10 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
cm_ptr->retry);
/* Retry; corner case where server tcp stack resets under load */
- if (err == ECONNRESET && --cm_ptr->retry) {
+ if ((err == ECONNRESET && --cm_ptr->retry) || (--cm_ptr->retry == SCM_CR_RETRY-1)) {
closesocket(cm_ptr->socket);
cm_ptr->socket = DAPL_INVALID_SOCKET;
+ dapl_log(DAPL_DBG_TYPE_CM_WARN, "CONN_REQ: retry %d\n", cm_ptr->retry);
dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr,
ntohs(((struct sockaddr_in *)&cm_ptr->addr)->sin_port) - 1000,
ntohs(cm_ptr->msg.p_size), &cm_ptr->msg.p_data, cm_ptr->retry);
@@ -800,9 +814,10 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
len = recv(cm_ptr->socket, cm_ptr->msg.p_data, exp, 0);
if (len != exp) {
int err = dapl_socket_errno();
- dapl_log(DAPL_DBG_TYPE_ERR,
- " CONN_RTU read pdata: ERR 0x%x %s, rcnt=%d -> %s\n",
- err, strerror(err), len,
+ dapl_log(DAPL_DBG_TYPE_CM_WARN,
+ " CONN_RTU read pdata: ERR 0x%x %s"
+ " rcv %d != exp %d -> %s\n",
+ err, strerror(err), len, exp,
inet_ntoa(((struct sockaddr_in *)
ep_ptr->param.
remote_ia_address_ptr)->sin_addr));
@@ -834,6 +849,11 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
DAPL_MIN(ep_ptr->param.ep_attr.max_rdma_read_out,
cm_ptr->msg.rd_in);
+ /* Set QP MTU, if negotiated. 2K for compatibility */
+ ep_ptr->qp_handle->mtu = cm_ptr->msg.mtu ?
+ DAPL_MIN(cm_ptr->msg.mtu, cm_ptr->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm_ptr->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
/* modify QP to RTR and then to RTS with remote info */
dapl_os_lock(&ep_ptr->header.lock);
if (dapls_modify_qp_state(ep_ptr->qp_handle->qp,
@@ -881,6 +901,7 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
dapl_os_unlock(&cm_ptr->lock);
cm_ptr->msg.op = ntohs(DCM_RTU);
+ cm_ptr->msg.mtu = ep_ptr->qp_handle->mtu; /* send negotiated MTU */
if (send(cm_ptr->socket, (char *)&cm_ptr->msg, 4, 0) == -1) {
int err = dapl_socket_errno();
dapl_log(DAPL_DBG_TYPE_ERR,
@@ -954,10 +975,11 @@ ud_bail:
DCM_MAX_PDATA_SIZE, ep_ptr);
}
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " SCM ACTIVE CONN: %x -> %s %x\n",
+ " SCM ACTIVE CONN: %x -> %s %x mtu %d\n",
ntohs(((struct sockaddr_in *) &cm_ptr->addr)->sin_port),
inet_ntoa(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_addr),
- ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)-1000);
+ ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)-1000,
+ ep_ptr->qp_handle->mtu);
return;
bail:
@@ -1121,6 +1143,12 @@ static void dapli_socket_accept_data(ib_cm_srvc_handle_t acm_ptr)
/* read in DST QP info, IA address. check for private data */
len = recv(acm_ptr->socket, (char *)&acm_ptr->msg, exp, 0);
+
+ if (ntohs(acm_ptr->msg.ver) < DCM_VER_XPS) {
+ len += recv(acm_ptr->socket, (char *)&acm_ptr->msg+len, SCM_BC_DIFF, 0);
+ exp += SCM_BC_DIFF;
+ }
+
if (len != exp || ntohs(acm_ptr->msg.ver) < DCM_VER_MIN) {
int err = dapl_socket_errno();
dapl_log(DAPL_DBG_TYPE_ERR,
@@ -1251,6 +1279,14 @@ dapli_socket_accept_usr(DAPL_EP * ep_ptr,
DAPL_MIN(ep_ptr->param.ep_attr.max_rdma_read_out,
cm_ptr->msg.rd_in);
+ if (ntohs(cm_ptr->msg.ver) < DCM_VER_XPS)
+ exp += SCM_BC_DIFF;
+
+ /* Set QP MTU, if negotiated. 2K for compatibility */
+ ep_ptr->qp_handle->mtu = cm_ptr->msg.mtu ?
+ DAPL_MIN(cm_ptr->msg.mtu, cm_ptr->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm_ptr->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
/* modify QP to RTR and then to RTS with remote info already read */
dapl_os_lock(&ep_ptr->header.lock);
if (dapls_modify_qp_state(ep_ptr->qp_handle->qp,
@@ -1290,6 +1326,7 @@ dapli_socket_accept_usr(DAPL_EP * ep_ptr,
local.ver = htons(DCM_VER);
local.op = htons(DCM_REP);
local.rd_in = ep_ptr->param.ep_attr.max_rdma_read_in;
+ local.mtu = ep_ptr->qp_handle->mtu; /* send negotiated MTU */
local.saddr.ib.qpn = htonl(ep_ptr->qp_handle->qp->qp_num);
local.saddr.ib.qp_type = ep_ptr->qp_handle->qp->qp_type;
local.saddr.ib.lid = ia_ptr->hca_ptr->ib_trans.lid;
@@ -1435,10 +1472,11 @@ ud_bail:
dapls_cr_callback(cm_ptr, event, NULL, 0, cm_ptr->sp);
}
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " SCM PASSIVE CONN: %x <- %s %x\n",
+ " SCM PASSIVE CONN: %x <- %s %x mtu %d\n",
cm_ptr->sp->conn_qual,
inet_ntoa(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_addr),
- ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port));
+ ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port),
+ cm_ptr->ep->qp_handle->mtu);
return;
bail:
diff --git a/dapl/openib_scm/dapl_ib_util.h b/dapl/openib_scm/dapl_ib_util.h
index b03018b..ad5bc60 100755
--- a/dapl/openib_scm/dapl_ib_util.h
+++ b/dapl/openib_scm/dapl_ib_util.h
@@ -65,7 +65,6 @@ typedef dp_ib_cm_handle_t ib_cm_srvc_handle_t;
#define SCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
#define SCM_RNR_RETRY 7 /* 3 bits, 7 == infinite */
#define SCM_CR_RETRY 5 /* retries for busy server, connect refused */
-#define SCM_IB_MTU 2048
/* Global routing defaults */
#define SCM_GLOBAL 0 /* global routing is disabled */
diff --git a/dapl/openib_scm/device.c b/dapl/openib_scm/device.c
index 43f9eaf..b210a15 100755
--- a/dapl/openib_scm/device.c
+++ b/dapl/openib_scm/device.c
@@ -371,8 +371,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name,
dapl_os_get_env_val("DAPL_HOP_LIMIT", SCM_HOP_LIMIT);
hca_ptr->ib_trans.ib_cm.tclass =
dapl_os_get_env_val("DAPL_TCLASS", SCM_TCLASS);
- hca_ptr->ib_trans.ib_cm.mtu =
- dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", SCM_IB_MTU));
if (flags & DAPL_OPEN_QUERY)
goto done;
diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index 3d06c82..88dd890 100755
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -622,14 +622,15 @@ dp_ib_cm_handle_t ucm_cm_find(ib_hca_transport_t *tp, ib_cm_msg_t *msg)
lock = &tp->lock;
dapl_log(DAPL_DBG_TYPE_CM,
- " ucm_recv: %s %d %x %x i %x c %x < %d %x %x i %x c %x\n",
+ " ucm_recv: %s %d %x %x i %x c %x < %d %x %x i %x c %x (%d,%d)\n",
dapl_cm_op_str(msg_op),
ntohl(msg->d_id), ntohs(msg->daddr.ib.lid),
UCM_PORT_NTOH(msg->dportx, msg->dport),
ntohl(msg->daddr.ib.qpn), ntohl(msg->dqpn),
ntohl(msg->s_id), ntohs(msg->saddr.ib.lid),
UCM_PORT_NTOH(msg->sportx, msg->sport),
- ntohl(msg->saddr.ib.qpn), ntohl(msg->sqpn));
+ ntohl(msg->saddr.ib.qpn), ntohl(msg->sqpn),
+ tp->ib_cm.mtu, msg->mtu);
retry_listenq:
dapl_os_lock(lock);
@@ -1467,12 +1468,13 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
{
dapl_log(DAPL_DBG_TYPE_EP,
" connect: lid %x i_qpn %x lport %x p_sz=%d -> "
- " lid %x c_qpn %x rport %x\n",
+ " lid %x c_qpn %x rport %x l_mtu %d\n",
htons(cm->msg.saddr.ib.lid), htonl(cm->msg.saddr.ib.qpn),
UCM_PORT_NTOH(cm->msg.sportx,cm->msg.sport),
htons(cm->msg.p_size),
htons(cm->msg.daddr.ib.lid), htonl(cm->msg.dqpn),
- UCM_PORT_NTOH(cm->msg.dportx,cm->msg.dport));
+ UCM_PORT_NTOH(cm->msg.dportx,cm->msg.dport),
+ cm->hca->ib_trans.ib_cm.mtu);
dapl_os_lock(&cm->lock);
if (cm->state != DCM_INIT && cm->state != DCM_REP_PENDING) {
@@ -1513,6 +1515,8 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
cm->state = DCM_REP_PENDING;
cm->msg.op = htons(DCM_REQ);
+ cm->msg.mtu = cm->hca->ib_trans.ib_cm.mtu; /* local MTU to peer */
+
if (ucm_send(&cm->hca->ib_trans, &cm->msg,
&cm->msg.p_data, ntohs(cm->msg.p_size))) {
dapl_os_unlock(&cm->lock);
@@ -1638,6 +1642,10 @@ static void ucm_connect_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
cm->ep->param.ep_attr.max_rdma_read_out =
DAPL_MIN(cm->ep->param.ep_attr.max_rdma_read_out,
cm->msg.rd_in);
+ /* Set QP MTU, if negotiated. 2K for compatibility */
+ ep->qp_handle->mtu = msg->mtu ?
+ DAPL_MIN(msg->mtu, cm->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
/* modify QP to RTR and then to RTS with remote info */
dapl_os_lock(&cm->ep->header.lock);
@@ -1671,6 +1679,7 @@ static void ucm_connect_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
/* Send RTU, no private data */
cm->msg.op = htons(DCM_RTU);
+ cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
dapl_os_lock(&cm->lock);
cm->state = DCM_CONNECTED;
@@ -1760,11 +1769,11 @@ ud_bail:
}
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " UCM_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x xevent=%d\n",
+ " UCM_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x mtu %d\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr.ib.lid),
ntohs(cm->msg.sport), ntohl(cm->msg.saddr.ib.qpn),
ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn), sizeof(DAT_IB_EXTENSION_EVENT_DATA));
+ ntohl(cm->msg.dqpn), ep->qp_handle->mtu);
return;
bail:
if (ntohs(msg->op) != DCM_REJ_USER) {
@@ -1812,6 +1821,7 @@ static void ucm_accept(ib_cm_srvc_handle_t cm, ib_cm_msg_t *msg)
acm->msg.p_size = msg->p_size;
acm->msg.d_id = msg->s_id;
acm->msg.rd_in = msg->rd_in;
+ acm->msg.mtu = msg->mtu; /* save peer MTU */
/* CR saddr is CM daddr info, need EP for local saddr */
dapl_os_memcpy(&acm->msg.daddr, &msg->saddr, sizeof(union dcm_addr));
@@ -1832,14 +1842,15 @@ static void ucm_accept(ib_cm_srvc_handle_t cm, ib_cm_msg_t *msg)
dapl_log(DAPL_DBG_TYPE_CM,
" accepting: op %s [id lid, port, cqp, iqp]:"
- " %d %x %x %x %x <- %d %x %x %x %x\n",
+ " %d %x %x %x %x <- %d %x %x %x %x mtu %d\n",
dapl_cm_op_str(ntohs(msg->op)),
ntohl(acm->msg.s_id), ntohs(msg->daddr.ib.lid),
UCM_PORT_NTOH(msg->dportx, msg->dport),
ntohl(msg->dqpn), ntohl(msg->daddr.ib.qpn),
ntohl(msg->s_id), ntohs(msg->saddr.ib.lid),
UCM_PORT_NTOH(msg->sportx, msg->sport),
- ntohl(msg->sqpn), ntohl(msg->saddr.ib.qpn));
+ ntohl(msg->sqpn), ntohl(msg->saddr.ib.qpn),
+ acm->msg.mtu);
#ifdef DAT_EXTENSIONS
if (acm->msg.daddr.ib.qp_type == IBV_QPT_UD) {
@@ -1958,13 +1969,13 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
}
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " UCM_PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x\n",
+ " UCM_PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x mtu %d\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr.ib.lid),
UCM_PORT_NTOH(cm->msg.sportx, cm->msg.sport),
ntohl(cm->msg.saddr.ib.qpn),
ntohs(cm->msg.daddr.ib.lid),
UCM_PORT_NTOH(cm->msg.dportx, cm->msg.dport),
- ntohl(cm->msg.dqpn));
+ ntohl(cm->msg.dqpn), cm->ep->qp_handle->mtu);
return;
bail:
dapl_log(DAPL_DBG_TYPE_CM_WARN,
@@ -2090,11 +2101,11 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" ACCEPT_USR: s_id %d r_id %d lid=%x"
- " iqp=%x qp_type %d, psize=%d\n",
+ " iqp=%x qp_type %d, psize=%d r_mtu %d l_mtu %d\n",
ntohl(cm->msg.s_id), ntohl(cm->msg.d_id),
ntohs(cm->msg.daddr.ib.lid),
ntohl(cm->msg.daddr.ib.qpn), cm->msg.daddr.ib.qp_type,
- p_size);
+ p_size, cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu);
#ifdef DAT_EXTENSIONS
if (cm->msg.daddr.ib.qp_type == IBV_QPT_UD &&
@@ -2110,6 +2121,10 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
ep->param.ep_attr.max_rdma_read_out =
DAPL_MIN(ep->param.ep_attr.max_rdma_read_out,
cm->msg.rd_in);
+ /* Set QP MTU, if negotiated. 2K for compatibility */
+ ep->qp_handle->mtu = cm->msg.mtu ?
+ DAPL_MIN(cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
/* modify QP to RTR and then to RTS with remote info already read */
dapl_os_lock(&ep->header.lock);
@@ -2146,6 +2161,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
/* setup local QP info and type from EP, copy pdata, for reply */
cm->msg.op = htons(DCM_REP);
cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+ cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
cm->msg.saddr.ib.qpn = htonl(ep->qp_handle->qp->qp_num);
cm->msg.saddr.ib.qp_type = ep->qp_handle->qp->qp_type;
cm->msg.saddr.ib.lid = cm->hca->ib_trans.addr.ib.lid;
diff --git a/dapl/openib_ucm/device.c b/dapl/openib_ucm/device.c
index f23c77b..71fee5f 100755
--- a/dapl/openib_ucm/device.c
+++ b/dapl/openib_ucm/device.c
@@ -292,8 +292,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name,
dapl_os_get_env_val("DAPL_HOP_LIMIT", DCM_HOP_LIMIT);
hca_ptr->ib_trans.ib_cm.tclass =
dapl_os_get_env_val("DAPL_TCLASS", DCM_TCLASS);
- hca_ptr->ib_trans.ib_cm.mtu =
- dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU));
if (flags & DAPL_OPEN_QUERY)
goto done;
diff --git a/dapl/svc/mcm.c b/dapl/svc/mcm.c
index 4b91090..7be40b8 100755
--- a/dapl/svc/mcm.c
+++ b/dapl/svc/mcm.c
@@ -346,7 +346,9 @@ int mcm_modify_qp(struct ibv_qp *qp_handle,
qp_attr.dest_qp_num = ntohl(qpn);
qp_attr.rq_psn = 1;
- qp_attr.path_mtu = m_qp->smd->md->dev_attr.mtu;
+ qp_attr.path_mtu = m_qp->mtu ?
+ min(m_qp->mtu, m_qp->smd->md->dev_attr.mtu):
+ m_qp->smd->md->dev_attr.mtu;
qp_attr.max_dest_rd_atomic = 16;
qp_attr.min_rnr_timer = m_qp->smd->md->dev_attr.rnr_timer;
qp_attr.ah_attr.dlid = ntohs(lid);
@@ -1491,6 +1493,7 @@ int mcm_cm_req_out(mcm_cm_t *m_cm)
m_cm->state = MCM_REP_PENDING;
m_cm->msg.op = htons(MCM_REQ);
m_cm->timer = mcm_time_us(); /* reset reply timer */
+ m_cm->msg.mtu = m_cm->smd->md->dev_attr.mtu; /* local MTU to peer */
if (mcm_send(m_cm->md, &m_cm->msg, &m_cm->msg.p_data, ntohs(m_cm->msg.p_size)))
return -1;
@@ -1508,7 +1511,7 @@ int mcm_cm_rtu_out(mcm_cm_t *m_cm)
MCNTR(m_cm->md, MCM_CM_RTU_OUT);
- mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s -> 0x%x %x 0x%x %Lx %s\n",
+ mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s -> 0x%x %x 0x%x %Lx %s mtu %d\n",
m_cm->md->mc->scif_id, m_cm->smd->entry.tid,
m_cm->md->cntrs ? (uint32_t)((uint64_t *)m_cm->md->cntrs)[MCM_CM_RTU_OUT]:0,
m_cm, htons(m_cm->msg.saddr2.lid), htonl(m_cm->msg.saddr2.qpn),
@@ -1516,7 +1519,8 @@ int mcm_cm_rtu_out(mcm_cm_t *m_cm)
htons(m_cm->msg.daddr1.lid),
MXF_EP(&m_cm->msg.saddr1) && MXF_EP(&m_cm->msg.daddr1) ?
htonl(m_cm->msg.daddr2.qpn):htonl(m_cm->msg.daddr1.qpn),
- htons(m_cm->msg.dport), ntohll(r_guid), mcm_map_str(m_cm->msg.daddr1.ep_map));
+ htons(m_cm->msg.dport), ntohll(r_guid), mcm_map_str(m_cm->msg.daddr1.ep_map),
+ m_cm->m_qp->mtu);
mpxy_lock(&m_cm->lock);
if (m_cm->state != MCM_REP_RCV) {
diff --git a/dapl/svc/mix.c b/dapl/svc/mix.c
index cb82499..8e4e622 100755
--- a/dapl/svc/mix.c
+++ b/dapl/svc/mix.c
@@ -186,8 +186,6 @@ void mix_scif_accept(scif_epd_t listen_ep)
mlog(8, " SCIF client: device open client_pid 0x%x - mlen %d - ep %d\n",
ntohl(msg.hdr.req_id), len, op_ep);
- msg.hdr.flags = MIX_OP_RSP;
-
if (msg.hdr.ver < MIX_MIN || msg.hdr.ver > MIX_MAX || msg.hdr.op != MIX_IA_OPEN) {
mlog(0, " ERR: MIC client incompatible with MPXYD (exp %d,rcvd %d) or OP (exp %d,rcvd %d)\n",
DAT_MIX_VER, msg.hdr.ver, msg.hdr.op, MIX_IA_OPEN);
@@ -525,6 +523,27 @@ void m_cq_free(struct mcm_cq *m_cq)
free(m_cq);
}
+/* Drain proxy CQ m_cq: ack one pending channel event, then poll off all queued completions; smd->cqlock/cqrlock held (m_qp_free takes cqlock for cq_tx, cqrlock for cq_rx) */
+void m_cq_flush(struct mcm_cq *m_cq)
+{
+ struct ibv_cq *ib_cq = NULL; /* CQ reported by the channel event; not used after the call */
+ void *cq_ctx; /* CQ context reported by the channel event; not used after the call */
+ int ret, cnt=0; /* cnt: running sum of ibv_poll_cq return values, logged on exit */
+ struct ibv_wc wc; /* scratch completion entry; contents are discarded */
+
+ mlog(8, " m_cq %p enter:\n", m_cq);
+ ret = ibv_get_cq_event(m_cq->ib_ch, &ib_cq, (void *)&cq_ctx); /* NOTE(review): this call waits for an event unless the channel fd is non-blocking - confirm ib_ch is non-blocking */
+ if (ret == 0) /* an event was retrieved, so it has to be acknowledged */
+ ibv_ack_cq_events(m_cq->ib_cq, 1); /* NOTE(review): acks m_cq->ib_cq rather than the ib_cq returned above - presumably the same CQ; confirm */
+
+ do { /* pull completions one at a time until the CQ is empty or the poll fails */
+ ret = ibv_poll_cq(m_cq->ib_cq, 1, &wc);
+ cnt += ret; /* NOTE(review): a negative (error) return is added too, so the logged count can be off by that value */
+ }
+ while (ret > 0);
+ mlog(8, " m_cq %p exit: %d events flushed\n", m_cq, cnt);
+}
+
/* destroy proxy CQ, fits in header */
static int mix_cq_destroy(mcm_scif_dev_t *smd, dat_mix_hdr_t *pmsg)
{
@@ -778,8 +797,26 @@ void m_qp_free(struct mcm_qp *m_qp)
mpxy_unlock(&m_qp->smd->qprlock);
}
- mlog(8, " m_qp %p m_cm %p cm_id %d\n",
- m_qp, m_qp->cm, m_qp->cm ? m_qp->cm->entry.tid:0);
+ mlog(8, " m_qp %p m_cm %p cm_id %d cm_state %d\n",
+ m_qp, m_qp->cm, m_qp->cm ? m_qp->cm->entry.tid:0,
+ m_qp->cm ? m_qp->cm->state:0);
+
+ if (m_qp->cm)
+ m_qp->cm->state = MCM_DISCONNECTED;
+
+ mcm_flush_qp(m_qp); /* QP to error, flush consumer messages */
+
+ if (m_qp->m_cq_tx) { /* flush pending PO WRs on cq_tx */
+ mpxy_lock(&m_qp->smd->cqlock);
+ m_cq_flush(m_qp->m_cq_tx);
+ mpxy_unlock(&m_qp->smd->cqlock);
+ }
+
+ if (m_qp->m_cq_rx) { /* flush pending PI WRs on cq_rx */
+ mpxy_lock(&m_qp->smd->cqrlock);
+ m_cq_flush(m_qp->m_cq_rx);
+ mpxy_unlock(&m_qp->smd->cqrlock);
+ }
if (m_qp->cm) { /* unlink CM, serialized */
struct mcm_cm *cm = m_qp->cm;
@@ -791,7 +828,6 @@ void m_qp_free(struct mcm_qp *m_qp)
mpxy_unlock(&cm->lock);
mcm_dqconn_free(m_qp->smd, cm);
}
- mcm_flush_qp(m_qp); /* move QP to error, flush */
if (m_qp->ib_qp1) {
ibv_destroy_qp(m_qp->ib_qp1);
@@ -812,6 +848,7 @@ void m_qp_free(struct mcm_qp *m_qp)
if (m_qp->m_cq_rx) {
mpxy_lock(&m_qp->smd->cqrlock);
m_cq_free(m_qp->m_cq_rx);
+ m_qp->m_cq_rx = NULL;
mpxy_unlock(&m_qp->smd->cqrlock);
}
mpxy_lock_destroy(&m_qp->txlock); /* proxy out */
@@ -1266,12 +1303,16 @@ void mix_dto_event(struct mcm_cq *m_cq, struct dat_mix_wc *wc, int nc)
if (msg.wc[i].status != IBV_WC_SUCCESS) {
if (msg.wc[i].status != IBV_WC_WR_FLUSH_ERR) {
- mlog(0, " [%d:%d] ERROR (ep=%d): cq %p id %d ctx %p stat %d"
- " op 0x%x ln %d wr_id %p wc's %d verr 0x%x errno=%d,%s\n",
- m_cq->smd->md->mc->scif_id, m_cq->smd->entry.tid,
- m_cq->smd->scif_op_ep, m_cq, msg.cq_id, msg.cq_ctx,
- msg.wc[i].status, msg.wc[i].opcode, msg.wc[i].byte_len,
- msg.wc[i].wr_id, msg.wc_cnt, msg.wc[i].vendor_err,
+ mlog(0, " [%d:%d] ERROR (ep=%d): id %d stat %d"
+ " op %x flg %x ln %d wr_id %p wc's %d"
+ " verr 0x%x errno=%d,%s\n",
+ m_cq->smd->md->mc->scif_id,
+ m_cq->smd->entry.tid,
+ m_cq->smd->scif_op_ep, msg.cq_id,
+ msg.wc[i].status, msg.wc[i].opcode,
+ msg.wc[i].wc_flags, msg.wc[i].byte_len,
+ msg.wc[i].wr_id, msg.wc_cnt,
+ msg.wc[i].vendor_err,
errno, strerror(errno));
}
} else {
@@ -1494,6 +1535,7 @@ static int mix_cm_rtu_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
ntohs(m_cm->msg.daddr1.lid), ntohll(m_cm->msg.sys_guid));
/* send RTU on wire */
+ m_cm->msg.mtu = m_cm->m_qp->mtu; /* send negotiated MTU */
mcm_cm_rtu_out(m_cm);
return 0;
@@ -1598,6 +1640,12 @@ int mix_cm_rep_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
else
m_cm->m_qp->p2p_data = 0;
+ /* Set QP MTU, if negotiated. 2K for compatibility */
+ m_cm->m_qp->mtu = pkt->mtu ?
+ min(pkt->mtu, m_cm->md->dev_attr.mtu):
+ m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048;
+ m_cm->msg.mtu = m_cm->m_qp->mtu; /* forward negotiated MTU */
+
mlog(2, " WRC: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n",
m_cm->m_qp, m_cm->m_qp->wrc.wr_addr, m_cm->m_qp->wrc.wr_rkey,
@@ -1754,6 +1802,7 @@ int mix_cm_req_in(mcm_cm_t *cm, dat_mcm_msg_t *pkt, int pkt_len)
acm->msg.p_size = pkt->p_size;
acm->msg.d_id = pkt->s_id;
acm->msg.rd_in = pkt->rd_in;
+ acm->msg.mtu = pkt->mtu;
#ifdef MPXYD_LOCAL_SUPPORT
acm->msg.sys_guid = pkt->sys_guid; /* remote system guid */;
#else
@@ -1765,13 +1814,14 @@ int mix_cm_req_in(mcm_cm_t *cm, dat_mcm_msg_t *pkt, int pkt_len)
memcpy(&acm->msg.daddr1, &pkt->saddr1, sizeof(dat_mcm_addr_t));
memcpy(&acm->msg.daddr2, &pkt->saddr2, sizeof(dat_mcm_addr_t));
- mlog(2, " [%d:%d] cm %p ep %d sPORT %x %s <- dPORT %x lid=%x psz=%d %s %s %Lx (msg %p %d)\n",
+ mlog(2, " [%d:%d] cm %p ep %d: %x %s <- %x lid=%x psz=%d %s %s %Lx (%p %d) lmtu %d rmtu %d\n",
cm->md->mc->scif_id, cm->smd->entry.tid, acm, acm->smd->scif_ev_ep,
ntohs(acm->msg.sport), mcm_map_str(acm->md->addr.ep_map),
ntohs(acm->msg.dport), ntohs(acm->msg.daddr1.lid), htons(acm->msg.p_size),
mcm_map_str(acm->msg.daddr2.ep_map),
acm->md->addr.lid == acm->msg.daddr1.lid ? "platform":"fabric",
- ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t));
+ ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t),
+ cm->md->dev_attr.mtu, pkt->mtu);
if (pkt->p_size)
memcpy(acm->msg.p_data, pkt->p_data, ntohs(pkt->p_size));
@@ -1806,7 +1856,7 @@ int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
dat_mix_cm_t msg;
int len;
- mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s <- 0x%x %x 0x%x %Lx %s\n",
+ mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s <- 0x%x %x 0x%x %Lx %s mtu %d\n",
m_cm->md->mc->scif_id, m_cm->smd->entry.tid,
m_cm->md->cntrs ? (uint32_t)((uint64_t *)m_cm->md->cntrs)[MCM_CM_RTU_IN]:0,
m_cm, htons(pkt->daddr1.lid),
@@ -1814,7 +1864,8 @@ int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len)
htonl(m_cm->msg.daddr2.qpn):htonl(m_cm->msg.daddr1.qpn),
htons(pkt->dport), system_guid, mcm_map_str(pkt->daddr1.ep_map),
htons(pkt->saddr2.lid), htonl(pkt->saddr2.qpn),
- htons(pkt->sport), ntohll(pkt->sys_guid), mcm_map_str(pkt->saddr2.ep_map));
+ htons(pkt->sport), ntohll(pkt->sys_guid), mcm_map_str(pkt->saddr2.ep_map),
+ m_cm->m_qp->mtu);
/* MXF_EP <- HST_EP, host sends WC on RTU, save WRC info */
if (MXF_EP(&pkt->daddr1) && HST_EP(&pkt->saddr2)) {
@@ -2056,6 +2107,11 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
m_cm->msg.sys_guid = rand();
#endif
+ /* Set QP MTU, if negotiated. 2K for compatibility */
+ m_cm->m_qp->mtu = m_cm->msg.mtu ?
+ min(m_cm->msg.mtu, m_cm->md->dev_attr.mtu):
+ m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048;
+
if (qp) {
if (mcm_modify_qp(qp, IBV_QPS_RTR, dqpn, dlid, dgid))
goto err;
@@ -2071,8 +2127,9 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
goto err;
}
- /* send RTU on wire, monitor for retries */
+ /* send REP on wire, monitor for retries */
m_cm->state = MCM_RTU_PENDING;
+ m_cm->msg.mtu = m_cm->m_qp->mtu; /* send negotiated MTU */
mpxy_unlock(&m_cm->lock);
mcm_cm_rep_out(m_cm);
return 0;
@@ -2183,6 +2240,7 @@ static int mix_proxy_out(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, mcm_qp_t *m_qp
mpxy_lock(&m_qp->txlock);
if (((m_qp->wr_hd + 1) & m_qp->wr_end) == m_qp->wr_tl) { /* full */
ret = ENOMEM;
+ mlog(0, " ERR: WR full hd %d tl %d\n", m_qp->wr_hd, m_qp->wr_tl);
goto bail;
}
m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wr_end; /* move hd */
diff --git a/dapl/svc/mpxy_in.c b/dapl/svc/mpxy_in.c
index 54cc62a..bb48c69 100755
--- a/dapl/svc/mpxy_in.c
+++ b/dapl/svc/mpxy_in.c
@@ -476,11 +476,10 @@ static int m_pi_send_wc(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx, int status
int wc_idx, ret;
mlog(0x10,"[%d:%d:%d] WC_rem: wr_rx[%d] %p wc_hd %d flgs %x WR_r tl %d-%d"
- " wt %d hd %d wr_id %Lx org_id %Lx\n",
+ " wt %d hd %d oid %Lx st %d\n",
m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
wr_rx->w_idx, wr_rx, m_qp->wc_hd_rem, wr_rx->flags, m_qp->wr_tl_r,
- wr_rx->w_idx, m_qp->wr_tl_r_wt, m_qp->wr_hd_r, wr_rx->wr.wr_id,
- wr_rx->org_id);
+ wr_rx->w_idx, m_qp->wr_tl_r_wt, m_qp->wr_hd_r, wr_rx->org_id, status);
/* local WR and remote WR are serialized, should never reach tail of remote WR */
if (((m_qp->wc_hd_rem + 1) & m_qp->wrc.wc_end) == m_qp->wc_tl_rem) {
@@ -576,6 +575,13 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig)
while (m_qp->pi_rr_cnt) { /* RR's pending */
wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));
+ /* SCIF sync required on IB RW, multiple SCIF writes are not ordered */
+ if (m_qp->post_cnt_wt &&
+ (wr_rx->flags & (M_SEND_FS|M_SEND_LS)) &&
+ (!(wr_rx->flags & (M_READ_WRITE_TO_DONE|M_READ_WRITE_TO)))) {
+ break;
+ }
+
if (!(wr_rx->flags & M_READ_DONE)) {
/* reached head pointer */
if (wr_idx == m_qp->wr_hd_r)
@@ -675,8 +681,18 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig)
ret = scif_writeto(smd->scif_tx_ep, l_off, w_len, r_off, wt_flag);
if (ret) {
- mlog(0, " ERR: scif_sendto, ret %d err: %d %s\n",
+ mlog(0," [%d:%d:%d] ERR: scif_sendto, ret %d err: %d %s\n",
+ smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid,
ret, errno, strerror(errno));
+ mlog(0," PI: wc %d rr %d stall %d wt %d\n",
+ m_qp->pi_rw_cnt, m_qp->pi_rr_cnt,
+ m_qp->stall_cnt_rr, m_qp->post_cnt_wt);
+ mlog(0," PO: wr %d wr_rem %d pst_sig %d cmp_sig %d\n",
+ m_qp->wr_pp, m_qp->wr_pp_rem,
+ m_qp->post_sig_cnt, m_qp->comp_cnt);
+ mlog(0, " WR_rx[%d] %p l_o %Lx r_o %Lx rb 0x%x-0x%x ln %d id %Lx tl %d hd %d\n",
+ wr_rx->w_idx, wr_rx, l_off, r_off, l_start, l_end, w_len, wr_rx->org_id,
+ m_qp->wr_tl_r, m_qp->wr_hd_r);
goto bail;
}
MCNTR(smd->md, MCM_SCIF_WRITE_TO);
@@ -719,14 +735,22 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig)
}
return;
bail:
- /* report error via WC back to proxy-out */
- mlog(0, " ERR: writeto: wr_rx[%d] %p -> IB raddr %Lx rkey %x"
- " SCIF r_off %Lx, len %d wr_flags %x wt_pend %d\n",
- wr_rx->w_idx, wr_rx, wr_rx->wr.wr.rdma.remote_addr,
- wr_rx->wr.wr.rdma.rkey, r_off, sg_len, wr_rx->flags,
- m_qp->post_cnt_wt);
-
- m_pi_send_wc(m_qp, wr_rx, IBV_WC_REM_ACCESS_ERR);
+ /* report error via WC back to proxy-out, all pending WRs */
+ wr_idx = m_qp->wr_tl_r_wt;
+ do {
+ wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));
+
+ mlog(0, " ERR: wr_rx[%d] %p -> IB raddr %Lx %x"
+ " SCIF r_o %Lx, ln %d fl %x wt_pnd %d\n",
+ wr_rx->w_idx, wr_rx, wr_rx->wr.wr.rdma.remote_addr,
+ wr_rx->wr.wr.rdma.rkey, r_off, sg_len, wr_rx->flags,
+ m_qp->post_cnt_wt);
+
+ m_pi_send_wc(m_qp, wr_rx, IBV_WC_REM_ACCESS_ERR);
+ wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
+
+ } while (wr_idx != m_qp->wr_hd_r);
+
return;
}
@@ -1164,6 +1188,17 @@ retry:
goto retry;
}
+/* Retry posting of scif_writeto DMAs that were held back to preserve ordering behind earlier, still pending write-to (WT) operations; takes m_qp->rxlock for the duration of the call */
+void m_pi_pending_wt(struct mcm_qp *m_qp)
+{
+ struct mcm_wr_rx *wr_rx; /* WR entry handed to m_pi_post_writeto as its wr_sig argument */
+
+ mpxy_lock(&m_qp->rxlock); /* serialize with other users of rxlock */
+ wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * m_qp->wr_hd_r)); /* entry at index wr_hd_r: base address + entry size * index */
+ m_pi_post_writeto(m_qp, wr_rx); /* same routine that posts the write-to DMAs; it decides which WRs can go out now */
+ mpxy_unlock(&m_qp->rxlock);
+}
+
/*
* Pending Proxy-in services for RDMA Writes from remote peer
*
diff --git a/dapl/svc/mpxy_out.c b/dapl/svc/mpxy_out.c
index d015dc3..5c82703 100755
--- a/dapl/svc/mpxy_out.c
+++ b/dapl/svc/mpxy_out.c
@@ -487,6 +487,23 @@ void m_po_pending_wr(struct mcm_qp *m_qp, int *data)
else
wc.wc_flags = 0;
wc.vendor_err = ret;
+ if (ret) {
+ mlog(0, "[%d:%d:%d] ERR %s_RW_post: WR[%d] wr_id %p flgs 0x%x,"
+ " pcnt %d sg_rate %d hd %d tl %d sz %d m_idx %x\n",
+ m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid,
+ m_qp->r_entry.tid,
+ (MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
+ m_wr->w_idx, m_wr->wr.wr_id, m_wr->wr.send_flags,
+ m_qp->post_cnt, mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
+ m_wr->wr.sg_list->length, m_wr->m_idx);
+ mlog(0, "[%d:%d:%d] ERR wr_id %Lx next %p sglist %p sge %d op %d flgs"
+ " %d idata 0x%x raddr %p rkey %x \n",
+ m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid,
+ m_qp->r_entry.tid, m_wr->wr.wr_id, m_wr->wr.next,
+ m_wr->wr.sg_list, m_wr->wr.num_sge, m_wr->wr.opcode,
+ m_wr->wr.send_flags, m_wr->wr.imm_data,
+ m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey);
+ }
mix_dto_event(m_qp->ib_qp2->send_cq->cq_context, &wc, 1);
}
@@ -565,7 +582,7 @@ int m_po_proxy_data(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, struct mcm_qp *m_qp
off_t l_off, r_off;
uint64_t total_offset;
int l_start, l_end, l_len, cacheln_off, seg_len;
- struct mcm_wr *m_wr;
+ struct mcm_wr *m_wr = NULL;
struct ibv_sge *m_sge;
mlog(4, " q_id %d, q_ctx %p, len %d, wr_id %p, sge %d, op %x flgs %x wr_idx %d\n",
@@ -609,7 +626,7 @@ int m_po_proxy_data(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, struct mcm_qp *m_qp
}
write(smd->md->mc->tx_pipe[1], "w", sizeof("w"));
mpxy_unlock(&m_qp->txlock);
- sched_yield();
+ sleep_usec(1000);
mpxy_lock(&m_qp->txlock);
}
if (retries) {
@@ -803,8 +820,6 @@ retry_mr:
mlog(0x10, "[%d:%d:%d] %s_RF_post_sig: WR[%d] qp %p wr_id %p flgs 0x%x,"
" sg_rate %d hd %d tl %d sz %d m_idx %x\n",
m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
- m_qp, m_wr, pmsg->wr.wr_id, m_wr->wr.send_flags,
- m_qp->post_cnt, mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
(MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
m_wr->w_idx, m_qp, pmsg->wr.wr_id, m_wr->wr.send_flags,
mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
@@ -919,6 +934,18 @@ bail:
else
wc.wc_flags = 0;
wc.vendor_err = ret;
+
+ mlog(0, "[%d:%d:%d] ERR %s_RF_post: WR[%d] qp %p wr_id %p, "
+ " post %d hd %d tl %d sz %d \n",
+ m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
+ (MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
+ m_wr ? m_wr->w_idx:0, m_qp, m_wr, pmsg->wr.wr_id,
+ m_qp->post_cnt, m_qp->wr_hd, m_qp->wr_tl, wc.byte_len);
+ mlog(0, "[%d:%d:%d] ERR m_wr: raddr %Lx rkey 0x%x, ib_wr: raddr %Lx rkey 0x%x\n",
+ m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
+ pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.rkey,
+ m_wr ? m_wr->wr.wr.rdma.remote_addr:0, m_wr ? m_wr->wr.wr.rdma.rkey:0);
+
mix_dto_event(m_qp->ib_qp2->send_cq->cq_context, &wc, 1);
}
diff --git a/dapl/svc/mpxyd.c b/dapl/svc/mpxyd.c
index 922eeae..668efa5 100755
--- a/dapl/svc/mpxyd.c
+++ b/dapl/svc/mpxyd.c
@@ -799,6 +799,10 @@ found:
msg->dev_addr.lid = md->m_lid;
memcpy(msg->dev_addr.gid, md->m_gid, 16);
}
+
+ /* MTU changed via DAPL_IB_MTU */
+ if (msg->hdr.flags & MIX_OP_MTU)
+ md->mtu_env = md->dev_attr.mtu;
err:
if (!smd) {
mlog(1, " WARN: open failed for %s - %d\n", msg->name, msg->port);
@@ -806,6 +810,7 @@ err:
}
/* send back response */
+ msg->hdr.flags = MIX_OP_RSP;
ret = scif_send_msg(op_ep, (void*)msg, sizeof(dat_mix_open_t));
if (ret) {
mlog(0, " ERR: scif_send dev_id %d op_ep %d, closing device %p\n",
@@ -817,9 +822,10 @@ err:
goto bail;
}
- mlog(1, " MIC client: mdev[%d] %p smd %p mic%d[%d] -> %s[%d] port %d lid %x %s\n",
+ mlog(1, " MIC client: mdev[%d] %p->%p mic%d[%d] -> %s[%d] port %d lid %x %s mtu %d (%d)\n",
md->smd_list.tid, md, smd, mc->scif_id-1, mc->numa_node, msg->name,
- md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map));
+ md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map),
+ md->dev_attr.mtu, md->mtu_env);
bail:
mpxy_unlock(&mc->oplock);
mpxy_unlock(&mc->cmlock);
@@ -1187,6 +1193,7 @@ void mpxy_rx_thread(void *mic_client)
m_qp = get_head_entry(&smd->qprlist);
while (m_qp) {
m_pi_pending_wr(m_qp, &data); /* RR's and scif_sendto */
+ m_pi_pending_wt(m_qp); /* WT's pending */
m_qp = get_next_entry(&m_qp->r_entry, &smd->qprlist);
}
mpxy_unlock(&smd->qprlock);
@@ -1445,8 +1452,7 @@ void mcm_dat_dev_log(struct mcm_scif_dev *smd)
/* show PO mbuf_wc busy slots */
idx = smd->m_buf_tl;
- while ((smd->m_buf_tl != smd->m_buf_hd) &&
- (smd->m_buf_hd - smd->m_buf_tl)) {
+ while (idx != smd->m_buf_hd) {
if ((smd->m_buf_wc[idx].m_idx && !smd->m_buf_wc[idx].done) || 1) {
struct mcm_wr *m_wr = NULL;
struct mcm_qp *m_qp = NULL;
@@ -1468,8 +1474,6 @@ void mcm_dat_dev_log(struct mcm_scif_dev *smd)
smd->m_buf_wc[idx].tl, smd->m_buf_wc[idx].hd);
}
idx = (idx + 1) & smd->m_buf_end;
- if (idx == (smd->m_buf_hd+2))
- break;
}
/* show PI mbuf_wc busy slots, start from tail */
diff --git a/dapl/svc/mpxyd.h b/dapl/svc/mpxyd.h
index c733157..ec31cc0 100755
--- a/dapl/svc/mpxyd.h
+++ b/dapl/svc/mpxyd.h
@@ -132,6 +132,7 @@ typedef struct mcm_ib_dev {
int numa_node;
int indata;
void *cntrs;
+ uint8_t mtu_env;
} mcm_ib_dev_t;
@@ -244,6 +245,7 @@ typedef struct mcm_qp {
int sr_len; /* SR WR buffer pool len */
int sr_sz; /* SR WR entry size */
int post_sr;
+ uint8_t mtu; /* negotiated QP MTU */
#ifdef MCM_PROFILE
mcm_qp_prof_t ts;
uint32_t last_wr_sig;
@@ -613,6 +615,7 @@ int m_pi_create_sr_q(struct mcm_qp *m_qp, int entries);
int m_pi_create_bpool(struct mcm_qp *m_qp, int max_recv_wr);
void m_qp_destroy_pi(struct mcm_qp *m_qp);
int m_qp_create_pi(mcm_scif_dev_t *smd, struct mcm_qp *m_qp);
+void m_pi_pending_wt(struct mcm_qp *m_qp);
void m_pi_pending_wr(struct mcm_qp *m_qp, int *data);
void m_pi_pending_wc(struct mcm_qp *m_qp, int *events);
void m_pi_req_event(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx, struct ibv_wc *wc, int type);
diff --git a/doc/dat.conf b/doc/dat.conf
index c3794e7..c868814 100755
--- a/doc/dat.conf
+++ b/doc/dat.conf
@@ -70,3 +70,11 @@ ofa-v2-qib0-1m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "qib0 1" ""
ofa-v2-qib0-2m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "qib0 2" ""
ofa-v2-qib1-1m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "qib1 1" ""
ofa-v2-qib1-2m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "qib1 2" ""
+ofa-v2-hfi1_0-1s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "hfi1_0 1" ""
+ofa-v2-hfi1_0-2s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "hfi1_0 2" ""
+ofa-v2-hfi1_1-1s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "hfi1_1 1" ""
+ofa-v2-hfi1_1-2s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "hfi1_1 2" ""
+ofa-v2-hfi1_0-1m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "hfi1_0 1" ""
+ofa-v2-hfi1_0-2m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "hfi1_0 2" ""
+ofa-v2-hfi1_1-1m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "hfi1_1 1" ""
+ofa-v2-hfi1_1-2m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "hfi1_1 2" ""
\ No newline at end of file
diff --git a/doc/mpxyd.conf b/doc/mpxyd.conf
index e5d6d5b..f3fd722 100755
--- a/doc/mpxyd.conf
+++ b/doc/mpxyd.conf
@@ -58,7 +58,7 @@ scif_listen_qlen 240
#
# The default is 1
-mcm_affinity 1
+mcm_affinity 2
# mcm_affinity_base_mic:
# Specifies a hard binding for CPU id base value used for affinity support of
diff --git a/test/dtest/dtest.c b/test/dtest/dtest.c
index 6894a2c..e61e000 100755
--- a/test/dtest/dtest.c
+++ b/test/dtest/dtest.c
@@ -197,7 +197,7 @@ struct dt_time {
struct dt_time ts;
/* defaults */
-static int all_data = 0;
+static int all_data_sizes = 0;
static int increment = 0;
static int failed = 0;
static int uni_direction = 0;
@@ -228,6 +228,7 @@ static int burst_msg_index = 0;
static int ucm = 0;
static int rq_cnt, sq_cnt;
static DAT_SOCK_ADDR6 remote;
+static int data_check = 0;
/* forward prototypes */
const char *DT_RetToStr(DAT_RETURN ret_value);
@@ -566,12 +567,12 @@ int main(int argc, char **argv)
DAT_PROVIDER_ATTR pr_attr;
/* parse arguments */
- while ((c = getopt(argc, argv, "auwWtscvpb:d:B:h:P:S:i:")) != -1) {
+ while ((c = getopt(argc, argv, "UDauwWtscvpb:d:B:h:P:S:i:")) != -1) {
switch (c) {
case 'i':
increment = atoi(optarg);
case 'a':
- all_data = 1;
+ all_data_sizes = 1;
fflush(stdout);
break;
case 'u':
@@ -582,6 +583,10 @@ int main(int argc, char **argv)
write_only = 1;
fflush(stdout);
break;
+ case 'D':
+ data_check = 1;
+ printf("%d Running DATA CHECK mode\n", getpid());
+ /* fall through */
case 'W':
write_only_pp = 1;
uni_direction = 1;
@@ -631,12 +636,24 @@ int main(int argc, char **argv)
case 'S':
signal_rate = atoi(optarg);
break;
+ case 'U':
+ /* fall through */
default:
print_usage();
exit(-12);
}
}
+ if (all_data_sizes && !write_only_pp) {
+ printf("\n\t -a option only valid with -W option\n\n");
+ exit(-12);
+ }
+
+ if (data_check && strstr(provider, "scif")) {
+ printf("\n\t -D option is not valid with scif provider\n\n");
+ exit(-12);
+ }
+
#if defined(_WIN32) || defined(_WIN64)
{
WSADATA wsaData;
@@ -680,8 +697,9 @@ int main(int argc, char **argv)
if (write_only_pp) {
/* rdma write pingpong, default == 1 byte */
- if (!all_data) {
- buf_len = 1;
+ if (!all_data_sizes) {
+ if (!data_check)
+ buf_len = 1;
} else if (!increment) { /* power of 2 */
buf_len_p2 = 1;
i = 0;
@@ -885,7 +903,7 @@ int main(int argc, char **argv)
if (write_only_pp) {
int max, inc;
- if (all_data) {
+ if (all_data_sizes) {
if (increment) {
i = 1;
inc = increment;
@@ -896,14 +914,44 @@ int main(int argc, char **argv)
max = buf_len_p2;
}
} else {
- i = buf_len;
- max = buf_len;
- inc = buf_len;
+ if (data_check) {
+ i = buf_len;
+ max = buf_len;
+ inc = 1;
+ }
+ else
+ {
+ i = buf_len;
+ max = buf_len;
+ inc = buf_len;
+ }
}
- printf("\n %d RDMA WRITE PINGPONG\n\n", getpid());
+ printf("\n %d RDMA WRITE PINGPONG %s\n\n", getpid(),
+ data_check ? "with DATA CHECK":"");
+
for (; i <= max; i++) {
- if (do_rdma_write_ping_pong(i, i*inc))
- break;
+ if (all_data_sizes) {
+ int l_len = (i*inc) ? (i*inc) : 1 << i;
+
+ if ( l_len > 4 && do_rdma_write_ping_pong(i, l_len - 1)) {
+ fprintf(stderr, "%d Error do_rdma_write_ping_pong\n", getpid());
+ goto cleanup;
+ }
+ }
+
+ if (do_rdma_write_ping_pong(i, i*inc)) {
+ fprintf(stderr, "%d Error do_rdma_write_ping_pong\n", getpid());
+ goto cleanup;
+ }
+
+ if (all_data_sizes) {
+ int l_len = (i*inc) ? (i*inc) : 1 << i;
+
+ if ( l_len > 1 && l_len < buf_len && do_rdma_write_ping_pong(i, l_len + 1)) {
+ fprintf(stderr, "%d Error do_rdma_write_ping_pong\n", getpid());
+ goto cleanup;
+ }
+ }
}
}
else if (write_immed && write_only) {
@@ -1021,7 +1069,7 @@ complete:
free(rbuf);
free(sbuf);
- if (!all_data) {
+ if (ts.rtt && !all_data_sizes) {
printf( "%d: %s PingPong: (%d x %d) Total %6.2lf us:"
" latency %3.2lf us, BW %4.2lf MB/s\n",
getpid(), write_only_pp ? "RDMA write":"Message",
@@ -1992,6 +2040,54 @@ acked:
return (DAT_SUCCESS);
}
+#define PAT_NUM 5
+unsigned char pat[PAT_NUM] = { 0, 0xff, 0x55, 0xaa, 0 };
+
+/*
+ * set_pat - data-check helper for the RDMA write ping-pong test.
+ *
+ * len:     message length in bytes; nothing is done for len <= 1.
+ * pat_num: index into pat[]; must be < PAT_NUM or the process exits(1).
+ *          The last index (PAT_NUM - 1) selects random data.
+ *
+ * Client: fills the first len - 1 bytes of sbuf with pat[pat_num], or with
+ *         rand() output for the random pattern.
+ * Server: for the fixed patterns, checks the first byte of rbuf against
+ *         pat[pat_num] (the mismatch is only reported, not fatal), then
+ *         copies the first len - 1 bytes of rbuf into sbuf.
+ *
+ * NOTE(review): the last byte is left untouched, presumably because the
+ * ping-pong counter is stored there - confirm against the tx_buf setup.
+ */
+void set_pat(unsigned int len, unsigned int pat_num)
+{
+	if (len <= 1)
+		return;
+
+	if (pat_num >= PAT_NUM) {
+		/* pat_num is unsigned: print it with %u */
+		printf("\n\tpat_num = %u. max valid number is %d.\n\n", pat_num, PAT_NUM - 1);
+		exit(1);
+	}
+
+	if (server) {
+		/* server */
+		if (pat_num == PAT_NUM - 1) {
+			/* future: random data, add checksum */
+			;
+		} else {
+			/* check first byte only for some speed */
+			if ((unsigned char)rbuf[0] != (unsigned char)pat[pat_num]) {
+				fprintf(stderr,"%d: ERR: message len is %u,"
+					" location 0. Rx 0x%x expected"
+					" 0x%x, pat %u\n",
+					getpid(), len, (unsigned char)rbuf[0],
+					(unsigned char)pat[pat_num], pat_num);
+			}
+		}
+		/* echo the received payload back to the client */
+		memcpy(sbuf, rbuf, len - 1);
+
+	} else {
+		/* client */
+		unsigned int i;	/* same type as len: no signed/unsigned compare */
+
+		if (pat_num == PAT_NUM - 1) { /* set random values */
+			struct timeval tv;
+
+			/* reseed from the current microsecond count on every call */
+			gettimeofday(&tv, NULL);
+			srand((unsigned int)tv.tv_usec);
+			for (i = 0; i < len - 1; i++)
+				sbuf[i] = (unsigned char)rand();
+		} else {
+			memset(sbuf, (unsigned char)pat[pat_num], len - 1);
+		}
+	}
+}
+
+
/* always uni-direction */
DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
{
@@ -2006,6 +2102,7 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
volatile char *tx_buf, *rx_buf;
uint32_t rx_cnt = 0;
uint32_t tx_cnt = 0;
+ unsigned char rx_idx = 0;
len = bytes ? bytes : 1 << p2;
@@ -2030,6 +2127,34 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
if (rx_cnt < burst && !(!server && !tx_cnt)) {
rx_cnt++;
while (*rx_buf != (char)rx_cnt);
+ rx_idx = (unsigned char)*rx_buf;
+
+			/*
+			 * Data check (client only): the payload echoed back by the
+			 * server must match what was sent. On mismatch report the
+			 * first differing offset, dump up to 64 bytes of both
+			 * buffers starting at that offset, and abort the test.
+			 */
+			if (data_check && !server && memcmp(sbuf, rbuf, len)) {
+				int l=0, ll;
+				fprintf(stderr, "%d: ERR: Tx data from server wrong\n", getpid());
+
+				/* test the bound before touching the buffers */
+				while (l < len && sbuf[l] == rbuf[l])
+					l++;
+
+				fprintf(stderr,"%d: len %d, 1st error at %d. Tx 0x%x Rx 0x%x\n",
+					getpid(), len, l, (unsigned char)sbuf[l],
+					(unsigned char)rbuf[l]);
+				fprintf(stderr,"%d: rcnt %d (char = %d), tcnt %d, *rbuf %d\n",
+					getpid(), rx_cnt, (char)rx_cnt, tx_cnt,
+					(unsigned char)*rx_buf);
+				fprintf(stderr, "Send:");
+
+				/*
+				 * The window is relative to the first mismatch (l + 64).
+				 * The previous bound "1 + 64" printed nothing once the
+				 * mismatch was beyond offset 64.
+				 */
+				for (ll=l; ll < len && ll < l + 64; ll++)
+					fprintf(stderr,"%02x", (unsigned char)sbuf[ll]);
+
+				fprintf(stderr, "\nRecv:");
+
+				for (ll=l; ll < len && ll < l + 64; ll++)
+					fprintf(stderr,"%02x", (unsigned char)rbuf[ll]);
+
+				fprintf(stderr, "\n");
+				return (DAT_ABORT);
+			}
}
if (!((i+1) % signal_rate))
@@ -2040,6 +2165,9 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
if (tx_cnt == burst)
break;
+ if (data_check)
+ set_pat(len, tx_cnt % PAT_NUM);
+
*tx_buf = (char)++tx_cnt;
cookie.as_64 = tx_cnt;
ret = dat_ep_post_rdma_write(h_ep, MSG_IOV_COUNT,
@@ -2069,7 +2197,7 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
stop = get_time();
ts.rtt = ((stop - start) * 1.0e6);
- if ((unsigned char)*rx_buf != (unsigned char)rx_cnt) {
+ if (rx_idx != (unsigned char)rx_cnt) {
printf( "%d %s RW pingpong: %p, last *buf %d != cnt %d\n",
getpid(), server ? "SERVER:" : "CLIENT:",
rx_buf, (unsigned char)*rx_buf,
@@ -2077,7 +2205,7 @@ DAT_RETURN do_rdma_write_ping_pong(int p2, int bytes)
return (DAT_ABORT);
}
- if (all_data) {
+ if (all_data_sizes) {
printf( "%d: RDMA write PingPong: (%d x %d) Total %6.2lf us:"
" latency %3.2lf us, BW %4.2lf MB/s\n",
getpid(), burst, len, ts.rtt, ts.rtt/burst/2,
@@ -2773,6 +2901,7 @@ void print_usage(void)
printf("u: unidirectional bandwidth (default=bidirectional\n");
printf("w: rdma write only, streaming\n");
printf("W: rdma write only, ping pong\n");
+ printf("D: validate data in ping pong test\n");
printf("t: performance times\n");
printf("c: use cno\n");
printf("a: all data sizes with rdma write pingpong \n");
@@ -2785,6 +2914,7 @@ void print_usage(void)
printf("h: hostname/address of server, specified on client\n");
printf("P: provider name (default = ofa-v2-mlx4_0-1u)\n");
printf("S: signal_rate (default=10, completion every 10 iterations\n");
+ printf("U: print this Usage page\n");
printf("\n");
}
diff --git a/test/dtest/dtestx.c b/test/dtest/dtestx.c
index 931c860..a5693d8 100755
--- a/test/dtest/dtestx.c
+++ b/test/dtest/dtestx.c
@@ -180,6 +180,7 @@ int eps = 1;
int verbose = 0;
int counters = 0;
int counters_ok = 0;
+int query_only = 0;
static int ucm = 0;
static DAT_SOCK_ADDR6 remote;
static DAT_IA_ATTR ia_attr;
@@ -1549,7 +1550,7 @@ int main(int argc, char **argv)
DAT_RETURN status;
/* parse arguments */
- while ((rc = getopt(argc, argv, "csvumpU:h:b:P:")) != -1) {
+ while ((rc = getopt(argc, argv, "qcsvumpU:h:b:P:")) != -1) {
switch (rc) {
case 'u':
ud_test = 1;
@@ -1584,6 +1585,9 @@ int main(int argc, char **argv)
case 'v':
verbose = 1;
break;
+ case 'q':
+ query_only = 1;
+ break;
default:
print_usage();
exit(-12);
@@ -1603,6 +1607,29 @@ int main(int argc, char **argv)
}
}
#endif
+ if (query_only) {
+ memset(&ia_attr, 0, sizeof(ia_attr));
+ memset(&prov_attrs, 0, sizeof(prov_attrs));
+ status = dat_ib_open_query(provider, &ia,
+ DAT_IA_FIELD_ALL, &ia_attr,
+ DAT_PROVIDER_FIELD_ALL, &prov_attrs);
+ _OK(status, "dat_ib_open_query");
+
+ print_ia_address(ia_attr.ia_address_ptr);
+ printf(" Open_Query: %s num_attrs = %d\n",
+ provider, prov_attrs.num_provider_specific_attr);
+ /* Print provider specific attributes */
+ for (i = 0; i < prov_attrs.num_provider_specific_attr; i++) {
+ printf(" Open_Query: Provider Specific Attribute[%d] %s=%s\n",
+ i, prov_attrs.provider_specific_attr[i].name,
+ prov_attrs.provider_specific_attr[i].value);
+ }
+
+ status = dat_ib_close_query(ia);
+ _OK(status, "dat_ib_close_query");
+ exit(0);
+ }
+
status = dat_ia_open(provider, 8, &async_evd, &ia);
_OK(status, "dat_ia_open");
diff --git a/test/dtest/scripts/dtest_suite.sh b/test/dtest/scripts/dtest_suite.sh
new file mode 100755
index 0000000..d6a6713
--- /dev/null
+++ b/test/dtest/scripts/dtest_suite.sh
@@ -0,0 +1,1117 @@
+#!/bin/bash
+#
+# Copyright (c) 2016 Intel Corporation. All rights reserved.
+#
+# This Software is licensed under one of the following licenses:
+#
+# 1) under the terms of the "Common Public License 1.0" a copy of which is
+# in the file LICENSE.txt in the root directory. The license is also
+# available from the Open Source Initiative, see
+# http://www.opensource.org/licenses/cpl.php.
+#
+# 2) under the terms of the "The BSD License" a copy of which is in the file
+# LICENSE2.txt in the root directory. The license is also available from
+# the Open Source Initiative, see
+# http://www.opensource.org/licenses/bsd-license.php.
+#
+# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+# copy of which is in the file LICENSE3.txt in the root directory. The
+# license is also available from the Open Source Initiative, see
+# http://www.opensource.org/licenses/gpl-license.php.
+#
+# Licensee has the right to choose one of the above licenses.
+#
+# Redistributions of source code must retain the above copyright
+# notice and one of the license notices.
+#
+# Redistributions in binary form must reproduce both the above copyright
+# notice, one of the license notices in the documentation
+# and/or other materials provided with the distribution.
+#
+# Test Suite to test uDAPL Providers and CCL Proxy on MICs and Hosts
+#
+# Sample Usage, all providers, one loop, fast:
+#
+# ./dtest_suite.sh -P ALL -l 1 -f
+#
+
+### --- user input section --- ###
+server_list="cst-kc1 cst-kc1-mic0 cst-kc1-mic1"
+client_list="cst-kc2 cst-kc2-mic0 cst-kc2-mic1 cst-kc1 cst-kc1-mic0 cst-kc1-mic1"
+### --- dtest test cases fine tune zone --- ###
+# Note: a value of zero means dtest will use its own default for that option
+b_options="0 1 4096"
+u_options="0 1"
+w_options="0 1"
+S_options="0 9"
+B_options="0 1"
+D_options="0 1"
+W_options="0"
+# test defaults
+def_provider="ofa-v2-mlx4_0-1u"
+dat_conf="/etc/dat.conf"
+### --- End of user input section --- ###
+
+script_version="1.05"
+
+# History log
+# 1.05 - Disable data validation mode when using scif provider
+# From: Amir Hanania <amir.hanania at intel.com>
+# 1.04 - Add data validation for dtest ping pong
+# Add option not to use CPU mask in performance test
+# From: Amir Hanania <amir.hanania at intel.com>
+# 1.03 - Add dapl tests
+# From: Amir Hanania <amir.hanania at intel.com>
+# 1.02 - Change performance test to use dtest -W case for latency.
+# Note: You must have a dtest version that supports -W to run the performance test.
+# From: Amir Hanania <amir.hanania at intel.com>
+# 1.01 - Add multi provider test
+# From: Amir Hanania <amir.hanania at intel.com>
+# 1.00 - Initial Version
+# From: Amir Hanania <amir.hanania at intel.com>
+# Test script to test dapl.
+# Run dtest test in multiple options.
+# Notes:
+# 1. For performance test. Same dtest configuration is used twice.
+# Once with -W for latency and once without for BW.
+#
+
+user_provider=$def_provider
+server_client_list=$server_list" "$client_list
+host_list=`for i in $server_client_list; do echo $i | awk -F "-mic" '{ print $1 }'; done | sort | uniq` # unique names with any "-mic..." suffix cut off
+provider_search_debug=0
+dapl_test_user_input="y"
+ran_one_dapltest=0
+dapl_test_rep_max=100
+dapl_test_rep=$dapl_test_rep_max
+mfo_test=0
+fast_test=0
+fast_test_str=""
+perf_test=0
+no_inline_data=0
+debug_info=0
+v_for_test=""
+user_srting=""
+ctrl_c=0 # incremented by control_c on every ^C
+runs=0
+max_run_time=0
+dapl_mtu=0
+loops=0
+log_file_dir="dtest_perf_logs"
+log_file="$log_file_dir/dtest_performance_"
+unidirection_test=0
+cpu_mask="no_cpu_mask"
+user_b_options="none"
+dog_file=/tmp/dog.log # watchdog state file: dog() checks logs on 1 and exits on 2
+dog_ser=/tmp/dog.ser # server name read by dog()
+dog_cli=/tmp/dog.cli # client name read by dog()
+i=1
+while [ $i -lt 5000000 ]; do # collect the powers of two below 5000000
+ b_options_for_perf_test+=" $i"
+ i=$(( $i*2 ))
+done
+mkdir -p $log_file_dir
+
+control_c()
+# run if user hits control-c
+{
+ echo -en "\n*** ^c ***\n"
+ if [ $ctrl_c -ne 0 ]; then
+ echo -ne "\n*** Forced EXIT! ***\n\n"
+ for s in $server_list; do
+ ssh root@$s "killall dtest" > /dev/null 2>&1
+ ssh root@$s "killall dapltest" > /dev/null 2>&1
+ done
+ for c in $client_list; do
+ ssh root@$c "killall dtest" > /dev/null 2>&1
+ ssh root@$c "killall dapltest" > /dev/null 2>&1
+ done
+ exit 1
+ fi
+ let "ctrl_c+=1"
+ echo -en "\n*** Will break after this test case ***\n\n"
+}
+
+# trap keyboard interrupt (control-c)
+trap control_c SIGINT
+
+exit_control()
+{
+ # if dog killed us. Clean up the dtest still working.
+ for s in $server_list; do
+ ssh root@$c "killall dtest" > /dev/null 2>&1
+ done
+ for c in $client_list; do
+ ssh root@$c "killall dtest" > /dev/null 2>&1
+ done
+
+ echo "2" > $dog_file
+ sleep 2
+ #kill dog
+ # jobs -p | xargs kill
+}
+# trap exit to kill dog when script exit
+#trap 'jobs -p | xargs kill' EXIT
+trap exit_control EXIT
+
+# Watchdog loop driven by the value stored in $dog_file:
+#   2     - leave the loop (exit)
+#   1     - a data validation run is active: count ERR and FAIL lines in
+#           the server and client run logs and, if any are found, kill
+#           this script by name
+#   other - idle
+# The names of the machines to inspect are read from $dog_ser and $dog_cli.
+# Polls once per second and prints a "." per active poll.
+function dog(){
+    while true; do
+        # The state file may be missing, or momentarily empty while a
+        # writer is between truncating and writing it. Treat that as
+        # idle (0) instead of letting the numeric tests below error out.
+        val=`cat $dog_file 2>/dev/null`
+        val=${val:-0}
+        if [ "$val" -eq 2 ]; then
+            exit
+        fi
+        if [ "$val" -eq 1 ]; then
+            server=`cat $dog_ser`
+            client=`cat $dog_cli`
+            server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"`
+            client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"`
+            server_fail=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c FAIL"`
+            client_fail=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c FAIL"`
+            # A failed ssh yields an empty string; count that as 0 hits
+            # so the comparison stays well-formed.
+            server_err=${server_err:-0}
+            client_err=${client_err:-0}
+            server_fail=${server_fail:-0}
+            client_fail=${client_fail:-0}
+            if [ "$server_err" -gt 0 ] || [ "$client_err" -gt 0 ] || [ "$server_fail" -gt 0 ] || [ "$client_fail" -gt 0 ]; then
+                sleep 2
+                echo -e "\n\n\twatchdog bark - validation test failed\n\n"
+                killall ${0##*/}
+            fi
+            echo -n "."
+        fi
+        sleep 1
+    done
+}
+
+# Poll $server until /tmp/dtest_ser_run.log exists and holds exactly one
+# line matching "waiting". Tries up to 99 times, 0.1 s apart.
+# Result is left in the global i: non-zero when the server is ready,
+# 0 when all attempts were used up.
+function wait_for_server_to_be_ready(){
+    i=99
+    echo -ne "Waiting to servers to come up... $i \r"
+    while ! [ $i -eq 0 ]; do
+        up=0
+        if ssh root@$server [ -f /tmp/dtest_ser_run.log ]; then
+            file_found="file found"
+            up=$(ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c waiting")
+        else
+            file_found="NOT found"
+        fi
+        [ $up -eq 1 ] && break
+        i=$(( i - 1 ))
+        echo -ne "Waiting to servers to come up... $i \r"
+        sleep 0.1
+    done
+}
+
+
+u=0 # dtest option globals read by testcase(); 0 means the option is not passed to dtest
+w=0
+B=0
+b=0
+S=0
+D=0
+
+function testcase(){ # Run one dtest case between $server and $client with $provider; options come from the globals u w B b S W D
+ # Turn each option global into its dtest command-line fragment (empty when the value is 0)
+ if [ $u -ne 0 ]; then
+ u_for_test="-u"
+ else
+ u_for_test=""
+ fi
+ if [ $w -ne 0 ]; then
+ w_for_test="-w"
+ else
+ w_for_test=""
+ fi
+ if [ $B -ne 0 ]; then
+ B_for_test="-B $B"
+ else
+ B_for_test=""
+ fi
+ if [ $b -ne 0 ]; then
+ b_for_test="-b $b"
+ else
+ b_for_test=""
+ fi
+ if [ $S -ne 0 ]; then
+ S_for_test="-S $S"
+ else
+ S_for_test=""
+ fi
+ if [ $W -ne 0 ]; then
+ W_for_test="-W"
+ else
+ W_for_test=""
+ fi
+ if [ $D -ne 0 ]; then
+ if [ $do_not_validate_data_with_scif -eq 1 ]; then
+ return 0 # data check case is skipped and reported as success when this flag is 1
+ fi
+ D_for_test="-D -a -B 10"
+ else
+ D_for_test=""
+ fi
+
+ if [ $ctrl_c -ne 0 ]; then
+ echo -ne "\n*** Stop test due to ctrl c ***\n\n"
+ exit 1
+ fi
+
+ # Logs of a failed previous run are kept for debugging; remove them before starting a new run.
+ ssh root@$server "rm /tmp/dtest_ser_run.log" > /dev/null 2>&1
+ ssh root@$client "rm /tmp/dtest_cli_run.log" > /dev/null 2>&1
+
+ if [ $D -eq 1 ]; then
+ support_data_validation
+ if [ $dtest_support_data_val -ne 1 ]; then
+ return # server or client dtest has no data validation support: skip this case
+ fi
+ fi
+
+ # Start the server in the background, logging to /tmp/dtest_ser_run.log on $server
+ echo "----------------------------------------------------------"
+ echo "Test case: $W_for_test $D_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting"
+ echo -ne "Start $taskset_4_server dtest -P $provider server $server\r"
+ ssh root@$server "$export_str $taskset_4_server dtest -P $provider $W_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting $D_for_test >& /tmp/dtest_ser_run.log" &
+ ser_pid=$!
+
+ # Wait for server to be ready
+ wait_for_server_to_be_ready
+
+ if [ $i -eq 0 ]; then # wait_for_server_to_be_ready counted i down to 0 without seeing "waiting" in the server log
+ echo $server dtest failed - did not start
+ ssh root@$server "killall dtest"
+ ssh root@$client "killall dtest"
+ exit 1
+ fi
+
+ # Start the client in the background, logging to /tmp/dtest_cli_run.log on $client
+ echo -ne "Start $taskset_4_client dtest -P $provider client \r"
+ ssh root@$client "$export_str $taskset_4_client dtest -P $provider -h $server $W_for_test $u_for_test $w_for_test $B_for_test $b_for_test $S_for_test $v_for_test $user_srting $D_for_test >& /tmp/dtest_cli_run.log" &
+ cli_pid=$!
+
+ if [ $D -eq 1 ]; then
+ echo $server > $dog_ser
+ echo $client > $dog_cli
+ echo "1" > $dog_file # arm the watchdog: dog() greps both logs while this is 1
+ fi
+
+ # Wait for Server and Client to be done
+ wait $ser_pid $cli_pid
+
+ if [ $D -eq 1 ]; then
+ echo "0" > $dog_file # disarm the watchdog
+ fi
+
+ # Pass criteria: exactly one PASSED line and no ERR line in each log
+ server_pass=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c PASSED"`
+ client_pass=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c PASSED"`
+ server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"`
+ client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"`
+ do_exit=0
+ if [ $ctrl_c -ne 0 ]; then
+ ssh root@$server "killall -9 dtest" > /dev/null 2>&1
+ ssh root@$client "killall -9 dtest" > /dev/null 2>&1
+ do_exit=1
+ fi
+
+ if [ $server_pass -ne 1 ] || [ $server_err -ne 0 ]; then
+ echo "****** ERROR - $server server failed (with $client client) *******"
+ echo " log file: /tmp/dtest_ser_run.log on $server"
+ do_exit=1
+ fi
+
+ if [ $client_pass -ne 1 ] || [ $client_err -ne 0 ]; then
+ echo "****** ERROR - $client client failed (with $server server) *******"
+ echo " log file: /tmp/dtest_cli_run.log on $client"
+ do_exit=1
+ fi
+
+ if [ $do_exit -eq 1 ]; then
+ echo
+ exit 1
+ fi
+
+ # Performance mode only: extract BW / latency from the client log and record them in $log_file
+ if [ $perf_test -eq 1 ]; then
+ echo -ne " \r"
+ if [ $fast_test -eq 1 ]; then
+ if [ $W -ne 0 ]; then
+ # second run is latency test called with -W
+ lat=`ssh root@$client cat /tmp/dtest_cli_run.log | grep PingPong | awk -F "latency " '{print $2}' | awk -F " us" '{ print $1 }'`
+ res="$lat, Tx size=$res" # $res still holds the BW text saved by the preceding non -W run
+ echo "latency: $res"
+ echo $res >> $log_file
+ else
+ # First test for BW
+ res=`ssh root@$client cat /tmp/dtest_cli_run.log | grep direction | awk -F "00 x " '{ print $2 }'`
+ fi
+ else
+ if [ $W -ne 0 ]; then
+ # second run is latency test called with -W
+ lat=`ssh root@$client cat /tmp/dtest_cli_run.log | grep PingPong | awk -F "latency " '{print $2}' | awk -F " us" '{ print $1 }'`
+ echo -e "Byte size: $b\t\tlatency: $lat\t\tBW: $res"
+ res=`echo $res | awk -F " MB" '{ print $1 }'`
+ res=$(printf "%15s" $res)
+ lat=$(printf "%10s" $lat)
+ echo -e "$b\t\t$lat\t\t$res" >> $log_file
+ else
+ # First test for BW
+ res=`ssh root@$client cat /tmp/dtest_cli_run.log | grep direction | awk -F "00 x $b, " '{ print $2 }'`
+ fi
+ fi
+ fi
+ ssh root@$server "rm /tmp/dtest_ser_run.log"
+ ssh root@$client "rm /tmp/dtest_cli_run.log"
+
+ echo "Test case passed "
+
+ read -t 0.01 -n 1 -s u_input # non-blocking check for a key press
+ ret=$?
+ if [ $ret -eq 0 ] && [ "$u_input" == "i" ]; then # pressing "i" prints the round info
+ print_round_info
+ fi
+
+ return 0
+}
+
+
+# Poll a remote log file until a given string shows up.
+# Inputs (globals):
+#   wait_for_it_machine - host to query over ssh
+#   wait_for_it_file    - log file on that host
+#   wait_for_it_string  - text to look for; done when exactly one line matches
+#   ctrl_c              - a non-zero value aborts the wait
+# Outputs (globals):
+#   test_run_time       - seconds spent waiting (set on success only)
+# Polls at most 900 times: every 0.1 s for the first 20 polls, every 1 s
+# for the next 20, every 3 s after that.
+# On timeout or ^C it kills dapltest on $server and $client and exits the
+# script with status 1.
+function wait_for_it(){
+    max_wait=900
+    i=$max_wait
+    sleep_for=0.1
+    test_start_time=`date +%s`
+    until [ $i -eq 0 ]; do
+        echo -n "."
+        sleep $sleep_for
+        up=`ssh root@$wait_for_it_machine cat $wait_for_it_file | grep -c "$wait_for_it_string"`
+        if [ $up -eq 1 ]; then
+            break;
+        fi
+        let "i = i - 1"
+        if [ $ctrl_c -ne 0 ]; then
+            i=0
+        fi
+        # back off the polling rate as the wait drags on
+        if [ $i -eq $(( $max_wait - 20 )) ]; then
+            sleep_for=1
+        fi
+        if [ $i -eq $(( $max_wait - 40 )) ]; then
+            sleep_for=3
+        fi
+    done
+
+    if [ $i -eq 0 ]; then
+        if [ $ctrl_c -ne 0 ]; then
+            echo -ne "\n\t*** Stop test due to ctrl c ***\n\n"
+        else
+            echo " failed"
+            echo -e "\n\n\tDid not find $wait_for_it_string string on machine: $wait_for_it_machine at file $wait_for_it_file - EXIT\n\n"
+        fi
+        ssh root@$server killall dapltest > /dev/null 2>&1
+        ssh root@$client killall dapltest > /dev/null 2>&1
+        # A bare "exit" would hand back the status of the killall above,
+        # which can be 0. Always report failure, as testcase does.
+        exit 1
+    fi
+    test_end_time=`date +%s`
+    test_run_time=$(($test_end_time-$test_start_time))
+    echo " done in $test_run_time sec"
+}
+
+
+# Print a banner with the current round number ($runs) and the time
+# elapsed since $start_time, shown as hours, minutes and seconds.
+function print_round_info(){
+    now=$(date +%s)
+    run_time=$(( now - start_time ))
+    hh=$(( run_time / 3600 ))
+    mm=$(( (run_time / 60) % 60 ))
+    ss=$(( run_time % 60 ))
+    echo "**************************************************************"
+    echo -e "\tin round $runs - $hh h $mm m $ss s"
+    echo "**************************************************************"
+}
+
+
+# Check if client and server dtest support data validation
+# Probe whether dtest on both $server and $client knows the data check
+# option: run "dtest -U" on each side and look for exactly one
+# "validate data" line in the captured usage text.
+# Sets the global dtest_support_data_val to 1 when both sides qualify,
+# to 0 otherwise. The client log is only inspected if the server passed.
+function support_data_validation() {
+    dtest_support_data_val=0
+
+    ssh root@$server "dtest -U >& /tmp/dtest_ser_run.log"
+    ssh root@$client "dtest -U >& /tmp/dtest_cli_run.log"
+    sleep .1
+
+    ser_is_valid=$(ssh root@$server cat /tmp/dtest_ser_run.log | grep -c "validate data")
+    [ $ser_is_valid -ne 1 ] && return 0
+
+    cli_is_valid=$(ssh root@$client cat /tmp/dtest_cli_run.log | grep -c "validate data")
+    [ $cli_is_valid -ne 1 ] && return 0
+
+    dtest_support_data_val=1
+}
+
+
+# Run dtest in all data size ping pong test with data validation mode between client and server
+function server_client_data_validation_test(){ # runs dtest -D -a -B 100 between $server and $client; exits the script with 1 on failure
+
+ echo -e "\n\n\n\t**** dtest data validation test\t\tprovider: $provider\t\tserver: $server $taskset_4_server\t\tclient: $client $taskset_4_client ****\n"
+ support_data_validation
+ if [ $dtest_support_data_val -ne 1 ]; then
+ echo -e "\t**** $client or $server dtest does not support data validation - skipping ****"
+ return
+ fi
+
+ echo -e " Start $taskset_4_server dtest -P $provider -D -a on server $server"
+ ssh root@$server "$export_str $taskset_4_server dtest -P $provider -D -a -B 100 >& /tmp/dtest_ser_run.log" &
+ ser_pid=$!
+ wait_for_server_to_be_ready # NOTE(review): unlike testcase, the result ($i -eq 0 on timeout) is not checked here
+
+ echo -e " Start $taskset_4_client dtest -P $provider -D -a on client $client"
+ ssh root@$client "$export_str $taskset_4_client dtest -P $provider -h $server -D -a -B 100 >& /tmp/dtest_cli_run.log" &
+ cli_pid=$!
+ # give the log files on server and client a moment to appear before waking up the dog
+ sleep 1
+
+ echo $server > $dog_ser
+ echo $client > $dog_cli
+ echo "1" > $dog_file # arm the watchdog: dog() greps both logs while this is 1
+
+ # Wait for Server and Client to be done
+ wait $ser_pid $cli_pid
+
+ echo "0" > $dog_file # disarm the watchdog
+ echo
+ # Pass criteria: exactly one PASSED line and no ERR line in each log
+ server_pass=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c PASSED"`
+ client_pass=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c PASSED"`
+ server_err=`ssh root@$server "cat /tmp/dtest_ser_run.log | grep -c ERR"`
+ client_err=`ssh root@$client "cat /tmp/dtest_cli_run.log | grep -c ERR"`
+ do_exit=0
+ if [ $ctrl_c -ne 0 ]; then
+ ssh root@$server "killall -9 dtest" > /dev/null 2>&1
+ ssh root@$client "killall -9 dtest" > /dev/null 2>&1
+ do_exit=1
+ fi
+
+ if [ $server_pass -ne 1 ] || [ $server_err -ne 0 ]; then
+ echo "****** ERROR - $server server failed (with $client client) *******"
+ echo " log file: /tmp/dtest_ser_run.log on $server"
+ do_exit=1
+ fi
+
+ if [ $client_pass -ne 1 ] || [ $client_err -ne 0 ]; then
+ echo "****** ERROR - $client client failed (with $server server) *******"
+ echo " log file: /tmp/dtest_cli_run.log on $client"
+ do_exit=1
+ fi
+
+ if [ $do_exit -eq 1 ]; then
+ echo
+ exit 1
+ fi
+
+ echo -e "\n\tdtest data validation test\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTEST PASSED\n\n"
+
+}
+
+
+# Run dapltest between client and server
+function server_client_dapl_test(){ # runs the dapltest sequence (test 1, plus tests 2-6 unless fast_test) between $server and $client
+ ofa_post=""
+ dapl_test_rep=$dapl_test_rep_max
+ if [ $ctrl_c -ne 0 ]; then
+ echo -ne "\n*** Stop test due to ctrl c ***\n\n"
+ exit 1
+ fi
+
+ echo "----------------------------------------------------------"
+ echo -ne "\t**** dapltest\t\tprovider: $provider\t\tserver: $server\t\tclient: $client "
+
+ # Logs of a failed previous run are kept for debugging; remove them before starting a new run.
+ ssh root@$server "rm /tmp/dapltest_ser_run.log" > /dev/null 2>&1
+ ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+
+ # 1. skip if the provider name contains "roe"
+ # 2. check that the provider's dat.conf entry uses libdaplofa or libdaploscm
+ is_roe=`echo $provider | grep -c roe`
+ if [ $is_roe -eq 1 ]; then
+ good_provider_for_dapltest=0
+ echo -e " - provider $provider not supported - skipping ****"
+ echo "----------------------------------------------------------"
+ return 0
+ fi
+ is_ofa=`ssh root@$server cat $dat_conf | grep $provider | grep -c libdaplofa`
+ is_scm=`ssh root@$server cat $dat_conf | grep $provider | grep -c libdaploscm`
+ if [ $is_ofa -eq 0 ] && [ $is_scm -eq 0 ]; then
+ good_provider_for_dapltest=0
+ echo -e " - provider $provider not supported - skipping ****"
+ echo "----------------------------------------------------------"
+ return 0
+ fi
+ if [ $is_ofa -eq 1 ]; then
+ dat_line=`ssh root@$server cat $dat_conf | grep $provider`
+ ofa_post=`echo $dat_line | grep lofa | awk '{ print $1 }' | awk -F "ofa-v2" '{ print $2 }'` # text after "ofa-v2" in the provider name; appended to the server name passed to -s
+ fi
+ ran_one_dapltest=1
+
+ # start server
+ wait_for_it_machine=$server
+ wait_for_it_file="/tmp/dapltest_ser_run.log"
+ wait_for_it_string="Dapltest: Service Point Ready"
+ echo -e " ****\n----------------------------------------------------------"
+ echo -e "dapltest\tprovider: $provider\tserver: $server\tclient: $client"
+ echo -ne "start dapltest server..."
+ ssh root@$server "dapltest -T S -D $provider >& /tmp/dapltest_ser_run.log" &
+ wait_for_it
+
+ # client tests: each one is considered done when "Total WQE" shows up in the client log
+ wait_for_it_machine=$client
+ wait_for_it_file="/tmp/dapltest_cli_run.log"
+ wait_for_it_string="Total WQE"
+ # test 1
+ echo -ne "start dapltest client test 1 ..."
+ ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server SR 256 >& /tmp/dapltest_cli_run.log" &
+ wait_for_it
+
+ if [ $fast_test -eq 0 ]; then
+ # test 2
+ if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 4 ]; then # test_run_time was set by the preceding wait_for_it
+ dapl_test_rep=$(($dapl_test_rep/$test_run_time/8))
+ if [ $dapl_test_rep -eq 0 ]; then
+ dapl_test_rep=1
+ fi
+ echo Reduce rep to $dapl_test_rep
+ fi
+ echo -ne "start dapltest client test 2 ..."
+ ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+ ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RW 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+ wait_for_it
+
+ # test 3
+ echo -ne "start dapltest client test 3 ..."
+ ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+ ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+ wait_for_it
+
+ # test 4
+ echo -ne "start dapltest client test 4 ..."
+ ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+ ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 1 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+ wait_for_it
+
+ # test 5
+ if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 2 ]; then
+ dapl_test_rep=$(($dapl_test_rep/8))
+ if [ $dapl_test_rep -eq 0 ]; then
+ dapl_test_rep=1
+ fi
+ echo Reduce rep to $dapl_test_rep
+ fi
+ echo -ne "start dapltest client test 5 ..."
+ ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+ ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 1 -w 8 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+ wait_for_it
+
+ if [ $dapl_test_rep -ne 1 ] && [ $test_run_time -ge 2 ]; then
+ dapl_test_rep=$(($dapl_test_rep/4))
+ if [ $dapl_test_rep -eq 0 ]; then
+ dapl_test_rep=1
+ fi
+ echo Reduce rep to $dapl_test_rep
+ fi
+ # test 6
+ echo -ne "start dapltest client test 6 ..."
+ ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+ ssh root@$client "dapltest -T T -s $server$ofa_post -D $provider -i $dapl_test_rep -t 4 -w 8 client SR 256 server RW 4096 server SR 256 client SR 256 server RW 4096 server SR 256 client SR 4096 server SR 256 >& /tmp/dapltest_cli_run.log" &
+ wait_for_it
+ fi
+
+ # stop server: the client sends a -T Q request, then we wait for "Exiting" in the server log
+ echo -n "stop dapltest server..."
+ ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+ ssh root@$client "dapltest -T Q -s $server$ofa_post -D $provider >& /tmp/dapltest_cli_run.log" &
+ cli_pid=$!
+
+ wait_for_it_machine=$server
+ wait_for_it_file="/tmp/dapltest_ser_run.log"
+ wait_for_it_string="Exiting"
+ echo -n "wait for dapltest server to stop..."
+ wait_for_it
+
+ # Wait for the client's quit request to finish
+ wait $cli_pid
+
+ # clean up
+ ssh root@$server "rm /tmp/dapltest_ser_run.log" > /dev/null 2>&1
+ ssh root@$client "rm /tmp/dapltest_cli_run.log" > /dev/null 2>&1
+
+ echo -e "\tdapltest\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTESTS PASSED"
+ echo -e "----------------------------------------------------------\n"
+}
+
+
+# Run all the test cases between two machines.
+function server_host_test(){ # runs every dtest option combination, then the data validation and dapltest stages, for one $server/$client pair
+ taskset_4_server=""
+ taskset_4_client=""
+ if [ $perf_test -eq 1 ]; then
+ is_mic=`echo $server | grep -c mic`
+ if [ $is_mic -eq 0 ] && [ "$cpu_mask" != "no_cpu_mask" ]; then # taskset is applied only to machines without "mic" in their name
+ taskset_4_server="taskset $cpu_mask "
+ fi
+ is_mic=`echo $client | grep -c mic`
+ if [ $is_mic -eq 0 ] && [ "$cpu_mask" != "no_cpu_mask" ]; then
+ taskset_4_client="taskset $cpu_mask "
+ fi
+
+ echo -e "\n**** dtest: provider: $provider \tserver: $server \tclient: $client ****\n" >> $log_file
+ if [ $fast_test -eq 0 ]; then
+ echo -e "\nBytes\t\t Latency\t\t\t MB/s" >> $log_file
+ fi
+ fi
+
+ if [ "$dapl_test_user_input" != "o" ]; then # "o" means run only the dapl tests, so the dtest stage is skipped
+ echo -e "\n\n\n\t**** dtest\t\tprovider: $provider\t\tserver: $server $taskset_4_server\t\tclient: $client $taskset_4_client ****"
+
+ # a loop value of zero makes testcase leave that option out, so dtest uses its default
+ for u in $u_options; do
+ for w in $w_options; do
+ for b in $b_options; do
+ for S in $S_options; do
+ for B in $B_options; do
+ for D in $D_options; do
+ for W in $W_options; do # Keep W innermost: the -W (latency) run in testcase reuses $res from the preceding non -W (BW) run. See Note 1 in the file header.
+ # Run one test case between Client and Server.
+ testcase
+ ret=$?
+ if [ $ret -ne 0 ]; then
+ echo TEST FAILED
+ exit 1
+ fi
+ sleep 1
+ done
+ done
+ done
+ done
+ done
+ done
+ done
+
+ echo -e "\n\tdtest\t\tserver: $server\t\tclient: $client\t\tprovider: $provider\t\tTEST PASSED\n\n"
+
+ if [ $perf_test -ne 1 ] && [ $do_not_validate_data_with_scif -eq 0 ] && [ $fast_test -ne 1 ]; then
+ server_client_data_validation_test
+ fi
+ fi
+
+ if [ "$dapl_test_user_input" != "n" ] && [ $good_provider_for_dapltest -eq 1 ] && [ $fast_test -ne 1 ]; then # "n" means no dapl tests
+ server_client_dapl_test
+ fi
+
+}
+
+# Print the usage text, including the current $server_list, $client_list and
+# $def_provider values, then terminate the script.
+# Note that the function always exits with status 1, so it never returns to
+# its caller.
+function help(){
+ echo -e "\n\tRun dtest and dapltest accross cluster - from each client to each server\n"
+ echo -e "\t\tServer list: $server_list"
+ echo -e "\t\tClient list: $client_list\n"
+ echo -e "\t-P <PROVIDER NAME> : Provider name or 'ALL' for all prividers (default $def_provider)"
+ echo -e "\t-f: Fast test"
+ echo -e "\t-l <NUM> : How many test loops to run. Def forever"
+ echo -e "\t-t <NUM> : How many minutes to run. Def forever"
+ echo -e "\t-p <CPUs mask> or \"no_cpu_mask\": Performance test"
+ echo -e "\t\tMask in 0xHEX format. should match host's /sys/class/mic/mic0/device/local_cpus"
+ echo -e "\t\tFor no CPU mask enter \"no_cpu_mask\""
+ echo -e "\t\tConsider also: taskset mpxyd, set mcm_affinity to 2 in /etc/mpxyd.conf, performance mode at the host scaling_governor"
+ echo -e "\t\tConsider also to change DAPL MTU (-M optoin)"
+ echo -e "\t-w: Write only test"
+ echo -e "\t-u: uni-direction only test"
+ echo -e "\t-d <n|y|o> : dapl test options. \"n\": No dapl tests. \"y\": Run dapl tests. \"o\": Run Only dapl tests (no dtest). Def: Run dapl_test"
+ echo -e "\t-M <NUM> : DAPL MTU"
+ echo -e "\t-b <NUM> : data size. Can be: one size, many sizes as a string or type \`all\` for all sizes power of 2"
+ echo -e "\t-U: \"user string\". user dtest option string ( -w -b -u and -S dtest options )"
+ echo -e "\t-z: use zero for -w -b -u and -S dtest options (zero mean test default value)"
+ echo -e "\t-i: No inline data test"
+ echo -e "\t-m: Force MFO test"
+ echo -e "\t-D: DAPL debug print in log files"
+ echo -e "\t-v: dtest verbose mode"
+ echo -e "\t-q: qib test over mlx4 (same as -m and -i options)"
+ echo -e "\t-V: Print the script version"
+ echo -e "\t-h: help"
+ # Keys that can be used interactively while the suite is running.
+ echo -e "\n\tWhile test is running:"
+ echo -e "\t^c: Exit gracefully"
+ echo -e "\t^c^c: Forced exit"
+ echo -e "\ti: Print round number and time duration"
+ echo -e "\n\n"
+ exit 1
+}
+
+
+# Output helper for the provider search.
+# When provider_search_debug is 1 the arguments are printed with "echo -e";
+# in every other case a single "." (no newline) is printed as a progress tick.
+function log(){
+  [ $provider_search_debug -eq 1 ] || { echo -n "."; return; }
+  echo -e "$@"
+}
+
+
+# Discover the DAT providers that can be tested on every host.
+#
+# For each host in $host_list (queried over ssh as root):
+#   - $dat_conf must exist, otherwise the script exits with status 1;
+#   - every active port of every device listed by ibv_devices contributes the
+#     $dat_conf entries whose line contains "<device> <port>";
+#   - every network interface that "ip addr show" reports as infiniband and
+#     that has exactly one "inet" (non-inet6) line contributes the entries
+#     whose line contains "<interface> 0".
+# The per-host lists are intersected and the result is left in the global
+# hosts_providers_list, which is also printed. Exits with status 1 when the
+# intersection is empty.
+function providers_search(){
+  local prev_unique host_unique
+
+  echo -e "\nSearching for devices"
+  first_host=1
+  for host in $host_list; do
+    # make sure the host's dat file exists
+    dat_conf_found="NOT found"
+    ssh root@$host "[ -f $dat_conf ]" && dat_conf_found="dat_conf_found"
+    if [ "$dat_conf_found" == "dat_conf_found" ]; then
+      log "$dat_conf found on $host"
+    else
+      echo -e "\n\t$dat_conf was not found on $host.\n\n"
+      exit 1
+    fi
+
+    # ib devices list (the first two lines of the ibv_devices output are skipped)
+    dev_list=`ssh root@$host ibv_devices | tail -n +3 | awk '{ print $1 }'`
+    for dev in $dev_list; do
+      # for each device
+      log Found $dev device
+      port_cnt=`ssh root@$host ibv_devinfo -d $dev | grep phys_port_cnt | awk '{print$2 }'`
+      log " $dev phys_port_cnt: $port_cnt"
+      for port in $(seq 1 $port_cnt); do
+        # for each port
+        log " checking $dev port $port status"
+        up=`ssh root@$host ibv_devinfo -d $dev -i $port | grep state | grep -c PORT_ACTIVE`
+        if [ $up -ne 1 ]; then
+          # Report the port that was just checked; $i is an unrelated loop
+          # variable and would print a stale or empty value here.
+          log " $dev port $port is not active"
+          continue
+        fi
+        log " $dev port $port is active"
+        log " add it to the list"
+        # get a list of providers that this device can use
+        providers+=`ssh root@$host cat $dat_conf | grep "$dev $port" | awk '{ print $1 }'`
+        providers+=" "
+      done
+    done
+
+    # add network ib devices
+    net_dev_list=` ssh root@$host netstat -i | grep -v "no statistics available" | tail -n +3 | awk '{ print $1 }'`
+    for net_dev in $net_dev_list; do
+      # for each net device
+      log Found $net_dev net device
+      is_ib=`ssh root@$host ip addr show $net_dev | grep -c infiniband`
+      if [ $is_ib -ne 1 ]; then
+        log " $net_dev net device is not ib device"
+        continue
+      fi
+      log " $net_dev is infiniband device"
+      has_ip_addr=`ssh root@$host ip addr show $net_dev | grep inet | grep -vc inet6`
+      if [ $has_ip_addr -ne 1 ]; then
+        log " $net_dev net device has no ip addr"
+        continue
+      fi
+      log " $net_dev net device has IP address"
+      log " add it to the list"
+      # get a list of providers that this device can use
+      providers+=`ssh root@$host cat $dat_conf | grep "$net_dev 0" | awk '{ print $1 }'`
+      providers+=" "
+    done
+
+    log; log -n "$host povider list: "; for i in $providers; do log -n "$i "; done; log
+    if [ $first_host -eq 1 ]; then
+      # just save providers from first host
+      hosts_providers_list=$providers
+      first_host=0
+    else
+      # Merge providers from prev hosts with the ones from the new host and
+      # keep only the providers that are on both lists.
+      # Each side is reduced to unique names first, so "uniq -d" reports
+      # exactly the names present on both sides (a name listed twice on one
+      # host must not count as common). Passing the two lists as separate
+      # words also keeps the last name of the previous list from fusing with
+      # the first name of the new one.
+      log hosts p from prev hosts: $hosts_providers_list
+      prev_unique=$(for p in $hosts_providers_list; do echo $p; done | sort -u)
+      host_unique=$(for p in $providers; do echo $p; done | sort -u)
+      hosts_providers_list=$(for p in $prev_unique $host_unique; do echo $p; done | sort | uniq -d)
+      log hosts p after merge: $hosts_providers_list
+    fi
+    providers=""
+  done
+
+  cnt=0
+  echo -e "\nPovider list:"
+  for i in $hosts_providers_list; do
+    echo $i
+    cnt=$(( cnt + 1 ))
+  done
+  if [ $cnt -eq 0 ]; then
+    echo -e "no devices where found\n\n"
+    # Nothing to test is an error: report it through the exit status.
+    exit 1
+  fi
+  echo -e "Total $cnt providers\n\n"
+}
+
+
+# check if the "server-client-provider" combination is OK
+# Set server_client_provider_is_not_valid_combo to one if not a valid combo
+#
+# Reads the globals $provider, $server and $client. A name containing "mic"
+# is treated as a MIC card. The checks run in order and the first one that
+# applies decides the result:
+#   1. scif providers: server and client must share the same name in front
+#      of "-mic".
+#   2. qib providers with a MIC on either side: the provider name must
+#      contain "m".
+#   3. "-ib" providers: the interface named after "ofa-v2-" must appear in
+#      the ifconfig output of every MIC taking part (server and client).
+function check_provider_server_client_combo(){
+  server_client_provider_is_not_valid_combo=0
+
+  # 1. scif providers can only run on the same machine.
+  is_scif=`echo $provider | grep -c scif`
+  if [ $is_scif -eq 1 ]; then
+    server_host=`echo $server | awk -F "-mic" '{ print $1 }'`
+    client_host=`echo $client | awk -F "-mic" '{ print $1 }'`
+    if [ "$server_host" != "$client_host" ]; then
+      server_client_provider_is_not_valid_combo=1
+    fi
+    return
+  fi
+
+  # 2. MIC qib can only run mcm provider
+  is_ser_mic=`echo $server | grep -c mic`
+  is_cli_mic=`echo $client | grep -c mic`
+  if [ $is_ser_mic -eq 1 ] || [ $is_cli_mic -eq 1 ]; then
+    # MIC Server or Client
+    is_qib_provider=`echo $provider | grep -c qib`
+    if [ $is_qib_provider -eq 1 ]; then
+      # Server or Client is MIC AND qib provider - make sure provider is MCM
+      # NOTE(review): any "m" in the provider name counts as MCM here -
+      # confirm this matches the provider naming used in dat.conf.
+      is_mcm=`echo $provider | grep -c m`
+      [ $is_mcm -eq 1 ] || server_client_provider_is_not_valid_combo=1
+      return
+    fi
+  fi
+
+  # 3. check if MICs ib interface is UP
+  is_ib_provider=`echo $provider | grep -ce -ib`
+  if [ $is_ib_provider -eq 1 ]; then
+    interface=`echo $provider | awk -F "ofa-v2-" '{ print $2 }'`
+    if [ $is_ser_mic -eq 1 ]; then
+      up=`ssh root@$server ifconfig | grep -c $interface`
+      if [ $up -eq 1 ]; then
+        # Server side is fine; do not return yet, a MIC client still has
+        # to be checked below.
+        :
+      else
+        server_client_provider_is_not_valid_combo=1
+        return
+      fi
+    fi
+    if [ $is_cli_mic -eq 1 ]; then
+      up=`ssh root@$client ifconfig | grep -c $interface`
+      [ $up -eq 1 ] || server_client_provider_is_not_valid_combo=1
+      return
+    fi
+  fi
+}
+
+
+
+
+
+
+while getopts uviVzDmfwhiql:t:P:U:p:d:M:b: option
+do
+ case "${option}"
+ in
+ P) user_provider=${OPTARG};;
+ m) no_inline_data=1 ; mfo_test=1;;
+ f) fast_test=1; loops=1; fast_test_str=" fast test";;
+ p) cpu_mask=${OPTARG}; perf_test=1; W_options="0 1";;
+ U) user_srting=${OPTARG}; b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0";;
+ z) b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0";;
+ w) w_options="1";;
+ u) unidirection_test=1; u_options="1";;
+ D) debug_info=1;;
+ d) dapl_test_user_input=${OPTARG};;
+ v) v_for_test=" -v ";;
+ i) no_inline_data=1;;
+ q) no_inline_data=1 ; mfo_test=1;;
+ t) max_run_time=${OPTARG};;
+ M) dapl_mtu=${OPTARG};;
+ l) loops=${OPTARG};;
+ b) user_b_options=${OPTARG};;
+ V) echo -e "\n\t${0##*/} version $script_version\n\n"; exit;;
+ h) help;;
+ esac
+done
+
+# Derive the effective dtest option lists from the parsed flags and reject
+# invalid option values. Every rejection exits with a non-zero status.
+
+# Fast test: every option list collapses to the single value 0.
+if [ $fast_test -eq 1 ]; then
+  b_options="0"; u_options="0"; S_options="0"; w_options="0"; B_options="0"; D_options="0";
+fi
+
+# Performance test: one loop, fixed option lists, no dapltest, and "-p"
+# appended to the user option string.
+if [ $perf_test -eq 1 ]; then
+  b_options=$b_options_for_perf_test
+  u_options="0"
+  S_options="0"
+  loops=1
+  w_options="1"
+  B_options="0"
+  # Keep the text given with -U and append -p. The variable is spelled
+  # "user_srting" wherever it is assigned in this script; reading a
+  # differently spelled name here would silently drop the user's string.
+  user_srting="$user_srting -p"
+  dapl_test_user_input="n"
+  D_options="0"
+  # The mask must contain "0x" (any case) unless it is the literal "no_cpu_mask".
+  legit_input=`echo $cpu_mask | grep -ci 0x`
+  if [ $legit_input -ne 1 ] && [ "$cpu_mask" != "no_cpu_mask" ]; then
+    echo -e "\n\t< 0xCPUs_mask > or \"no_cpu_mask\" in option -p is missing - input=$cpu_mask - Exit\n\n"
+    exit 1
+  fi
+fi
+
+if [ $fast_test -eq 1 ] && [ $perf_test -eq 1 ]; then
+  b_options="0"
+fi
+
+if [ $unidirection_test -eq 1 ]; then
+  u_options="1"
+fi
+
+# -b overrides the data sizes chosen above; "all" selects the perf test sizes.
+if [ "$user_b_options" != "none" ]; then
+  if [ "$user_b_options" == "all" ]; then
+    b_options=$b_options_for_perf_test
+  else
+    b_options="$user_b_options"
+  fi
+fi
+
+if [ "$dapl_test_user_input" != "n" ] && [ "$dapl_test_user_input" != "y" ] && [ "$dapl_test_user_input" != "o" ]; then
+  echo -e "\n\tdapl test option must be n/y/o - Exit\n\n"
+  exit 1
+fi
+
+# check mpxyd is running on host machines.
+# Any failed check stops the script with a non-zero exit status.
+for host in $host_list; do
+  # NOTE(review): exactly 3 matching lines are expected from the remote
+  # "ps ax | grep -c mpxyd"; presumably the daemon plus the remote shell and
+  # the grep itself - confirm before changing the command.
+  up=`ssh root@$host "ps ax | grep -c mpxyd"`
+  if [ $up -ne 3 ]; then
+    echo -e "\n\tERROR - mpxyd is not running on $host\n\n"
+    exit 1
+  fi
+  if [ $no_inline_data -eq 1 ]; then
+    # The no-inline test needs exactly one "inline threshold 0" line in the mpxyd log.
+    up=`ssh root@$host cat /var/log/mpxyd.log | grep -c "RDMA IB inline threshold 0"`
+    if [ $up -ne 1 ]; then
+      echo on host $host you need to run mpxyd with mcm_ib_inline 0 in /etc/mpxyd.conf file for no inline data test
+      exit 1
+    fi
+  fi
+done
+
+# "ALL"/"all" searches for the providers common to every host; any other value
+# is used as the provider list as given.
+# $user_provider is quoted so that an empty value or a value holding several
+# provider names does not make the test command itself fail.
+if [ "$user_provider" == "ALL" ] || [ "$user_provider" == "all" ]; then
+  providers_search
+else
+  hosts_providers_list=$user_provider
+fi
+
+# Show which machines take part in the run.
+echo -e "\nServer list: $server_list"
+echo -e "Client list: $client_list"
+echo -e "Host list:"
+for i in $host_list; do echo $i; done
+echo
+
+# Assemble export_str: a sequence of "export VAR=value ; " statements selected
+# by the test mode flags. It stays empty when no flag applies.
+export_str=""
+if [ $mfo_test -eq 1 ]; then
+  export_str="export DAPL_MAX_INLINE=0 ; export DAPL_MCM_MFO=1 ; "
+  echo -ne "\n\t\t**** Running MFO test case \t\t$export_str ****\n\n"
+elif [ $no_inline_data -eq 1 ]; then
+  export_str="export DAPL_MAX_INLINE=0 ; "
+  echo -ne "\n\t\t**** Running no inline data test case \t\t$export_str ****\n\n"
+fi
+
+# Debug output and a non-default MTU each append one more export statement.
+if [ $debug_info -eq 1 ]; then
+  export_str+=" export DAPL_DBG_TYPE=0xffffffff ; "
+  echo -ne "\n\t\t**** Running in debug mode\t\texport value: $export_str ****\n\n"
+fi
+if [ $dapl_mtu -ne 0 ]; then
+  export_str+=" export DAPL_IB_MTU=$dapl_mtu ; "
+  echo -ne "\n\t\t**** Setting DAPL_IB_MTU to $dapl_mtu \t\texport value: $export_str ****\n\n"
+fi
+
+# Announce how long the suite will run: by iterations, by minutes, or forever.
+[ $loops -ne 0 ] && echo -e "\n\tRunning$fast_test_str for $loops iterations"
+
+[ $max_run_time -ne 0 ] && echo -e "\n\tRunning$fast_test_str for $max_run_time minutes"
+
+if [ $loops -eq 0 ] && [ $max_run_time -eq 0 ]; then
+  echo -ne "\n\tRunning$fast_test_str forever\n\n"
+fi
+
+if [ $perf_test -eq 1 ]; then
+ if [ $unidirection_test -eq 1 ]; then
+ log_file+="unidirection_test-"
+ else
+ log_file+="bidirection_test-"
+ fi
+ log_file+=`date +%F-%H-%M-%S`
+ echo -e "\n\tRunning performance test with cpu mask: $cpu_mask\n\tOutput file: $log_file"
+ echo "Server list: $server_list" > $log_file
+ echo "Client list: $client_list" >> $log_file
+ echo "CPU mask: $cpu_mask" >> $log_file
+ if [ $dapl_mtu -ne 0 ]; then
+ echo "DAPL_IB_MTU: $dapl_mtu" >> $log_file
+ else
+ echo "DAPL_IB_MTU: Default value" >> $log_file
+ fi
+ if [ $unidirection_test -eq 1 ]; then
+ echo "Test type: unidirection test" >> $log_file
+ else
+ echo "Test type:bidirection test" >> $log_file
+ fi
+ for host in $host_list; do
+ op_poll=`ssh root@$host cat /var/log/mpxyd.log | grep -c "OP thread polling enabled"`
+ if [ $op_poll -ne 1 ]; then
+ echo "OP thread polling on $host: disabled" >> $log_file
+ echo -e "\tOP thread polling on $host: disabled"
+ else
+ echo "OP thread polling on $host: enabled" >> $log_file
+ echo -e "\tOP thread polling on $host: enabled"
+ fi
+ done
+ echo -e "\n\n"
+fi
+# Reset the watchdog file and start dog in the background.
+# NOTE(review): dog and $dog_file are defined outside this part of the script;
+# their exact role is not visible here.
+echo "0" > $dog_file
+dog &
+
+sleep 1
+start_time=`date +%s`
+
+# Main loop: one pass ("round") runs every provider against every
+# server/client pair. The loop ends when the requested run time or number of
+# rounds is reached; with neither limit set it runs forever.
+while [ 1 ]; do
+ # Break the elapsed seconds down into days/hours/minutes/seconds for display;
+ # total_run_time_in_min keeps the undivided minute count for the -t limit.
+ now=`date +%s`
+ run_time=$(($now-$start_time))
+ ss=$(($run_time%60))
+ mm=$(($run_time/60))
+ total_run_time_in_min=$mm
+ mm=$(($mm%60))
+ hh=$(($run_time/3600))
+ dd=$(($hh/24))
+ hh=$(($hh%24))
+ pp=$(printf "%d days %d hours %d min and %d sec" $dd $hh $mm $ss)
+
+ echo
+ echo
+ echo "**************************************************************"
+ echo "**************************************************************"
+ echo Run time: $pp
+ # Stop conditions are checked before a new round is started.
+ if [ $max_run_time -ne 0 ] && [ $total_run_time_in_min -ge $max_run_time ]; then
+ echo -e "Ran for the $max_run_time minute requested by the user - Exiting\n\n"
+ break;
+ fi
+ if [ $loops -ne 0 ] && [ $runs -eq $loops ]; then
+ echo -e "Ran for the $loops iterations requested by the user - Exiting\n\n"
+ break;
+ fi
+ runs=$(( $runs + 1 ))
+ echo Starting round $runs
+ date
+ echo "**************************************************************"
+ echo "**************************************************************"
+ echo
+
+ # Runinng
+ for provider in $hosts_providers_list; do
+ # Non-zero for scif providers; server_host_test skips data validation then.
+ do_not_validate_data_with_scif=`echo $provider | grep -c scif`
+ good_provider_for_dapltest=1
+ for server in $server_list; do
+ for client in $client_list; do
+ # Skip combinations the provider cannot serve.
+ check_provider_server_client_combo
+ if [ $server_client_provider_is_not_valid_combo -ne 0 ]; then
+ #echo -e "***** ***** skipping test case: Server:$server Client:$client provider:$provider ***** *****"
+ continue
+ fi
+ # Run all test cases between Client and Server.
+ server_host_test
+ done
+ done
+
+ # A failing test exits the script inside server_host_test, so reaching this
+ # point means nothing failed for this provider.
+ # NOTE(review): ran_one_dapltest is set outside this part of the script.
+ if [ "$dapl_test_user_input" == "o" ] && [ $ran_one_dapltest -eq 0 ]; then
+ echo -e "\n\n\n\n\t\t***** ***** WARNING: only dapltest was set up but no dapltest was done with $provider provider $export_str ***** *****\n\n"
+ else
+ echo -e "\n\n\n\n\t\t***** ***** server client tests with $provider provider $export_str - TEST PASSED ***** *****\n\n"
+ fi
+ done
+done
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ofed/dapl.git
More information about the Pkg-ofed-commits
mailing list