[Pkg-ofed-commits] [perftest] 01/03: Imported Upstream version 1.2-OFED-1.4.2
Ana Beatriz Guerrero López
ana at moszumanska.debian.org
Wed Jul 2 14:01:26 UTC 2014
This is an automated email from the git hooks/post-receive script.
ana pushed a commit to branch master
in repository perftest.
commit 719537d322cddfd3d50832d5f88fe744d5432629
Author: Ana Guerrero López <ana at ekaia.org>
Date: Wed Jul 2 16:01:16 2014 +0200
Imported Upstream version 1.2-OFED-1.4.2
---
COPYING | 28 +
Makefile | 20 +
README | 144 +++++
clock_test.c | 25 +
get_clock.c | 190 +++++++
get_clock.h | 81 +++
perftest.spec | 53 ++
rdma_bw.c | 1276 +++++++++++++++++++++++++++++++++++++++++++
rdma_lat.c | 1307 ++++++++++++++++++++++++++++++++++++++++++++
read_bw.c | 1049 ++++++++++++++++++++++++++++++++++++
read_lat.c | 1108 ++++++++++++++++++++++++++++++++++++++
runme | 19 +
send_bw.c | 1489 +++++++++++++++++++++++++++++++++++++++++++++++++++
send_lat.c | 1375 +++++++++++++++++++++++++++++++++++++++++++++++
write_bw.c | 1182 ++++++++++++++++++++++++++++++++++++++++
write_bw_postlist.c | 1166 ++++++++++++++++++++++++++++++++++++++++
write_lat.c | 1094 +++++++++++++++++++++++++++++++++++++
17 files changed, 11606 insertions(+)
diff --git a/COPYING b/COPYING
new file mode 100755
index 0000000..1dae2f9
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,28 @@
+This software is available to you under a choice of one of two
+licenses. You may choose to be licensed under the terms of the GNU
+General Public License (GPL) Version 2, available from the file
+COPYING in the main directory of this source tree, or the
+OpenIB.org BSD license below:
+
+ Redistribution and use in source and binary forms, with or
+ without modification, are permitted provided that the following
+ conditions are met:
+
+ - Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the following
+ disclaimer.
+
+ - Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/Makefile b/Makefile
new file mode 100755
index 0000000..8042531
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+TESTS = write_bw_postlist rdma_lat rdma_bw send_lat send_bw write_lat write_bw read_lat read_bw
+UTILS = clock_test
+
+all: ${TESTS} ${UTILS}
+
+CFLAGS += -Wall -g -D_GNU_SOURCE -O2
+EXTRA_FILES = get_clock.c
+EXTRA_HEADERS = get_clock.h
+#The following seems to help GNU make on some platforms
+LOADLIBES +=
+LDFLAGS +=
+
+${TESTS}: LOADLIBES += -libverbs -lrdmacm
+
+${TESTS} ${UTILS}: %: %.c ${EXTRA_FILES} ${EXTRA_HEADERS}
+ $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $< ${EXTRA_FILES} $(LOADLIBES) $(LDLIBS) -o ib_$@
+clean:
+ $(foreach fname,${TESTS} ${UTILS}, rm -f ib_${fname})
+.DELETE_ON_ERROR:
+.PHONY: all clean
diff --git a/README b/README
new file mode 100755
index 0000000..8c0d558
--- /dev/null
+++ b/README
@@ -0,0 +1,144 @@
+ Open Fabrics Enterprise Distribution (OFED)
+ Performance Tests README for OFED 1.3
+
+ March 2008
+
+
+
+===============================================================================
+Table of Contents
+===============================================================================
+1. Overview
+2. Notes on Testing Method
+3. Test Descriptions
+4. Running Tests
+
+===============================================================================
+1. Overview
+===============================================================================
+This is a collection of tests written over uverbs intended for use as a
+performance micro-benchmark. As an example, the tests can be used for
+hardware or software tuning and/or functional testing.
+
+Please post results and observations to the openib-general mailing list.
+See "Contact Us" at http://openib.org/mailman/listinfo/openib-general and
+http://www.openib.org.
+
+
+===============================================================================
+2. Notes on Testing Method
+===============================================================================
+- The benchmark uses the CPU cycle counter to get time stamps without a context
+ switch. Some CPU architectures (e.g., Intel's 80486 or older PPC) do NOT have
+ such capability.
+
+- The benchmark measures round-trip time but reports half of that as one-way
+ latency. This means that it may not be sufficiently accurate for asymmetrical
+ configurations.
+
+- Min/Median/Max results are reported.
+ The Median (vs average) is less sensitive to extreme scores.
+ Typically, the Max value is the first value measured.
+
+- Larger samples only help marginally. The default (1000) is very satisfactory.
+ Note that an array of cycles_t (typically an unsigned long) is allocated
+ once to collect samples and again to store the difference between them.
+ Really big sample sizes (e.g., 1 million) might expose other problems
+ with the program.
+
+- The "-H" option will dump the histogram for additional statistical analysis.
+ See xgraph, ygraph, r-base (http://www.r-project.org/), pspp, or other
+ statistical math programs.
+
+Architectures tested: i686, x86_64, ia64
+
+
+===============================================================================
+3. Test Descriptions
+===============================================================================
+
+
+
+The following tests are mainly useful for hardware/software benchmarking.
+
+write_lat.c latency test with RDMA write transactions
+write_bw.c bandwidth test with RDMA write transactions
+send_lat.c latency test with send transactions
+send_bw.c bandwidth test with send transactions
+read_lat.c latency test with RDMA read transactions
+read_bw.c bandwidth test with RDMA read transactions
+
+
+Legacy tests: (To be removed in the next release)
+rdma_lat.c latency test with RDMA write transactions
+rdma_bw.c streaming bandwidth test with RDMA write transactions
+
+The executable name of each test starts with the general prefix "ib_";
+for example, ib_write_lat.
+
+
+===============================================================================
+4. Running Tests
+===============================================================================
+
+Prerequisites:
+ kernel 2.6
+ ib_uverbs (kernel module) matches libibverbs
+ ("match" means binary compatible, but ideally of the same SVN rev)
+
+Server: ./<test name> <options>
+Client: ./<test name> <options> <server IP address>
+
+ o <server address> is IPv4 or IPv6 address. You can use the IPoIB
+ address if IPoIB is configured.
+ o --help lists the available <options>
+
+ *** IMPORTANT NOTE: The SAME OPTIONS must be passed to both server and client.
+
+
+Common Options to all tests:
+ -p, --port=<port> listen on/connect to port <port> (default: 18515)
+ -m, --mtu=<mtu> mtu size (default: 1024)
+ -d, --ib-dev=<dev> use IB device <dev> (default: first device found)
+ -i, --ib-port=<port> use port <port> of IB device (default: 1)
+ -s, --size=<size> size of message to exchange (default: 1)
+ -a, --all run sizes from 2 till 2^23
+ -t, --tx-depth=<dep> size of tx queue (default: 50)
+ -n, --iters=<iters> number of exchanges (at least 100, default: 1000)
+ -C, --report-cycles report times in cpu cycle units
+ (default: microseconds)
+ -H, --report-histogram print out all results
+ (default: print summary only)
+ -U, --report-unsorted (implies -H) print out unsorted results
+ (default: sorted)
+ -V, --version display version number
+
+ *** IMPORTANT NOTE: You need to be running a Subnet Manager on the switch or
+ on one of the nodes in your fabric.
+
+Example:
+Run "ib_write_lat -a" on the server side.
+Then run "ib_write_lat -a <server IP address>" on the client side.
+
+ib_write_lat will exit on both server and client after printing results.
+
+
diff --git a/clock_test.c b/clock_test.c
new file mode 100755
index 0000000..b585d74
--- /dev/null
+++ b/clock_test.c
@@ -0,0 +1,25 @@
+#include <unistd.h>
+#include <stdio.h>
+#include "get_clock.h"
+
+/*
+ * Calibrate the cycle counter, then forever print how many microseconds
+ * the counter reports for a one-second sleep. Exits with status 2 when
+ * calibration fails.
+ */
+int main()
+{
+	const int no_cpu_freq_fail = 0;
+	double mhz = get_cpu_mhz(no_cpu_freq_fail);
+
+	if (mhz == 0.0) {
+		printf("Unable to calibrate cycles. Exiting.\n");
+		return 2;
+	}
+
+	printf("Type CTRL-C to cancel.\n");
+	while (1) {
+		cycles_t before;
+		cycles_t after;
+
+		before = get_cycles();
+		sleep(1);
+		after = get_cycles();
+		/* elapsed cycles divided by cycles-per-usec gives usec */
+		printf("1 sec = %g usec\n", (after - before) / mhz);
+	}
+}
diff --git a/get_clock.c b/get_clock.c
new file mode 100755
index 0000000..0acb074
--- /dev/null
+++ b/get_clock.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ *
+ * Author: Michael S. Tsirkin <mst at mellanox.co.il>
+ */
+
+/* #define DEBUG 1 */
+/* #define DEBUG_DATA 1 */
+/* #define GET_CPU_MHZ_FROM_PROC 1 */
+
+/* For gettimeofday */
+#define _BSD_SOURCE
+#include <sys/time.h>
+
+#include <unistd.h>
+#include <stdio.h>
+#include "get_clock.h"
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+#ifndef DEBUG_DATA
+#define DEBUG_DATA 0
+#endif
+
+#define MEASUREMENTS 200
+#define USECSTEP 10
+#define USECSTART 100
+
+/*
+ Use linear regression to calculate cycles per microsecond.
+ http://en.wikipedia.org/wiki/Linear_regression#Parameter_estimation
+*/
+/*
+ * Estimate the cycle-counter frequency in cycles per microsecond.
+ *
+ * Collects MEASUREMENTS samples. Sample i busy-waits on gettimeofday()
+ * until at least USECSTART + i * USECSTEP microseconds have elapsed and
+ * records the elapsed microseconds (x) and the elapsed get_cycles()
+ * ticks (y). A least-squares line y = a + b x is fitted to the samples
+ * and the slope b is returned.
+ *
+ * Returns 0 if gettimeofday() fails or if the fit's r^2 is below 0.9.
+ */
+static double sample_get_cpu_mhz(void)
+{
+	struct timeval tv1, tv2;
+	cycles_t start;
+	double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;
+	double tx, ty;
+	int i;
+
+	/* Regression: y = a + b x */
+	long x[MEASUREMENTS];
+	cycles_t y[MEASUREMENTS];
+	double a; /* system call overhead in cycles */
+	double b; /* cycles per microsecond */
+	double r_2;
+
+	for (i = 0; i < MEASUREMENTS; ++i) {
+		start = get_cycles();
+
+		if (gettimeofday(&tv1, NULL)) {
+			fprintf(stderr, "gettimeofday failed.\n");
+			return 0;
+		}
+
+		/* Spin until the target interval for this sample has passed. */
+		do {
+			if (gettimeofday(&tv2, NULL)) {
+				fprintf(stderr, "gettimeofday failed.\n");
+				return 0;
+			}
+		} while ((tv2.tv_sec - tv1.tv_sec) * 1000000 +
+			 (tv2.tv_usec - tv1.tv_usec) < USECSTART + i * USECSTEP);
+
+		x[i] = (tv2.tv_sec - tv1.tv_sec) * 1000000 +
+			tv2.tv_usec - tv1.tv_usec;
+		y[i] = get_cycles() - start;
+		/* %lld, not %Ld: the L modifier is only defined for floating conversions. */
+		if (DEBUG_DATA)
+			fprintf(stderr, "x=%ld y=%lld\n", x[i], (long long)y[i]);
+	}
+
+	/* Accumulate the sums needed by the closed-form least-squares fit. */
+	for (i = 0; i < MEASUREMENTS; ++i) {
+		tx = x[i];
+		ty = y[i];
+		sx += tx;
+		sy += ty;
+		sxx += tx * tx;
+		syy += ty * ty;
+		sxy += tx * ty;
+	}
+
+	b = (MEASUREMENTS * sxy - sx * sy) / (MEASUREMENTS * sxx - sx * sx);
+	a = (sy - b * sx) / MEASUREMENTS;
+
+	if (DEBUG)
+		fprintf(stderr, "a = %g\n", a);
+	if (DEBUG)
+		fprintf(stderr, "b = %g\n", b);
+	if (DEBUG)
+		fprintf(stderr, "a / b = %g\n", a / b);
+	r_2 = (MEASUREMENTS * sxy - sx * sy) * (MEASUREMENTS * sxy - sx * sy) /
+		(MEASUREMENTS * sxx - sx * sx) /
+		(MEASUREMENTS * syy - sy * sy);
+
+	if (DEBUG)
+		fprintf(stderr, "r^2 = %g\n", r_2);
+	/* Reject the estimate when the samples do not lie close to a line. */
+	if (r_2 < 0.9) {
+		fprintf(stderr,"Correlation coefficient r^2: %g < 0.9\n", r_2);
+		return 0;
+	}
+
+	return b;
+}
+
+/*
+ * Read the nominal CPU frequency (MHz) from /proc/cpuinfo.
+ *
+ * Accepts both the "cpu MHz : <value>" line format and the PPC
+ * "clock : <value>" format. The first value found is the result; any
+ * later line with a different value triggers a warning on stderr.
+ *
+ * no_cpu_freq_fail: when non-zero, conflicting values are only warned
+ *                   about; when zero, a conflict makes the call fail.
+ *
+ * Returns the frequency, or 0.0 if the file cannot be opened, no
+ * frequency line is found, or a conflict is fatal.
+ */
+static double proc_get_cpu_mhz(int no_cpu_freq_fail)
+{
+	FILE *f;
+	char buf[256];
+	double mhz = 0.0;
+
+	f = fopen("/proc/cpuinfo", "r");
+	if (!f)
+		return 0.0;
+
+	while (fgets(buf, sizeof(buf), f)) {
+		double m;
+
+		/* PPC has a different format, so try both patterns. */
+		if (sscanf(buf, "cpu MHz : %lf", &m) != 1 &&
+		    sscanf(buf, "clock : %lf", &m) != 1)
+			continue;
+
+		if (mhz == 0.0) {
+			mhz = m;
+			continue;
+		}
+		if (mhz != m) {
+			fprintf(stderr, "Conflicting CPU frequency values"
+				" detected: %lf != %lf\n", mhz, m);
+			if (!no_cpu_freq_fail) {
+				/* Leave through the common exit so the file is closed. */
+				mhz = 0.0;
+				break;
+			}
+			fprintf(stderr, "Test integrity may be harmed !\n");
+		}
+	}
+	fclose(f);
+	return mhz;
+}
+
+
+/*
+ * Return the cycle-counter frequency in MHz.
+ *
+ * Both the measured value (sample_get_cpu_mhz) and the nominal value
+ * from /proc/cpuinfo (proc_get_cpu_mhz) must be available, otherwise 0
+ * is returned. When the two differ by more than 1% of the nominal value
+ * a warning is printed and the measured value is returned; otherwise
+ * the nominal value is returned.
+ */
+double get_cpu_mhz(int no_cpu_freq_fail)
+{
+	const double measured = sample_get_cpu_mhz();
+	const double nominal = proc_get_cpu_mhz(no_cpu_freq_fail);
+	double gap;
+
+	if (nominal == 0.0 || measured == 0.0)
+		return 0;
+
+	if (nominal > measured)
+		gap = nominal - measured;
+	else
+		gap = measured - nominal;
+
+	if (!(gap / nominal > 0.01))
+		return nominal;
+
+	fprintf(stderr, "Warning: measured timestamp frequency "
+		"%g differs from nominal %g MHz\n",
+		measured, nominal);
+	return measured;
+}
diff --git a/get_clock.h b/get_clock.h
new file mode 100755
index 0000000..8985568
--- /dev/null
+++ b/get_clock.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ *
+ * Author: Michael S. Tsirkin <mst at mellanox.co.il>
+ */
+
+#ifndef GET_CLOCK_H
+#define GET_CLOCK_H
+
+/*
+ * Per-architecture definition of cycles_t and get_cycles(), selected by
+ * compiler-predefined macros. Each variant reads a hardware counter
+ * with a single inline-asm instruction.
+ */
+#if defined (__x86_64__) || defined(__i386__)
+/* Note: only x86 CPUs which have rdtsc instruction are supported. */
+typedef unsigned long long cycles_t;
+/* Read the counter with rdtsc and return it as one 64-bit value. */
+static inline cycles_t get_cycles()
+{
+ unsigned low, high;
+ unsigned long long val;
+ /* rdtsc leaves its result in two 32-bit halves: "=a" -> low, "=d" -> high */
+ asm volatile ("rdtsc" : "=a" (low), "=d" (high));
+ val = high;
+ val = (val << 32) | low;
+ return val;
+}
+#elif defined(__PPC__) || defined(__PPC64__)
+/* Note: only PPC CPUs which have mftb instruction are supported. */
+/* PPC64 has mftb */
+typedef unsigned long cycles_t;
+/* Return the value produced by the mftb instruction. */
+static inline cycles_t get_cycles()
+{
+ cycles_t ret;
+
+ asm volatile ("mftb %0" : "=r" (ret) : );
+ return ret;
+}
+#elif defined(__ia64__)
+/* Itanium2 and up has ar.itc (Itanium1 has errata) */
+typedef unsigned long cycles_t;
+/* Return the value of the ar.itc application register. */
+static inline cycles_t get_cycles()
+{
+ cycles_t ret;
+
+ asm volatile ("mov %0=ar.itc" : "=r" (ret));
+ return ret;
+}
+
+#else
+/*
+ * No local implementation: warn at compile time and fall back to the
+ * kernel header. NOTE(review): presumably <asm/timex.h> supplies
+ * cycles_t and get_cycles() on such platforms - confirm per target.
+ */
+#warning get_cycles not implemented for this architecture: attempt asm/timex.h
+#include <asm/timex.h>
+#endif
+
+extern double get_cpu_mhz(int);
+
+#endif
diff --git a/perftest.spec b/perftest.spec
new file mode 100755
index 0000000..079dc7a
--- /dev/null
+++ b/perftest.spec
@@ -0,0 +1,53 @@
+Name: perftest
+Summary: IB Performance tests
+Version: 1.2
+Release: 1.ofed1.4.2
+License: BSD 3-Clause, GPL v2 or later
+Group: Productivity/Networking/Diagnostic
+Source: http://www.openfabrics.org/downloads/perftest-1.2.tar.gz
+Url: http://www.openfabrics.org
+BuildRoot: %{_tmppath}/%{name}-%{version}-build
+BuildRequires: libibverbs-devel librdmacm-devel
+
+%description
+gen2 uverbs microbenchmarks
+
+
+
+%prep
+%setup -q
+
+%build
+export CFLAGS="$RPM_OPT_FLAGS"
+%{__make}
+chmod -x runme
+
+%install
+install -D -m 0755 ib_rdma_lat $RPM_BUILD_ROOT%{_bindir}/ib_rdma_lat
+install -D -m 0755 ib_rdma_bw $RPM_BUILD_ROOT%{_bindir}/ib_rdma_bw
+install -D -m 0755 ib_write_lat $RPM_BUILD_ROOT%{_bindir}/ib_write_lat
+install -D -m 0755 ib_write_bw $RPM_BUILD_ROOT%{_bindir}/ib_write_bw
+install -D -m 0755 ib_send_lat $RPM_BUILD_ROOT%{_bindir}/ib_send_lat
+install -D -m 0755 ib_send_bw $RPM_BUILD_ROOT%{_bindir}/ib_send_bw
+install -D -m 0755 ib_read_lat $RPM_BUILD_ROOT%{_bindir}/ib_read_lat
+install -D -m 0755 ib_read_bw $RPM_BUILD_ROOT%{_bindir}/ib_read_bw
+install -D -m 0755 ib_write_bw_postlist $RPM_BUILD_ROOT%{_bindir}/ib_write_bw_postlist
+install -D -m 0755 ib_clock_test $RPM_BUILD_ROOT%{_bindir}/ib_clock_test
+
+%clean
+rm -rf ${RPM_BUILD_ROOT}
+
+%files
+%defattr(-, root, root)
+%doc README COPYING runme
+%_bindir/*
+
+%changelog
+* Mon Jul 09 2007 - hvogel at suse.de
+- Use correct version
+* Wed Jul 04 2007 - hvogel at suse.de
+- Add GPL COPYING file [#289509]
+* Mon Jul 02 2007 - hvogel at suse.de
+- Update to the OFED 1.2 version
+* Fri Jun 22 2007 - hvogel at suse.de
+- Initial Package, Version 1.1
diff --git a/rdma_bw.c b/rdma_bw.c
new file mode 100755
index 0000000..5c3f462
--- /dev/null
+++ b/rdma_bw.c
@@ -0,0 +1,1276 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include "get_clock.h"
+
+#define PINGPONG_RDMA_WRID 3
+
+static int sl = 0;
+static int page_size;
+static pid_t pid;
+
+/*
+ * Verbs resources for one test connection; allocated and filled in by
+ * pp_init_ctx().
+ */
+struct pingpong_context {
+ struct ibv_context *context; /* cm_id->verbs in CMA mode, else ibv_open_device() result */
+ struct ibv_pd *pd; /* protection domain from ibv_alloc_pd() */
+ struct ibv_mr *mr; /* registration of buf (size * 2 bytes, local + remote write) */
+ struct ibv_cq *rcq; /* NOTE(review): presumably the receive CQ - set outside the visible code */
+ struct ibv_cq *scq; /* NOTE(review): presumably the send CQ - set outside the visible code */
+ struct ibv_qp *qp; /* NOTE(review): queue pair - creation is outside the visible code */
+ struct ibv_comp_channel *ch; /* completion channel from ibv_create_comp_channel() */
+ void *buf; /* page-aligned, zeroed work buffer of size * 2 bytes */
+ unsigned size; /* copied from pp_data.size */
+ int tx_depth; /* copied from pp_data.tx_depth */
+ struct ibv_sge list; /* NOTE(review): presumably the SGE used by wr - confirm */
+ struct ibv_send_wr wr; /* NOTE(review): presumably the reusable send work request - confirm */
+};
+
+/*
+ * Addressing information one side hands to its peer. It travels either
+ * as RDMA CM private data (CMA mode) or as the hex text line
+ * "lid:qpn:psn:rkey:vaddr" over the TCP socket.
+ */
+struct pingpong_dest {
+ int lid; /* port LID; NOTE(review): assignment is outside the visible code */
+ int qpn; /* queue pair number; set to 0 in CMA mode */
+ int psn; /* 24-bit value taken from lrand48() in CMA mode */
+ unsigned rkey; /* rkey of the registered buffer (ctx->mr->rkey) */
+ unsigned long long vaddr; /* address of the second half of the buffer (buf + size) */
+};
+
+/*
+ * Test parameters plus the connection-setup state shared by the client
+ * and server connect/exchange routines.
+ */
+struct pp_data {
+ int port; /* port number used for getaddrinfo() and the RDMA CM address */
+ int ib_port; /* NOTE(review): presumably the IB device port - not used in the visible code */
+ unsigned size; /* message size; copied into pingpong_context.size */
+ int tx_depth; /* copied into pingpong_context.tx_depth */
+ int use_cma; /* non-zero: connect through RDMA CM; zero: through a TCP socket */
+ int sockfd; /* connected TCP socket (non-CMA mode only) */
+ char *servername; /* host passed to getaddrinfo() by the client */
+ struct pingpong_dest my_dest; /* local addressing info sent to the peer */
+ struct pingpong_dest *rem_dest; /* malloc'ed copy of the peer's addressing info */
+ struct ibv_device *ib_dev; /* device handed to pp_init_ctx() in non-CMA mode */
+ struct rdma_event_channel *cm_channel; /* event channel polled with rdma_get_cm_event() */
+ struct rdma_cm_id *cm_id; /* CM id used to resolve/connect (client) or bind/listen (server) */
+
+};
+
+static void pp_post_recv(struct pingpong_context *);
+static void pp_wait_for_done(struct pingpong_context *);
+static void pp_send_done(struct pingpong_context *);
+static void pp_wait_for_start(struct pingpong_context *);
+static void pp_send_start(struct pingpong_context *);
+static void pp_close_cma(struct pp_data );
+static struct pingpong_context *pp_init_ctx(void *, struct pp_data *);
+
+/*
+ * Query the given port of the context's device and return its LID.
+ * Returns 0 when ibv_query_port() reports failure.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+	struct ibv_port_attr port_attr;
+	int rc;
+
+	rc = ibv_query_port(ctx->context, port, &port_attr);
+	if (rc != 0)
+		return 0;
+	return port_attr.lid;
+}
+
+/*
+ * Client side of connection establishment.
+ *
+ * Resolves data->servername / data->port, then either
+ *  - CMA mode (data->use_cma): resolves address and route through the
+ *    RDMA CM, creates the context with pp_init_ctx(), connects while
+ *    passing data->my_dest as private data, and stores a malloc'ed copy
+ *    of the server's private data in data->rem_dest; or
+ *  - socket mode: opens a TCP connection to the first address that
+ *    accepts it, creates the context, and stores the socket in
+ *    data->sockfd.
+ *
+ * Returns the new context, or NULL on failure. On a CMA failure the
+ * cm_id and event channel in *data are destroyed before returning.
+ */
+static struct pingpong_context *pp_client_connect(struct pp_data *data)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int n;
+	int sockfd = -1;
+	int n_retries = 10;
+	struct rdma_cm_event *event;
+	struct sockaddr_in sin;
+	struct pingpong_context *ctx = NULL;
+	struct rdma_conn_param conn_param;
+
+	if (asprintf(&service, "%d", data->port) < 0)
+		goto err4;
+
+	n = getaddrinfo(data->servername, service, &hints, &res);
+	/* The port string is only needed for the lookup itself. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%d:%s: %s for %s:%d\n",
+			pid, __func__, gai_strerror(n),
+			data->servername, data->port);
+		goto err4;
+	}
+
+	if (data->use_cma) {
+		/* NOTE(review): treats the first result as IPv4 although hints allow any family. */
+		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
+		sin.sin_family = AF_INET;
+		sin.sin_port = htons(data->port);
+retry_addr:
+		if (rdma_resolve_addr(data->cm_id, NULL,
+				      (struct sockaddr *)&sin, 2000)) {
+			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
+				pid, __func__ );
+			goto err2;
+		}
+
+		if (rdma_get_cm_event(data->cm_channel, &event))
+			goto err2;
+
+		/* The retry budget is shared by address and route resolution. */
+		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
+		    && n_retries-- > 0) {
+			rdma_ack_cm_event (event);
+			goto retry_addr;
+		}
+
+		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
+			fprintf(stderr, "%d:%s: unexpected CM event %d\n",
+				pid, __func__, event->event);
+			goto err1;
+		}
+		rdma_ack_cm_event(event);
+
+retry_route:
+		if (rdma_resolve_route(data->cm_id, 2000)) {
+			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n",
+				pid, __func__);
+			goto err2;
+		}
+
+		if (rdma_get_cm_event(data->cm_channel, &event))
+			goto err2;
+
+		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
+		    && n_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_route;
+		}
+
+		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
+			fprintf(stderr, "%d:%s: unexpected CM event %d\n",
+				pid, __func__, event->event);
+			/* err1 acks the event; acking here as well would ack it twice. */
+			goto err1;
+		}
+		rdma_ack_cm_event(event);
+		ctx = pp_init_ctx(data->cm_id, data);
+		if (!ctx) {
+			fprintf(stderr, "%d:%s: pp_init_ctx failed\n", pid, __func__);
+			goto err2;
+		}
+		data->my_dest.psn = lrand48() & 0xffffff;
+		data->my_dest.qpn = 0;
+		data->my_dest.rkey = ctx->mr->rkey;
+		data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+		/* Local destination info travels to the server as CM private data. */
+		memset(&conn_param, 0, sizeof conn_param);
+		conn_param.responder_resources = 1;
+		conn_param.initiator_depth = 1;
+		conn_param.retry_count = 5;
+		conn_param.private_data = &data->my_dest;
+		conn_param.private_data_len = sizeof(data->my_dest);
+
+		if (rdma_connect(data->cm_id, &conn_param)) {
+			fprintf(stderr, "%d:%s: rdma_connect failure\n", pid, __func__);
+			goto err2;
+		}
+
+		if (rdma_get_cm_event(data->cm_channel, &event))
+			goto err2;
+
+		if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
+			fprintf(stderr, "%d:%s: unexpected CM event %d\n",
+				pid, __func__, event->event);
+			goto err1;
+		}
+		if (!event->param.conn.private_data ||
+		    (event->param.conn.private_data_len < sizeof(*data->rem_dest))) {
+			fprintf(stderr, "%d:%s: bad private data ptr %p len %d\n",
+				pid, __func__, event->param.conn.private_data,
+				event->param.conn.private_data_len);
+			goto err1;
+		}
+		data->rem_dest = malloc(sizeof *data->rem_dest);
+		if (!data->rem_dest)
+			goto err1;
+
+		/* Copy before the ack: the private data belongs to the event. */
+		memcpy(data->rem_dest, event->param.conn.private_data, sizeof(*data->rem_dest));
+		rdma_ack_cm_event(event);
+
+	} else {
+		/* Try each resolved address until one accepts the TCP connection. */
+		for (t = res; t; t = t->ai_next) {
+			sockfd = socket(t->ai_family, t->ai_socktype,
+					t->ai_protocol);
+			if (sockfd >= 0) {
+				if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+					break;
+				close(sockfd);
+				sockfd = -1;
+			}
+		}
+		if (sockfd < 0) {
+			fprintf(stderr, "%d:%s: Couldn't connect to %s:%d\n",
+				pid, __func__, data->servername, data->port);
+			goto err3;
+		}
+		ctx = pp_init_ctx(data->ib_dev, data);
+		if (!ctx) {
+			/* Nobody else knows about the socket yet; do not leak it. */
+			close(sockfd);
+			goto err3;
+		}
+		data->sockfd = sockfd;
+	}
+
+	freeaddrinfo(res);
+	return ctx;
+
+err1:
+	rdma_ack_cm_event(event);
+err2:
+	rdma_destroy_id(data->cm_id);
+	rdma_destroy_event_channel(data->cm_channel);
+err3:
+	freeaddrinfo(res);
+err4:
+	return NULL;
+
+}
+
+/*
+ * Socket-mode exchange of destination info, client side: send
+ * data->my_dest as the hex line "lid:qpn:psn:rkey:vaddr", then read the
+ * server's line and parse it into a freshly allocated data->rem_dest
+ * (any previous rem_dest is freed first).
+ *
+ * Does nothing in CMA mode, where the exchange happens through CM
+ * private data during connect.
+ *
+ * Returns 0 on success, 1 on failure. After a parse failure
+ * data->rem_dest is NULL.
+ */
+static int pp_client_exch_dest(struct pp_data *data)
+{
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000"];
+	int parsed;
+
+	if (data->use_cma)
+		return 0;
+
+	/* Bounded formatting: an out-of-range field must not overrun msg. */
+	snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx",
+		 data->my_dest.lid, data->my_dest.qpn, data->my_dest.psn,
+		 data->my_dest.rkey, data->my_dest.vaddr);
+	if (write(data->sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client write");
+		fprintf(stderr, "%d:%s: Couldn't send local address\n",
+			pid, __func__);
+		return 1;
+	}
+
+	if (read(data->sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client read");
+		fprintf(stderr, "%d:%s: Couldn't read remote address\n",
+			pid, __func__);
+		return 1;
+	}
+	/* The peer controls these bytes; guarantee termination before parsing. */
+	msg[sizeof msg - 1] = '\0';
+
+	free(data->rem_dest);
+	data->rem_dest = malloc(sizeof *data->rem_dest);
+	if (!data->rem_dest)
+		return 1;
+
+	parsed = sscanf(msg, "%x:%x:%x:%x:%llx", &data->rem_dest->lid,
+			&data->rem_dest->qpn, &data->rem_dest->psn,
+			&data->rem_dest->rkey, &data->rem_dest->vaddr);
+
+	if (parsed != 5) {
+		fprintf(stderr, "%d:%s: Couldn't parse line <%.*s>\n",
+			pid, __func__, (int)sizeof msg, msg);
+		free(data->rem_dest);
+		/* Avoid leaving a dangling pointer for later frees. */
+		data->rem_dest = NULL;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Server side of connection establishment.
+ *
+ *  - CMA mode (data->use_cma): binds and listens on data->cm_id, waits
+ *    for a connect request, copies the client's private data into a
+ *    malloc'ed data->rem_dest, creates the context on the child cm_id,
+ *    accepts with data->my_dest as private data, and waits for the
+ *    ESTABLISHED event.
+ *  - socket mode: binds a TCP socket on data->port, accepts a single
+ *    connection, creates the context, and stores the accepted socket in
+ *    data->sockfd.
+ *
+ * Returns the new context, or NULL on failure. On a CMA failure the
+ * listening cm_id and the event channel in *data are destroyed.
+ */
+static struct pingpong_context *pp_server_connect(struct pp_data *data)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags    = AI_PASSIVE,
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int sockfd = -1, connfd;
+	int n;
+	struct rdma_cm_event *event;
+	struct sockaddr_in sin;
+	struct pingpong_context *ctx = NULL;
+	struct rdma_cm_id *child_cm_id;
+	struct rdma_conn_param conn_param;
+
+	if (asprintf(&service, "%d", data->port) < 0)
+		goto err5;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+	/* The port string is only needed for the lookup itself. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%d:%s: %s for port %d\n", pid, __func__,
+			gai_strerror(n), data->port);
+		goto err5;
+	}
+
+	if (data->use_cma) {
+		sin.sin_addr.s_addr = 0;
+		sin.sin_family = AF_INET;
+		sin.sin_port = htons(data->port);
+		if (rdma_bind_addr(data->cm_id, (struct sockaddr *)&sin)) {
+			fprintf(stderr, "%d:%s: rdma_bind_addr failed\n", pid, __func__);
+			goto err3;
+		}
+
+		if (rdma_listen(data->cm_id, 0)) {
+			fprintf(stderr, "%d:%s: rdma_listen failed\n", pid, __func__);
+			goto err3;
+		}
+
+		if (rdma_get_cm_event(data->cm_channel, &event))
+			goto err3;
+
+		if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
+			fprintf(stderr, "%d:%s: bad event waiting for connect request %d\n",
+				pid, __func__, event->event);
+			goto err2;
+		}
+
+		if (!event->param.conn.private_data ||
+		    (event->param.conn.private_data_len < sizeof(*data->rem_dest))) {
+			fprintf(stderr, "%d:%s: bad private data len %d\n", pid,
+				__func__, event->param.conn.private_data_len);
+			goto err2;
+		}
+
+		data->rem_dest = malloc(sizeof *data->rem_dest);
+		if (!data->rem_dest)
+			goto err2;
+
+		/* Copy before the ack: the private data belongs to the event. */
+		memcpy(data->rem_dest, event->param.conn.private_data, sizeof(*data->rem_dest));
+
+		/* The connect request carries the id of the new connection. */
+		child_cm_id = (struct rdma_cm_id *)event->id;
+		ctx = pp_init_ctx(child_cm_id, data);
+		if (!ctx) {
+			free(data->rem_dest);
+			/* Avoid leaving a dangling pointer for later frees. */
+			data->rem_dest = NULL;
+			goto err1;
+		}
+		data->my_dest.psn = lrand48() & 0xffffff;
+		data->my_dest.qpn = 0;
+		data->my_dest.rkey = ctx->mr->rkey;
+		data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+		memset(&conn_param, 0, sizeof conn_param);
+		conn_param.responder_resources = 1;
+		conn_param.initiator_depth = 1;
+		conn_param.private_data = &data->my_dest;
+		conn_param.private_data_len = sizeof(data->my_dest);
+		if (rdma_accept(child_cm_id, &conn_param)) {
+			fprintf(stderr, "%d:%s: rdma_accept failed\n", pid, __func__);
+			goto err1;
+		}
+		rdma_ack_cm_event(event);
+		if (rdma_get_cm_event(data->cm_channel, &event)) {
+			fprintf(stderr, "%d:%s: rdma_get_cm_event error\n", pid, __func__);
+			rdma_destroy_id(child_cm_id);
+			goto err3;
+		}
+		if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
+			fprintf(stderr, "%d:%s: bad event waiting for established %d\n",
+				pid, __func__, event->event);
+			goto err1;
+		}
+		rdma_ack_cm_event(event);
+	} else {
+		/* Bind to the first local address that works. */
+		for (t = res; t; t = t->ai_next) {
+			sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+			if (sockfd >= 0) {
+				n = 1;
+
+				setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+				if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+					break;
+				close(sockfd);
+				sockfd = -1;
+			}
+		}
+
+		if (sockfd < 0) {
+			fprintf(stderr, "%d:%s: Couldn't listen to port %d\n", pid,
+				__func__, data->port);
+			goto err4;
+		}
+
+		listen(sockfd, 1);
+		connfd = accept(sockfd, NULL, 0);
+		if (connfd < 0) {
+			perror("server accept");
+			fprintf(stderr, "%d:%s: accept() failed\n", pid, __func__);
+			close(sockfd);
+			goto err4;
+		}
+
+		/* Only one client is served; the listening socket is done. */
+		close(sockfd);
+
+		ctx = pp_init_ctx(data->ib_dev, data);
+		if (!ctx) {
+			/* Nobody else knows about the accepted socket yet; do not leak it. */
+			close(connfd);
+			goto err4;
+		}
+		data->sockfd = connfd;
+	}
+	freeaddrinfo(res);
+	return ctx;
+
+err1:
+	/* Pending events must be acked before the id they refer to is destroyed. */
+	rdma_ack_cm_event(event);
+	rdma_destroy_id(child_cm_id);
+	goto err3;
+err2:
+	rdma_ack_cm_event(event);
+err3:
+	rdma_destroy_id(data->cm_id);
+	rdma_destroy_event_channel(data->cm_channel);
+err4:
+	freeaddrinfo(res);
+err5:
+	return NULL;
+
+}
+
+/*
+ * Socket-mode exchange of destination info, server side: read the
+ * client's hex line "lid:qpn:psn:rkey:vaddr", parse it into a freshly
+ * allocated data->rem_dest (any previous rem_dest is freed first), then
+ * send data->my_dest back in the same format.
+ *
+ * Does nothing in CMA mode, where the exchange happens through CM
+ * private data during connect.
+ *
+ * Returns 0 on success, 1 on failure. After a parse or send failure
+ * data->rem_dest is NULL.
+ */
+static int pp_server_exch_dest(struct pp_data *data)
+{
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000"];
+	int parsed;
+	int n;
+
+	if (data->use_cma)
+		return 0;
+
+	n = read(data->sockfd, msg, sizeof msg);
+	if (n != sizeof msg) {
+		perror("server read");
+		fprintf(stderr, "%d:%s: %d/%d Couldn't read remote address\n",
+			pid, __func__, n, (int) sizeof msg);
+		return 1;
+	}
+	/* The peer controls these bytes; guarantee termination before parsing. */
+	msg[sizeof msg - 1] = '\0';
+
+	free(data->rem_dest);
+	data->rem_dest = malloc(sizeof *data->rem_dest);
+	if (!data->rem_dest)
+		return 1;
+
+	parsed = sscanf(msg, "%x:%x:%x:%x:%llx", &data->rem_dest->lid,
+			&data->rem_dest->qpn, &data->rem_dest->psn,
+			&data->rem_dest->rkey, &data->rem_dest->vaddr);
+	if (parsed != 5) {
+		fprintf(stderr, "%d:%s: Couldn't parse line <%.*s>\n", pid,
+			__func__, (int)sizeof msg, msg);
+		goto err_free;
+	}
+
+	/* Bounded formatting: an out-of-range field must not overrun msg. */
+	snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx",
+		 data->my_dest.lid, data->my_dest.qpn, data->my_dest.psn,
+		 data->my_dest.rkey, data->my_dest.vaddr);
+	if (write(data->sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("server write");
+		fprintf(stderr, "%d:%s: Couldn't send local address\n",
+			pid, __func__);
+		goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	free(data->rem_dest);
+	/* Avoid leaving a dangling pointer for later frees. */
+	data->rem_dest = NULL;
+	return 1;
+}
+
+/*
+ * Allocate a pingpong_context and create every verbs object the test
+ * needs: work buffer, protection domain, memory region, completion
+ * channel, receive/send CQs and an RC queue pair.
+ *
+ * ptr  - a struct rdma_cm_id * when data->use_cma is set, otherwise a
+ *        struct ibv_device * that is opened here.
+ * data - test parameters (size, tx_depth, ib_port, use_cma).
+ *
+ * With RDMA CM the QP is created on the cm_id and one receive is posted;
+ * otherwise the QP is created directly and moved to the INIT state.
+ *
+ * Returns the new context, or NULL on failure.  On failure everything
+ * acquired up to that point is released again.
+ */
+static struct pingpong_context *pp_init_ctx(void *ptr, struct pp_data *data)
+{
+	struct pingpong_context *ctx;
+	struct ibv_device *ib_dev;
+	struct rdma_cm_id *cm_id = NULL;
+	struct ibv_qp_init_attr init_attr;
+	struct ibv_qp_attr attr;
+
+	ctx = malloc(sizeof *ctx);
+	if (!ctx)
+		return NULL;
+
+	ctx->size = data->size;
+	ctx->tx_depth = data->tx_depth;
+
+	ctx->buf = memalign(page_size, ctx->size * 2);
+	if (!ctx->buf) {
+		fprintf(stderr, "%d:%s: Couldn't allocate work buf.\n",
+			pid, __func__);
+		goto err_free_ctx;
+	}
+
+	memset(ctx->buf, 0, ctx->size * 2);
+
+	if (data->use_cma) {
+		cm_id = (struct rdma_cm_id *)ptr;
+		ctx->context = cm_id->verbs;
+		if (!ctx->context) {
+			fprintf(stderr, "%d:%s: Unbound cm_id!!\n", pid,
+				__func__);
+			goto err_free_buf;
+		}
+	} else {
+		ib_dev = (struct ibv_device *)ptr;
+		ctx->context = ibv_open_device(ib_dev);
+		if (!ctx->context) {
+			fprintf(stderr, "%d:%s: Couldn't get context for %s\n",
+				pid, __func__, ibv_get_device_name(ib_dev));
+			goto err_free_buf;
+		}
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "%d:%s: Couldn't allocate PD\n", pid, __func__);
+		goto err_close_dev;
+	}
+
+	/* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
+	 * The Consumer is not allowed to assign Remote Write or Remote Atomic to
+	 * a Memory Region that has not been assigned Local Write. */
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, ctx->size * 2,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+	if (!ctx->mr) {
+		fprintf(stderr, "%d:%s: Couldn't allocate MR\n", pid, __func__);
+		goto err_dealloc_pd;
+	}
+
+	ctx->ch = ibv_create_comp_channel(ctx->context);
+	if (!ctx->ch) {
+		fprintf(stderr, "%d:%s: Couldn't create comp channel\n", pid,
+			__func__);
+		goto err_dereg_mr;
+	}
+
+	ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0);
+	if (!ctx->rcq) {
+		fprintf(stderr, "%d:%s: Couldn't create recv CQ\n", pid,
+			__func__);
+		goto err_destroy_ch;
+	}
+
+	ctx->scq = ibv_create_cq(ctx->context, ctx->tx_depth, ctx, ctx->ch, 0);
+	if (!ctx->scq) {
+		fprintf(stderr, "%d:%s: Couldn't create send CQ\n", pid,
+			__func__);
+		goto err_destroy_rcq;
+	}
+
+	memset(&init_attr, 0, sizeof init_attr);
+	init_attr.send_cq = ctx->scq;
+	init_attr.recv_cq = ctx->rcq;
+	init_attr.cap.max_send_wr = ctx->tx_depth;
+	/* Work around: driver doesnt support
+	 * recv_wr = 0 */
+	init_attr.cap.max_recv_wr = 1;
+	init_attr.cap.max_send_sge = 1;
+	init_attr.cap.max_recv_sge = 1;
+	init_attr.cap.max_inline_data = 0;
+	init_attr.qp_type = IBV_QPT_RC;
+
+	if (data->use_cma) {
+		if (rdma_create_qp(cm_id, ctx->pd, &init_attr)) {
+			fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__);
+			goto err_destroy_scq;
+		}
+		ctx->qp = cm_id->qp;
+		pp_post_recv(ctx);
+		return ctx;
+	}
+
+	ctx->qp = ibv_create_qp(ctx->pd, &init_attr);
+	if (!ctx->qp) {
+		fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__);
+		goto err_destroy_scq;
+	}
+
+	memset(&attr, 0, sizeof attr);
+	attr.qp_state = IBV_QPS_INIT;
+	attr.pkey_index = 0;
+	attr.port_num = data->ib_port;
+	attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
+
+	if (ibv_modify_qp(ctx->qp, &attr,
+			  IBV_QP_STATE |
+			  IBV_QP_PKEY_INDEX |
+			  IBV_QP_PORT |
+			  IBV_QP_ACCESS_FLAGS)) {
+		fprintf(stderr, "%d:%s: Failed to modify QP to INIT\n",
+			pid, __func__);
+		ibv_destroy_qp(ctx->qp);
+		goto err_destroy_scq;
+	}
+
+	return ctx;
+
+	/* Unwind in reverse order of acquisition. */
+err_destroy_scq:
+	ibv_destroy_cq(ctx->scq);
+err_destroy_rcq:
+	ibv_destroy_cq(ctx->rcq);
+err_destroy_ch:
+	ibv_destroy_comp_channel(ctx->ch);
+err_dereg_mr:
+	ibv_dereg_mr(ctx->mr);
+err_dealloc_pd:
+	ibv_dealloc_pd(ctx->pd);
+err_close_dev:
+	/* With RDMA CM the verbs context was taken from the cm_id and was
+	 * not opened here, so it must not be closed here either. */
+	if (!data->use_cma)
+		ibv_close_device(ctx->context);
+err_free_buf:
+	free(ctx->buf);
+err_free_ctx:
+	free(ctx);
+	return NULL;
+}
+
+/*
+ * Bring the queue pair up for traffic: first to RTR using the peer's
+ * LID/QPN/PSN from data.rem_dest, then to RTS using the local PSN from
+ * data.my_dest.
+ *
+ * Returns 0 on success, 1 if either ibv_modify_qp() call fails.
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, struct pp_data data)
+{
+	const int rtr_mask = IBV_QP_STATE |
+			     IBV_QP_AV |
+			     IBV_QP_PATH_MTU |
+			     IBV_QP_DEST_QPN |
+			     IBV_QP_RQ_PSN |
+			     IBV_QP_MAX_DEST_RD_ATOMIC |
+			     IBV_QP_MIN_RNR_TIMER;
+	const int rts_mask = IBV_QP_STATE |
+			     IBV_QP_TIMEOUT |
+			     IBV_QP_RETRY_CNT |
+			     IBV_QP_RNR_RETRY |
+			     IBV_QP_SQ_PSN |
+			     IBV_QP_MAX_QP_RD_ATOMIC;
+	struct ibv_qp_attr qp_attr;
+
+	memset(&qp_attr, 0, sizeof qp_attr);
+
+	/* Receive side: address the remote QP. */
+	qp_attr.qp_state = IBV_QPS_RTR;
+	qp_attr.path_mtu = IBV_MTU_2048;
+	qp_attr.dest_qp_num = data.rem_dest->qpn;
+	qp_attr.rq_psn = data.rem_dest->psn;
+	qp_attr.max_dest_rd_atomic = 1;
+	qp_attr.min_rnr_timer = 12;
+	qp_attr.ah_attr.is_global = 0;
+	qp_attr.ah_attr.dlid = data.rem_dest->lid;
+	qp_attr.ah_attr.sl = sl;
+	qp_attr.ah_attr.src_path_bits = 0;
+	qp_attr.ah_attr.port_num = data.ib_port;
+
+	if (ibv_modify_qp(ctx->qp, &qp_attr, rtr_mask) != 0) {
+		fprintf(stderr, "%d:%s: Failed to modify QP to RTR\n", pid, __func__);
+		return 1;
+	}
+
+	/* Send side: timeouts, retry budget and our starting PSN. */
+	qp_attr.qp_state = IBV_QPS_RTS;
+	qp_attr.timeout = 14;
+	qp_attr.retry_cnt = 7;
+	qp_attr.rnr_retry = 7;
+	qp_attr.sq_psn = data.my_dest.psn;
+	qp_attr.max_rd_atomic = 1;
+
+	if (ibv_modify_qp(ctx->qp, &qp_attr, rts_mask) != 0) {
+		fprintf(stderr, "%d:%s: Failed to modify QP to RTS\n", pid, __func__);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Post a single one-byte receive (wr_id 0xdeadbeef) that points at the
+ * start of ctx->buf.  A failure is only reported on stderr; nothing is
+ * returned to the caller.
+ */
+static void pp_post_recv(struct pingpong_context *ctx)
+{
+	struct ibv_recv_wr *failed_wr;
+	struct ibv_sge sge = {
+		.addr   = (uintptr_t) ctx->buf,
+		.length = 1,
+		.lkey   = ctx->mr->lkey
+	};
+	struct ibv_recv_wr recv_wr = {
+		.wr_id   = 0xdeadbeef,
+		.next    = NULL,
+		.sg_list = &sge,
+		.num_sge = 1
+	};
+	int err;
+
+	err = ibv_post_recv(ctx->qp, &recv_wr, &failed_wr);
+	if (!err)
+		return;
+
+	perror("ibv_post_recv");
+	fprintf(stderr, "%d:%s: ibv_post_recv failed %d\n", pid,
+		__func__, err);
+}
+
+/*
+ * Poll the receive CQ (sleeping 500us before each poll) until one
+ * completion is available, then sanity-check its status, opcode and
+ * wr_id (0xdeadbeef is the id used by pp_post_recv()).
+ *
+ * Problems are reported on stderr only.  If ibv_poll_cq() itself fails
+ * the function returns right away, because the work completion was
+ * never filled in and must not be inspected.
+ */
+static void pp_wait_for_done(struct pingpong_context *ctx)
+{
+	struct ibv_wc wc;
+	int ne;
+
+	do {
+		usleep(500);
+		ne = ibv_poll_cq(ctx->rcq, 1, &wc);
+	} while (ne == 0);
+
+	if (ne < 0) {
+		fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid,
+			__func__, ne);
+		return;
+	}
+
+	if (wc.status)
+		fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__,
+			wc.status);
+	if (!(wc.opcode & IBV_WC_RECV))
+		fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__,
+			wc.opcode);
+	if (wc.wr_id != 0xdeadbeef)
+		fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__,
+			(int)wc.wr_id);
+}
+
+/*
+ * Send a one-byte signalled SEND (wr_id 0xcafebabe) from the start of
+ * ctx->buf and wait for its completion on the send CQ, polling every
+ * 500us.
+ *
+ * Problems are reported on stderr only.  If posting fails, or if
+ * ibv_poll_cq() itself fails, the function returns without looking at
+ * the (unfilled) work completion.
+ */
+static void pp_send_done(struct pingpong_context *ctx)
+{
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+	int ne;
+
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = 1;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id = 0xcafebabe;
+	ctx->wr.sg_list = &ctx->list;
+	ctx->wr.num_sge = 1;
+	ctx->wr.opcode = IBV_WR_SEND;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next = NULL;
+	if (ibv_post_send(ctx->qp, &ctx->wr, &bad_wr)) {
+		fprintf(stderr, "%d:%s: ibv_post_send failed\n", pid, __func__);
+		return;
+	}
+	do {
+		usleep(500);
+		ne = ibv_poll_cq(ctx->scq, 1, &wc);
+	} while (ne == 0);
+
+	if (ne < 0) {
+		fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid,
+			__func__, ne);
+		return;
+	}
+
+	if (wc.status)
+		fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__,
+			wc.status);
+	if (wc.opcode != IBV_WC_SEND)
+		fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__,
+			wc.opcode);
+	if (wc.wr_id != 0xcafebabe)
+		fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__,
+			(int)wc.wr_id);
+}
+
+/*
+ * Poll the receive CQ (sleeping 500us before each poll) until the
+ * peer's start message completes, sanity-check the completion and then
+ * post a fresh receive with pp_post_recv().
+ *
+ * Problems are reported on stderr only.  If ibv_poll_cq() itself fails
+ * the function returns right away: the work completion was never
+ * filled in, and no receive was consumed that would need replacing.
+ */
+static void pp_wait_for_start(struct pingpong_context *ctx)
+{
+	struct ibv_wc wc;
+	int ne;
+
+	do {
+		usleep(500);
+		ne = ibv_poll_cq(ctx->rcq, 1, &wc);
+	} while (ne == 0);
+
+	if (ne < 0) {
+		fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid,
+			__func__, ne);
+		return;
+	}
+
+	if (wc.status)
+		fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__,
+			wc.status);
+	if (!(wc.opcode & IBV_WC_RECV))
+		fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__,
+			wc.opcode);
+	if (wc.wr_id != 0xdeadbeef)
+		fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__,
+			(int)wc.wr_id);
+	pp_post_recv(ctx);
+}
+
+/*
+ * Send a one-byte signalled SEND (wr_id 0xabbaabba) from the start of
+ * ctx->buf and wait for its completion on the send CQ, polling every
+ * 500us.
+ *
+ * Problems are reported on stderr only.  If posting fails, or if
+ * ibv_poll_cq() itself fails, the function returns without looking at
+ * the (unfilled) work completion.
+ */
+static void pp_send_start(struct pingpong_context *ctx)
+{
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+	int ne;
+
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = 1;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id = 0xabbaabba;
+	ctx->wr.sg_list = &ctx->list;
+	ctx->wr.num_sge = 1;
+	ctx->wr.opcode = IBV_WR_SEND;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next = NULL;
+	if (ibv_post_send(ctx->qp, &ctx->wr, &bad_wr)) {
+		fprintf(stderr, "%d:%s: ibv_post_send failed\n", pid, __func__);
+		return;
+	}
+	do {
+		usleep(500);
+		ne = ibv_poll_cq(ctx->scq, 1, &wc);
+	} while (ne == 0);
+
+	if (ne < 0) {
+		fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid,
+			__func__, ne);
+		return;
+	}
+
+	if (wc.status)
+		fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__,
+			wc.status);
+	if (wc.opcode != IBV_WC_SEND)
+		fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__,
+			wc.opcode);
+	if (wc.wr_id != 0xabbaabba)
+		fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__,
+			(int)wc.wr_id);
+}
+
+/*
+ * Tear down an RDMA CM connection.
+ *
+ * The client (data.servername set) initiates the disconnect; both sides
+ * then wait for one CM event, which is expected to be
+ * RDMA_CM_EVENT_DISCONNECTED, acknowledge it and destroy the cm_id and
+ * the event channel.
+ *
+ * If rdma_disconnect() fails the function returns early without
+ * destroying anything (as before).  If fetching the event fails, the
+ * error is reported and the event is neither inspected nor acked, but
+ * the id and channel are still destroyed.
+ */
+static void pp_close_cma(struct pp_data data)
+{
+	struct rdma_cm_event *event;
+	int rc;
+
+	if (data.servername) {
+		rc = rdma_disconnect(data.cm_id);
+		if (rc) {
+			perror("rdma_disconnect");
+			fprintf(stderr, "%d:%s: rdma disconnect error\n", pid,
+				__func__);
+			return;
+		}
+	}
+
+	if (rdma_get_cm_event(data.cm_channel, &event)) {
+		/* event was not filled in: do not touch it */
+		perror("rdma_get_cm_event");
+		fprintf(stderr, "%d:%s: rdma_get_cm_event error\n", pid,
+			__func__);
+	} else {
+		if (event->event != RDMA_CM_EVENT_DISCONNECTED)
+			fprintf(stderr, "%d:%s: unexpected event during disconnect %d\n",
+				pid, __func__, event->event);
+		rdma_ack_cm_event(event);
+	}
+	rdma_destroy_id(data.cm_id);
+	rdma_destroy_event_channel(data.cm_channel);
+}
+
+/*
+ * Print the command-line synopsis and the option list to stdout.
+ * argv0 is the program name shown in the two invocation examples.
+ */
+static void usage(const char *argv0)
+{
+	static const char *const option_help[] = {
+		" -p, --port=<port> listen on/connect to port <port> (default 18515)\n",
+		" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n",
+		" -i, --ib-port=<port> use port <port> of IB device (default 1)\n",
+		" -s, --size=<size> size of message to exchange (default 65536)\n",
+		" -t, --tx-depth=<dep> size of tx queue (default 100)\n",
+		" -n, --iters=<iters> number of exchanges (at least 2, default 1000)\n",
+		" -S, --sl=<sl> SL (default 0)\n",
+		" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n",
+		" -c, --cma use RDMA CM\n",
+	};
+	size_t k;
+
+	printf("Usage:\n");
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf(" %s <host> connect to server at <host>\n", argv0);
+	printf("\n");
+	printf("Options:\n");
+
+	/* The option lines contain no conversions, so emit them verbatim. */
+	for (k = 0; k < sizeof option_help / sizeof option_help[0]; ++k)
+		fputs(option_help[k], stdout);
+}
+
+/*
+ * Print peak and average bandwidth plus "service demand" figures.
+ *
+ * iters      - number of entries in tposted[] and tcompleted[]
+ * size       - message size in bytes
+ * duplex     - non-zero doubles the byte count per iteration
+ * tposted    - cycle stamp taken just before each post
+ * tcompleted - cycle stamp taken after each completion
+ *
+ * The peak is found by trying every (first, last) window of iterations
+ * and keeping the one with the smallest cycles-per-iteration.
+ */
+static void print_report(unsigned int iters, unsigned size, int duplex,
+			 cycles_t *tposted, cycles_t *tcompleted)
+{
+	int first, last;
+	int best_first = 0, best_last = 0;
+	cycles_t best_delta;
+	cycles_t per_iter;
+	cycles_t elapsed;
+	double hz;
+	unsigned long bytes;	/* bytes transferred per iteration */
+
+	best_delta = tcompleted[best_first] - tposted[best_last];
+
+	/* Find the peak bandwidth */
+	for (first = 0; first < iters; ++first) {
+		for (last = first; last < iters; ++last) {
+			per_iter = (tcompleted[last] - tposted[first]) /
+				   (last - first + 1);
+			if (per_iter < best_delta) {
+				best_delta = per_iter;
+				best_first = first;
+				best_last = last;
+			}
+		}
+	}
+
+	hz = get_cpu_mhz(0) * 1000000;
+
+	bytes = duplex ? 2 : 1;
+	bytes = bytes * size;
+
+	/* whole run: first post to last completion */
+	elapsed = tcompleted[iters - 1] - tposted[0];
+
+	printf("\n%d: Bandwidth peak (#%d to #%d): %g MB/sec\n", pid,
+	       best_first, best_last,
+	       bytes * hz / best_delta / 0x100000);
+	printf("%d: Bandwidth average: %g MB/sec\n", pid,
+	       bytes * iters * hz / elapsed / 0x100000);
+
+	printf("%d: Service Demand peak (#%d to #%d): %ld cycles/KB\n", pid,
+	       best_first, best_last,
+	       (unsigned long)best_delta * 1024 / bytes);
+	printf("%d: Service Demand Avg : %ld cycles/KB\n", pid,
+	       (unsigned long)elapsed * 1024 / (bytes * iters));
+}
+
+
+/*
+ * RDMA write bandwidth test.
+ *
+ * Flow: parse options, set up the connection (RDMA CM, or plain verbs
+ * with a TCP side channel for exchanging addresses), then post
+ * `iters` RDMA writes while keeping at most tx_depth outstanding, and
+ * finally print the bandwidth report.  In a unidirectional run the
+ * server only waits for the client to finish.
+ *
+ * Returns 0 on success and 1 on any setup or run-time failure.
+ */
+int main(int argc, char *argv[])
+{
+	struct ibv_device **dev_list;
+	struct pingpong_context *ctx = NULL;
+	char *ib_devname = NULL;
+	int iters = 1000;
+	int scnt, ccnt;
+	int duplex = 0;
+	struct ibv_qp *qp;
+	cycles_t *tposted;
+	cycles_t *tcompleted;
+	struct pp_data data = {
+		.port = 18515,
+		.ib_port = 1,
+		.size = 65536,
+		.tx_depth = 100,
+		.use_cma = 0,
+		.servername = NULL,
+		.rem_dest = NULL,
+		.ib_dev = NULL,
+		.cm_channel = NULL,
+		.cm_id = NULL
+
+	};
+
+	/* Parameter parsing. */
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "tx-depth", .has_arg = 1, .val = 't' },
+			{ .name = "sl", .has_arg = 1, .val = 'S' },
+			{ .name = "bidirectional", .has_arg = 0, .val = 'b' },
+			{ .name = "cma", .has_arg = 0, .val = 'c' },
+			{ 0 }
+		};
+
+		c = getopt_long(argc, argv, "p:d:i:s:n:t:S:bc", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			data.port = strtol(optarg, NULL, 0);
+			if (data.port < 0 || data.port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+
+		case 'i':
+			data.ib_port = strtol(optarg, NULL, 0);
+			if (data.ib_port < 0) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 's':
+			data.size = strtoll(optarg, NULL, 0);
+			if (data.size < 1 || data.size > UINT_MAX / 2) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 't':
+			data.tx_depth = strtol(optarg, NULL, 0);
+			if (data.tx_depth < 1) { usage(argv[0]); return 1; }
+			break;
+
+		case 'n':
+			iters = strtol(optarg, NULL, 0);
+			if (iters < 2) {
+				usage(argv[0]);
+				return 1;
+			}
+
+			break;
+
+		case 'S':
+			sl = strtol(optarg, NULL, 0);
+			/* a service level is 0..15; reject negatives too */
+			if (sl < 0 || sl > 15) { usage(argv[0]); return 1; }
+			break;
+
+		case 'b':
+			duplex = 1;
+			break;
+
+		case 'c':
+			data.use_cma = 1;
+			break;
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (optind == argc - 1)
+		data.servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	/* Get the PID and prepend it to every output on stdout/stderr
+	 * This helps to parse output when multiple client/server are
+	 * run from single host
+	 */
+	pid = getpid();
+
+	printf("%d: | port=%d | ib_port=%d | size=%d | tx_depth=%d | sl=%d | iters=%d | duplex=%d | cma=%d |\n",
+	       pid, data.port, data.ib_port, data.size, data.tx_depth,
+	       sl, iters, duplex, data.use_cma);
+
+	/* Done with parameter parsing. Perform setup. */
+
+	srand48(pid * time(NULL));
+
+	page_size = sysconf(_SC_PAGESIZE);
+
+	if (data.use_cma) {
+		data.cm_channel = rdma_create_event_channel();
+		if (!data.cm_channel) {
+			fprintf(stderr, "%d:%s: rdma_create_event_channel failed\n",
+				pid, __func__);
+			return 1;
+		}
+		if (rdma_create_id(data.cm_channel, &data.cm_id, NULL, RDMA_PS_TCP)) {
+			fprintf(stderr, "%d:%s: rdma_create_id failed\n",
+				pid, __func__);
+			return 1;
+		}
+
+		if (data.servername) {
+			ctx = pp_client_connect(&data);
+			if (!ctx)
+				return 1;
+		} else {
+			ctx = pp_server_connect(&data);
+			if (!ctx)
+				return 1;
+		}
+	} else {
+		dev_list = ibv_get_device_list(NULL);
+		/* NULL means the list could not be obtained at all; it must
+		 * not be indexed or walked. */
+		if (!dev_list) {
+			fprintf(stderr, "%d:%s: Failed to get IB device list\n",
+				pid, __func__);
+			return 1;
+		}
+
+		if (!ib_devname) {
+			data.ib_dev = dev_list[0];
+			if (!data.ib_dev) {
+				fprintf(stderr, "%d:%s: No IB devices found\n",
+					pid, __func__);
+				return 1;
+			}
+		} else {
+			for (; (data.ib_dev = *dev_list); ++dev_list)
+				if (!strcmp(ibv_get_device_name(data.ib_dev), ib_devname))
+					break;
+			if (!data.ib_dev) {
+				fprintf(stderr, "%d:%s: IB device %s not found\n",
+					pid, __func__, ib_devname);
+				return 1;
+			}
+		}
+		if (data.servername) {
+			ctx = pp_client_connect(&data);
+			if (!ctx)
+				return 1;
+		} else {
+			ctx = pp_server_connect(&data);
+			if (!ctx)
+				return 1;
+		}
+		data.my_dest.lid = pp_get_local_lid(ctx, data.ib_port);
+		if (!data.my_dest.lid) {
+			fprintf(stderr, "%d:%s: Local lid 0x0 detected. Is an SM running?\n",
+				pid, __func__);
+			return 1;
+		}
+		data.my_dest.qpn = ctx->qp->qp_num;
+		data.my_dest.psn = lrand48() & 0xffffff;
+		data.my_dest.rkey = ctx->mr->rkey;
+		/* remote writes land in the second half of the buffer */
+		data.my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+		/* Create connection between client and server.
+		 * We do it by exchanging data over a TCP socket connection. */
+		if (data.servername) {
+			if (pp_client_exch_dest(&data))
+				return 1;
+		} else {
+			if (pp_server_exch_dest(&data))
+				return 1;
+		}
+	}
+
+	printf("%d: Local address: LID %#04x, QPN %#06x, PSN %#06x "
+	       "RKey %#08x VAddr %#016Lx\n", pid,
+	       data.my_dest.lid, data.my_dest.qpn, data.my_dest.psn,
+	       data.my_dest.rkey, data.my_dest.vaddr);
+
+	printf("%d: Remote address: LID %#04x, QPN %#06x, PSN %#06x, "
+	       "RKey %#08x VAddr %#016Lx\n\n", pid,
+	       data.rem_dest->lid, data.rem_dest->qpn, data.rem_dest->psn,
+	       data.rem_dest->rkey, data.rem_dest->vaddr);
+
+	if (data.use_cma) {
+		/*
+		 * Synch up and force the server to wait for the client to send
+		 * the first message (MPA requirement).
+		 */
+		if (data.servername) {
+			pp_send_start(ctx);
+		} else {
+			pp_wait_for_start(ctx);
+		}
+
+	} else {
+		if (pp_connect_ctx(ctx, data))
+			return 1;
+
+		/* An additional handshake is required *after* moving qp to RTR.
+		   Arbitrarily reuse exch_dest for this purpose. */
+		if (data.servername) {
+			if (pp_client_exch_dest(&data))
+				return 1;
+		} else {
+			if (pp_server_exch_dest(&data))
+				return 1;
+		}
+	}
+
+	/* For half duplex tests, server just waits for client to exit */
+	if (!data.servername && !duplex) {
+		if (data.use_cma) {
+			pp_wait_for_done(ctx);
+			pp_send_done(ctx);
+			pp_close_cma(data);
+		} else {
+			pp_server_exch_dest(&data);
+			write(data.sockfd, "done", sizeof "done");
+			close(data.sockfd);
+		}
+		return 0;
+	}
+
+	/* One reusable RDMA WRITE work request covering the first half of
+	 * the local buffer. */
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = ctx->size;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr.rdma.remote_addr = data.rem_dest->vaddr;
+	ctx->wr.wr.rdma.rkey = data.rem_dest->rkey;
+	ctx->wr.wr_id = PINGPONG_RDMA_WRID;
+	ctx->wr.sg_list = &ctx->list;
+	ctx->wr.num_sge = 1;
+	ctx->wr.opcode = IBV_WR_RDMA_WRITE;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next = NULL;
+
+	scnt = 0;
+	ccnt = 0;
+
+	qp = ctx->qp;
+
+	tposted = malloc(iters * sizeof *tposted);
+
+	if (!tposted) {
+		perror("malloc");
+		return 1;
+	}
+
+	tcompleted = malloc(iters * sizeof *tcompleted);
+
+	if (!tcompleted) {
+		perror("malloc");
+		free(tposted);
+		return 1;
+	}
+
+	/* Done with setup. Start the test. */
+
+	while (scnt < iters || ccnt < iters) {
+
+		/* keep the send queue filled up to tx_depth */
+		while (scnt < iters && scnt - ccnt < data.tx_depth) {
+			struct ibv_send_wr *bad_wr;
+			tposted[scnt] = get_cycles();
+
+			if (ibv_post_send(qp, &ctx->wr, &bad_wr)) {
+				fprintf(stderr, "%d:%s: Couldn't post send: scnt=%d\n",
+					pid, __func__, scnt);
+				return 1;
+			}
+			++scnt;
+		}
+
+		if (ccnt < iters) {
+			struct ibv_wc wc;
+			int ne;
+			do {
+				ne = ibv_poll_cq(ctx->scq, 1, &wc);
+			} while (ne == 0);
+
+			tcompleted[ccnt] = get_cycles();
+
+			if (ne < 0) {
+				fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid,
+					__func__, ne);
+				return 1;
+			}
+			if (wc.status != IBV_WC_SUCCESS) {
+				fprintf(stderr, "%d:%s: Completion with error at %s:\n",
+					pid, __func__, data.servername ? "client" : "server");
+				fprintf(stderr, "%d:%s: Failed status %d: wr_id %d\n",
+					pid, __func__, wc.status, (int) wc.wr_id);
+				fprintf(stderr, "%d:%s: scnt=%d, ccnt=%d\n",
+					pid, __func__, scnt, ccnt);
+				return 1;
+			}
+			ccnt += 1;
+		}
+	}
+
+	if (data.use_cma) {
+		/* This is racy when duplex mode is used*/
+		pp_send_done(ctx);
+		pp_wait_for_done(ctx);
+		pp_close_cma(data);
+	} else {
+		if (data.servername)
+			pp_client_exch_dest(&data);
+		else
+			pp_server_exch_dest(&data);
+
+		write(data.sockfd, "done", sizeof "done");
+		close(data.sockfd);
+
+	}
+
+	print_report(iters, data.size, duplex, tposted, tcompleted);
+
+	free(tposted);
+	free(tcompleted);
+	return 0;
+}
diff --git a/rdma_lat.c b/rdma_lat.c
new file mode 100755
index 0000000..1f65086
--- /dev/null
+++ b/rdma_lat.c
@@ -0,0 +1,1307 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler)
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include "get_clock.h"
+
+/* Work-request id constant; presumably stamped on the test's RDMA
+ * writes -- the use is outside this excerpt, confirm. */
+#define PINGPONG_RDMA_WRID 3
+/* Default for inline_size below. */
+#define MAX_INLINE 400
+
+/* NOTE(review): presumably the max_inline_data requested for the QP;
+ * the consumer is not visible in this excerpt. */
+static int inline_size = MAX_INLINE;
+/* NOTE(review): looks like the InfiniBand service level (from the
+ * name); its use is not visible in this excerpt. */
+static int sl = 0;
+/* Alignment passed to memalign() for the work buffer. */
+static int page_size;
+/* Process id, prefixed to diagnostic messages. */
+static pid_t pid;
+
+/* Flags selecting how results are reported.
+ * NOTE(review): the consumers are outside this excerpt; the meaning of
+ * unsorted/histogram is inferred from the names only -- confirm. */
+struct report_options {
+ int unsorted;
+ int histogram;
+ int cycles; /* report delta's in cycles, not microsec's */
+};
+
+
+/* Per-connection verbs state built by pp_init_ctx(). */
+struct pingpong_context {
+ /* device context: cm_id->verbs with RDMA CM, else ibv_open_device() */
+ struct ibv_context *context;
+ struct ibv_pd *pd;
+ /* registration of buf (2 * size bytes) */
+ struct ibv_mr *mr;
+ /* presumably receive / send completion queues -- creation is
+ * outside this excerpt */
+ struct ibv_cq *rcq;
+ struct ibv_cq *scq;
+ struct ibv_qp *qp;
+ /* page-aligned, zero-filled work buffer of 2 * size bytes */
+ void *buf;
+ /* last byte of the first half of buf (buf + size - 1) */
+ volatile char *post_buf;
+ /* last byte of the second half of buf (buf + 2 * size - 1) */
+ volatile char *poll_buf;
+ /* copied from pp_data.size */
+ int size;
+ /* copied from pp_data.tx_depth */
+ int tx_depth;
+ struct ibv_sge list;
+ struct ibv_send_wr wr;
+};
+
+/* Addressing record describing one end of the connection.  It is sent
+ * as text (KEY_PRINT_FMT) over TCP, or as raw private data with
+ * RDMA CM. */
+struct pingpong_dest {
+ int lid;
+ /* set to 0 on the RDMA CM paths */
+ int qpn;
+ /* random 24-bit value (lrand48() & 0xffffff) */
+ int psn;
+ /* rkey of the memory region covering the work buffer */
+ unsigned rkey;
+ /* address of the second half of the work buffer (buf + size) */
+ unsigned long long vaddr;
+};
+
+/* Test parameters plus connection state shared by the setup helpers. */
+struct pp_data {
+ /* TCP / RDMA CM port to listen on or connect to */
+ int port;
+ int ib_port;
+ /* message size; the work buffer is twice this */
+ unsigned size;
+ int tx_depth;
+ /* non-zero: connect through RDMA CM instead of a TCP side channel */
+ int use_cma;
+ /* connected TCP socket (set on the non-CMA paths only) */
+ int sockfd;
+ /* host to connect to (client side) */
+ char *servername;
+ struct pingpong_dest my_dest;
+ /* heap-allocated peer record; replaced on every exchange */
+ struct pingpong_dest *rem_dest;
+ struct ibv_device *ib_dev;
+ struct rdma_event_channel *cm_channel;
+ struct rdma_cm_id *cm_id;
+
+};
+
+/* Forward declarations: pp_init_ctx() is called by the connect helpers
+ * before its definition; the others are presumably defined later in
+ * the file as well. */
+static void pp_post_recv(struct pingpong_context *);
+static void pp_wait_for_done(struct pingpong_context *);
+static void pp_send_done(struct pingpong_context *);
+static void pp_wait_for_start(struct pingpong_context *);
+static void pp_send_start(struct pingpong_context *);
+static void pp_close_cma(struct pp_data );
+static struct pingpong_context *pp_init_ctx(void *, struct pp_data *);
+
+
+/*
+ * Look up the LID of the given port on ctx's device.
+ * Returns the LID, or 0 when ibv_query_port() fails.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+	struct ibv_port_attr port_attr;
+	int rc = ibv_query_port(ctx->context, port, &port_attr);
+
+	return rc ? 0 : port_attr.lid;
+}
+
+/*
+ * Pick the IB device to use.
+ *
+ * ib_devname - device name to look for, or NULL for the first device.
+ *
+ * Returns the matching device, or NULL (after printing a message) when
+ * the device list cannot be obtained, is empty, or holds no device of
+ * that name.
+ */
+static struct ibv_device *pp_find_dev(const char *ib_devname)
+{
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev = NULL;
+
+	dev_list = ibv_get_device_list(NULL);
+	/* NULL means the list itself is unavailable; do not index it. */
+	if (!dev_list) {
+		fprintf(stderr, "Failed to get IB device list\n");
+		return NULL;
+	}
+
+	/* The list is deliberately not freed: the returned device has not
+	 * been opened yet and points into it. */
+	if (!ib_devname) {
+		ib_dev = dev_list[0];
+		if (!ib_dev)
+			fprintf(stderr, "No IB devices found\n");
+	} else {
+		for (; (ib_dev = *dev_list); ++dev_list) {
+			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+				break;
+		}
+		if (!ib_dev)
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+	}
+	return ib_dev;
+}
+
+/* Size of one textual address record, including the terminating NUL. */
+#define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000")
+/* lid:qpn:psn:rkey:vaddr in hex; used both to format (sprintf) and to
+ * parse (sscanf) a record. */
+#define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016Lx"
+
+/*
+ * Format my_dest as one KEY_PRINT_FMT record and write it, including
+ * the terminating NUL, to sockfd.
+ *
+ * Returns 0 on success, -1 if the record does not fit the fixed-size
+ * message or the write is short or fails.
+ */
+static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest)
+{
+	char msg[KEY_MSG_SIZE];
+	int len;
+
+	/* Bounded formatting: a field wider than its slot must not
+	 * overrun msg. */
+	len = snprintf(msg, sizeof msg, KEY_PRINT_FMT, my_dest->lid,
+		       my_dest->qpn, my_dest->psn, my_dest->rkey,
+		       my_dest->vaddr);
+	if (len < 0 || (size_t) len >= sizeof msg) {
+		fprintf(stderr, "Couldn't format local address\n");
+		return -1;
+	}
+
+	if (write(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client write");
+		fprintf(stderr, "Couldn't send local address\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read one KEY_MSG_SIZE record from sockfd and parse it into rem_dest.
+ *
+ * my_dest is accepted for symmetry with pp_write_keys() but is not
+ * used.
+ *
+ * Returns 0 on success, -1 on a short/failed read or a record that
+ * does not parse into all five fields.
+ */
+static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest,
+			struct pingpong_dest *rem_dest)
+{
+	int parsed;
+	char msg[KEY_MSG_SIZE];
+
+	if (read(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("pp_read_keys");
+		fprintf(stderr, "Couldn't read remote address\n");
+		return -1;
+	}
+	/* The peer controls these bytes: make sure sscanf() stops inside
+	 * the buffer even if no terminator was sent. */
+	msg[sizeof msg - 1] = '\0';
+
+	parsed = sscanf(msg, KEY_PRINT_FMT, &rem_dest->lid, &rem_dest->qpn,
+			&rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr);
+
+	if (parsed != 5) {
+		fprintf(stderr, "Couldn't parse line <%.*s>\n",
+			(int)sizeof msg, msg);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Client-side connection setup.
+ *
+ * With RDMA CM: resolve address and route on data->cm_id (sharing a
+ * budget of 10 retries), build the context, connect with my_dest as
+ * private data and copy the server's record from the ESTABLISHED event
+ * into a newly allocated data->rem_dest.
+ *
+ * Without RDMA CM: open a TCP connection to servername:port, store it
+ * in data->sockfd and build the context on data->ib_dev.
+ *
+ * Returns the new context, or NULL on failure.  On a failure in the
+ * RDMA CM branch the cm_id and event channel are destroyed; the same
+ * happens on a TCP-branch failure, because err3 is reached by falling
+ * through err2.
+ */
+static struct pingpong_context *pp_client_connect(struct pp_data *data)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int n;
+	int sockfd = -1;
+	int n_retries = 10;
+	struct rdma_cm_event *event;
+	struct sockaddr_in sin;
+	struct pingpong_context *ctx = NULL;
+	struct rdma_conn_param conn_param;
+
+	if (asprintf(&service, "%d", data->port) < 0)
+		goto err4;
+
+	n = getaddrinfo(data->servername, service, &hints, &res);
+	/* the port string is only needed for the lookup */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value */
+	if (n != 0) {
+		fprintf(stderr, "%d:%s: %s for %s:%d\n",
+			pid, __func__, gai_strerror(n),
+			data->servername, data->port);
+		goto err4;
+	}
+
+	if (data->use_cma) {
+		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
+		sin.sin_family = AF_INET;
+		sin.sin_port = htons(data->port);
+retry_addr:
+		if (rdma_resolve_addr(data->cm_id, NULL,
+				      (struct sockaddr *)&sin, 2000)) {
+			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
+				pid, __func__ );
+			goto err2;
+		}
+
+		if (rdma_get_cm_event(data->cm_channel, &event))
+			goto err2;
+
+		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
+		    && n_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_addr;
+		}
+
+		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
+			fprintf(stderr, "%d:%s: unexpected CM event %d\n",
+				pid, __func__, event->event);
+			goto err1;
+		}
+		rdma_ack_cm_event(event);
+
+retry_route:
+		if (rdma_resolve_route(data->cm_id, 2000)) {
+			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n",
+				pid, __func__);
+			goto err2;
+		}
+
+		if (rdma_get_cm_event(data->cm_channel, &event))
+			goto err2;
+
+		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
+		    && n_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_route;
+		}
+
+		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
+			fprintf(stderr, "%d:%s: unexpected CM event %d\n",
+				pid, __func__, event->event);
+			/* err1 acks the event; acking here as well would
+			 * release it twice */
+			goto err1;
+		}
+		rdma_ack_cm_event(event);
+		ctx = pp_init_ctx(data->cm_id, data);
+		if (!ctx) {
+			fprintf(stderr, "%d:%s: pp_init_ctx failed\n", pid, __func__);
+			goto err2;
+		}
+		data->my_dest.psn = lrand48() & 0xffffff;
+		data->my_dest.qpn = 0;
+		data->my_dest.rkey = ctx->mr->rkey;
+		data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+		memset(&conn_param, 0, sizeof conn_param);
+		conn_param.responder_resources = 1;
+		conn_param.initiator_depth = 1;
+		conn_param.retry_count = 5;
+		conn_param.private_data = &data->my_dest;
+		conn_param.private_data_len = sizeof(data->my_dest);
+
+		if (rdma_connect(data->cm_id, &conn_param)) {
+			fprintf(stderr, "%d:%s: rdma_connect failure\n", pid, __func__);
+			goto err2;
+		}
+
+		if (rdma_get_cm_event(data->cm_channel, &event))
+			goto err2;
+
+		if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
+			fprintf(stderr, "%d:%s: unexpected CM event %d\n",
+				pid, __func__, event->event);
+			goto err1;
+		}
+		if (!event->param.conn.private_data ||
+		    (event->param.conn.private_data_len < sizeof(*data->rem_dest))) {
+			fprintf(stderr, "%d:%s: bad private data ptr %p len %d\n",
+				pid, __func__, event->param.conn.private_data,
+				event->param.conn.private_data_len);
+			goto err1;
+		}
+		data->rem_dest = malloc(sizeof *data->rem_dest);
+		if (!data->rem_dest)
+			goto err1;
+
+		/* copy before the ack: the private data belongs to the event */
+		memcpy(data->rem_dest, event->param.conn.private_data, sizeof(*data->rem_dest));
+		rdma_ack_cm_event(event);
+	} else {
+		for (t = res; t; t = t->ai_next) {
+			sockfd = socket(t->ai_family, t->ai_socktype,
+					t->ai_protocol);
+			if (sockfd >= 0) {
+				if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+					break;
+				close(sockfd);
+				sockfd = -1;
+			}
+		}
+		if (sockfd < 0) {
+			fprintf(stderr, "%d:%s: Couldn't connect to %s:%d\n",
+				pid, __func__, data->servername, data->port);
+			goto err3;
+		}
+		ctx = pp_init_ctx(data->ib_dev, data);
+		if (!ctx) {
+			/* nobody else knows about the socket yet */
+			close(sockfd);
+			goto err3;
+		}
+		data->sockfd = sockfd;
+	}
+
+	freeaddrinfo(res);
+	return ctx;
+
+err1:
+	rdma_ack_cm_event(event);
+err2:
+	rdma_destroy_id(data->cm_id);
+	rdma_destroy_event_channel(data->cm_channel);
+err3:
+	freeaddrinfo(res);
+err4:
+	return NULL;
+
+}
+
+
+/*
+ * Client half of the TCP address exchange: replace data->rem_dest with
+ * a fresh allocation, send our record first, then read the peer's.
+ *
+ * Returns 0 on success, -1 on allocation, write or read/parse failure.
+ */
+static int pp_client_exch_dest(struct pp_data *data)
+{
+	struct pingpong_dest *peer;
+
+	/* free(NULL) is a no-op, so no guard is needed */
+	free(data->rem_dest);
+
+	peer = malloc(sizeof *peer);
+	data->rem_dest = peer;
+	if (peer == NULL)
+		return -1;
+
+	if (pp_write_keys(data->sockfd, &data->my_dest) != 0)
+		return -1;
+
+	return pp_read_keys(data->sockfd, &data->my_dest, peer);
+}
+
+/*
+ * Server-side connection setup.
+ *
+ * With RDMA CM: bind and listen on data->cm_id, wait for a connect
+ * request, copy the client's record from its private data into a newly
+ * allocated data->rem_dest, build the context on the child cm_id,
+ * accept with my_dest as private data and wait for ESTABLISHED.
+ *
+ * Without RDMA CM: listen on the TCP port, accept one connection,
+ * store it in data->sockfd and build the context on data->ib_dev.
+ *
+ * Returns the new context, or NULL on failure.  On RDMA CM failures
+ * the cm_id and event channel are destroyed.
+ */
+static struct pingpong_context *pp_server_connect(struct pp_data *data)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags = AI_PASSIVE,
+		.ai_family = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int sockfd = -1, connfd;
+	int n;
+	struct rdma_cm_event *event;
+	struct sockaddr_in sin;
+	struct pingpong_context *ctx = NULL;
+	struct rdma_cm_id *child_cm_id;
+	struct rdma_conn_param conn_param;
+
+	if (asprintf(&service, "%d", data->port) < 0)
+		goto err5;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+	/* the port string is only needed for the lookup */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value */
+	if (n != 0) {
+		fprintf(stderr, "%d:%s: %s for port %d\n", pid, __func__,
+			gai_strerror(n), data->port);
+		goto err5;
+	}
+
+	if (data->use_cma) {
+		sin.sin_addr.s_addr = 0;
+		sin.sin_family = AF_INET;
+		sin.sin_port = htons(data->port);
+		if (rdma_bind_addr(data->cm_id, (struct sockaddr *)&sin)) {
+			fprintf(stderr, "%d:%s: rdma_bind_addr failed\n", pid, __func__);
+			goto err3;
+		}
+
+		if (rdma_listen(data->cm_id, 0)) {
+			fprintf(stderr, "%d:%s: rdma_listen failed\n", pid, __func__);
+			goto err3;
+		}
+
+		if (rdma_get_cm_event(data->cm_channel, &event))
+			goto err3;
+
+		if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
+			fprintf(stderr, "%d:%s: bad event waiting for connect request %d\n",
+				pid, __func__, event->event);
+			goto err2;
+		}
+
+		if (!event->param.conn.private_data ||
+		    (event->param.conn.private_data_len < sizeof(*data->rem_dest))) {
+			fprintf(stderr, "%d:%s: bad private data len %d\n", pid,
+				__func__, event->param.conn.private_data_len);
+			goto err2;
+		}
+
+		data->rem_dest = malloc(sizeof *data->rem_dest);
+		if (!data->rem_dest)
+			goto err2;
+
+		/* copy before the ack: the private data belongs to the event */
+		memcpy(data->rem_dest, event->param.conn.private_data, sizeof(*data->rem_dest));
+
+		child_cm_id = (struct rdma_cm_id *)event->id;
+		ctx = pp_init_ctx(child_cm_id, data);
+		if (!ctx) {
+			free(data->rem_dest);
+			/* do not leave a dangling pointer in shared state */
+			data->rem_dest = NULL;
+			goto err1;
+		}
+		data->my_dest.psn = lrand48() & 0xffffff;
+		data->my_dest.qpn = 0;
+		data->my_dest.rkey = ctx->mr->rkey;
+		data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+		memset(&conn_param, 0, sizeof conn_param);
+		conn_param.responder_resources = 1;
+		conn_param.initiator_depth = 1;
+		conn_param.private_data = &data->my_dest;
+		conn_param.private_data_len = sizeof(data->my_dest);
+		if (rdma_accept(child_cm_id, &conn_param)) {
+			fprintf(stderr, "%d:%s: rdma_accept failed\n", pid, __func__);
+			goto err1;
+		}
+		rdma_ack_cm_event(event);
+		if (rdma_get_cm_event(data->cm_channel, &event)) {
+			fprintf(stderr, "%d:%s: rdma_get_cm_event error\n", pid, __func__);
+			rdma_destroy_id(child_cm_id);
+			goto err3;
+		}
+		if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
+			fprintf(stderr, "%d:%s: bad event waiting for established %d\n",
+				pid, __func__, event->event);
+			goto err1;
+		}
+		rdma_ack_cm_event(event);
+	} else {
+		for (t = res; t; t = t->ai_next) {
+			sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+			if (sockfd >= 0) {
+				n = 1;
+
+				setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+				if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+					break;
+				close(sockfd);
+				sockfd = -1;
+			}
+		}
+
+		if (sockfd < 0) {
+			fprintf(stderr, "%d:%s: Couldn't listen to port %d\n", pid,
+				__func__, data->port);
+			goto err4;
+		}
+
+		listen(sockfd, 1);
+		connfd = accept(sockfd, NULL, 0);
+		if (connfd < 0) {
+			perror("server accept");
+			fprintf(stderr, "%d:%s: accept() failed\n", pid, __func__);
+			close(sockfd);
+			goto err4;
+		}
+
+		/* only one peer is served: the listening socket is done */
+		close(sockfd);
+
+		ctx = pp_init_ctx(data->ib_dev, data);
+		if (!ctx) {
+			/* nobody else knows about the connection yet */
+			close(connfd);
+			goto err4;
+		}
+		data->sockfd = connfd;
+	}
+	freeaddrinfo(res);
+	return ctx;
+
+err1:
+	rdma_destroy_id(child_cm_id);
+err2:
+	rdma_ack_cm_event(event);
+err3:
+	rdma_destroy_id(data->cm_id);
+	rdma_destroy_event_channel(data->cm_channel);
+err4:
+	freeaddrinfo(res);
+err5:
+	return NULL;
+
+}
+
+/*
+ * Server half of the out-of-band key exchange.
+ *
+ * Drops any previously stored remote destination, allocates a fresh one,
+ * reads the peer's keys from data->sockfd and then answers with our own.
+ *
+ * Returns -1 if the allocation or the read fails, otherwise the result of
+ * pp_write_keys().
+ */
+static int pp_server_exch_dest(struct pp_data *data)
+{
+	/* free(NULL) is a no-op, so the old buffer can be released blindly. */
+	free(data->rem_dest);
+
+	data->rem_dest = malloc(sizeof *data->rem_dest);
+	if (data->rem_dest == NULL)
+		return -1;
+
+	if (pp_read_keys(data->sockfd, &data->my_dest, data->rem_dest) != 0)
+		return -1;
+
+	return pp_write_keys(data->sockfd, &data->my_dest);
+}
+
+/*
+ * Allocate and initialise a ping-pong context.
+ *
+ * ptr   - a struct rdma_cm_id * when data->use_cma is set, otherwise a
+ *         struct ibv_device *.
+ * data  - test parameters (size, tx_depth, ib_port, use_cma).
+ *
+ * Allocates a work buffer of 2 * size bytes, a PD, an MR over the buffer,
+ * separate receive and send CQs and an RC QP.  In CMA mode the QP is
+ * created through rdma_create_qp() and one receive is posted; otherwise
+ * the QP is created directly and moved to INIT.
+ *
+ * Returns the context, or NULL on failure.  On failure everything that
+ * was acquired here is released again.
+ */
+static struct pingpong_context *pp_init_ctx(void *ptr, struct pp_data *data)
+{
+	struct pingpong_context *ctx;
+	struct ibv_device *ib_dev;
+	struct rdma_cm_id *cm_id = NULL;
+	struct ibv_qp_init_attr init_attr;
+
+	ctx = malloc(sizeof *ctx);
+	if (!ctx)
+		return NULL;
+
+	ctx->size = data->size;
+	ctx->tx_depth = data->tx_depth;
+
+	ctx->buf = memalign(page_size, ctx->size * 2);
+	if (!ctx->buf) {
+		fprintf(stderr, "%d:%s: Couldn't allocate work buf.\n",
+			pid, __func__);
+		goto err_ctx;
+	}
+
+	memset(ctx->buf, 0, ctx->size * 2);
+
+	/* Last byte of the first half is posted, last byte of the second
+	 * half is polled. */
+	ctx->post_buf = (char *)ctx->buf + (ctx->size - 1);
+	ctx->poll_buf = (char *)ctx->buf + (2 * ctx->size - 1);
+
+	if (data->use_cma) {
+		cm_id = (struct rdma_cm_id *)ptr;
+		ctx->context = cm_id->verbs;
+		if (!ctx->context) {
+			fprintf(stderr, "%d:%s: Unbound cm_id!!\n", pid,
+				__func__);
+			goto err_buf;
+		}
+	} else {
+		ib_dev = (struct ibv_device *)ptr;
+		ctx->context = ibv_open_device(ib_dev);
+		if (!ctx->context) {
+			fprintf(stderr, "%d:%s: Couldn't get context for %s\n",
+				pid, __func__, ibv_get_device_name(ib_dev));
+			goto err_buf;
+		}
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "%d:%s: Couldn't allocate PD\n", pid, __func__);
+		goto err_context;
+	}
+
+	/* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
+	 * The Consumer is not allowed to assign Remote Write or Remote Atomic to
+	 * a Memory Region that has not been assigned Local Write. */
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, ctx->size * 2,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+	if (!ctx->mr) {
+		fprintf(stderr, "%d:%s: Couldn't allocate MR\n", pid, __func__);
+		goto err_pd;
+	}
+
+	ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0);
+	if (!ctx->rcq) {
+		fprintf(stderr, "%d:%s: Couldn't create recv CQ\n", pid,
+			__func__);
+		goto err_mr;
+	}
+
+	ctx->scq = ibv_create_cq(ctx->context, ctx->tx_depth, ctx, NULL, 0);
+	if (!ctx->scq) {
+		fprintf(stderr, "%d:%s: Couldn't create send CQ\n", pid,
+			__func__);
+		goto err_rcq;
+	}
+
+	memset(&init_attr, 0, sizeof init_attr);
+	init_attr.send_cq = ctx->scq;
+	init_attr.recv_cq = ctx->rcq;
+	init_attr.cap.max_send_wr = ctx->tx_depth;
+	/* Work around: driver doesnt support
+	 * recv_wr = 0 */
+	init_attr.cap.max_recv_wr = 1;
+	init_attr.cap.max_send_sge = 1;
+	init_attr.cap.max_recv_sge = 1;
+	init_attr.cap.max_inline_data = inline_size;
+	init_attr.qp_type = IBV_QPT_RC;
+
+	if (data->use_cma) {
+		if (rdma_create_qp(cm_id, ctx->pd, &init_attr)) {
+			fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__);
+			goto err_scq;
+		}
+		ctx->qp = cm_id->qp;
+		pp_post_recv(ctx);
+	} else {
+		struct ibv_qp_attr attr;
+
+		ctx->qp = ibv_create_qp(ctx->pd, &init_attr);
+		if (!ctx->qp) {
+			fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__);
+			goto err_scq;
+		}
+
+		attr.qp_state = IBV_QPS_INIT;
+		attr.pkey_index = 0;
+		attr.port_num = data->ib_port;
+		attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
+
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_PKEY_INDEX |
+				  IBV_QP_PORT |
+				  IBV_QP_ACCESS_FLAGS)) {
+			fprintf(stderr, "%d:%s: Failed to modify QP to INIT\n",
+				pid, __func__);
+			goto err_qp;
+		}
+	}
+
+	return ctx;
+
+	/* Unwind in reverse order of acquisition. */
+err_qp:
+	ibv_destroy_qp(ctx->qp);
+err_scq:
+	ibv_destroy_cq(ctx->scq);
+err_rcq:
+	ibv_destroy_cq(ctx->rcq);
+err_mr:
+	ibv_dereg_mr(ctx->mr);
+err_pd:
+	ibv_dealloc_pd(ctx->pd);
+err_context:
+	/* In CMA mode the verbs context belongs to the cm_id; only close
+	 * a context that was opened here. */
+	if (!data->use_cma)
+		ibv_close_device(ctx->context);
+err_buf:
+	free(ctx->buf);
+err_ctx:
+	free(ctx);
+	return NULL;
+}
+
+/*
+ * Drive the QP from INIT through RTR to RTS using the remote destination
+ * stored in data->rem_dest and the local PSN in data->my_dest.
+ *
+ * Returns 0 on success, 1 if either ibv_modify_qp() call fails.
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, struct pp_data *data)
+{
+	const int rtr_mask = IBV_QP_STATE |
+			     IBV_QP_AV |
+			     IBV_QP_PATH_MTU |
+			     IBV_QP_DEST_QPN |
+			     IBV_QP_RQ_PSN |
+			     IBV_QP_MAX_DEST_RD_ATOMIC |
+			     IBV_QP_MIN_RNR_TIMER;
+	const int rts_mask = IBV_QP_STATE |
+			     IBV_QP_TIMEOUT |
+			     IBV_QP_RETRY_CNT |
+			     IBV_QP_RNR_RETRY |
+			     IBV_QP_SQ_PSN |
+			     IBV_QP_MAX_QP_RD_ATOMIC;
+	struct ibv_qp_attr qp_attr;
+
+	/* Everything not set explicitly below stays zero. */
+	memset(&qp_attr, 0, sizeof qp_attr);
+
+	/* Ready-to-receive: describe the remote side. */
+	qp_attr.qp_state = IBV_QPS_RTR;
+	qp_attr.path_mtu = IBV_MTU_256;
+	qp_attr.dest_qp_num = data->rem_dest->qpn;
+	qp_attr.rq_psn = data->rem_dest->psn;
+	qp_attr.max_dest_rd_atomic = 1;
+	qp_attr.min_rnr_timer = 12;
+	qp_attr.ah_attr.is_global = 0;
+	qp_attr.ah_attr.dlid = data->rem_dest->lid;
+	qp_attr.ah_attr.sl = sl;
+	qp_attr.ah_attr.src_path_bits = 0;
+	qp_attr.ah_attr.port_num = data->ib_port;
+
+	if (ibv_modify_qp(ctx->qp, &qp_attr, rtr_mask) != 0) {
+		fprintf(stderr, "%s: Failed to modify QP to RTR\n", __func__);
+		return 1;
+	}
+
+	/* Ready-to-send: local send-side parameters. */
+	qp_attr.qp_state = IBV_QPS_RTS;
+	qp_attr.timeout = 14;
+	qp_attr.retry_cnt = 7;
+	qp_attr.rnr_retry = 7;
+	qp_attr.sq_psn = data->my_dest.psn;
+	qp_attr.max_rd_atomic = 1;
+
+	if (ibv_modify_qp(ctx->qp, &qp_attr, rts_mask) != 0) {
+		fprintf(stderr, "%s: Failed to modify QP to RTS\n", __func__);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Non-CMA connection setup: publish the local LID/QPN/PSN/rkey/vaddr,
+ * swap them with the peer over data->sockfd, connect the QP, perform a
+ * second exchange as a post-RTR handshake, send "done" and close the
+ * socket.
+ *
+ * Returns 0 on success and a non-zero value (1 or -1, depending on the
+ * failing step) otherwise.
+ */
+static int pp_open_port(struct pingpong_context *ctx, struct pp_data *data )
+{
+	static const char done_msg[] = "done";
+	char addr_fmt[] = "%8s address: LID %#04x QPN %#06x PSN %#06x RKey %#08x VAddr %#016Lx\n";
+	const int is_client = (data->servername != NULL);
+
+	/* Create connection between client and server.
+	 * We do it by exchanging data over a TCP socket connection. */
+
+	data->my_dest.lid = pp_get_local_lid(ctx, data->ib_port);
+	data->my_dest.qpn = ctx->qp->qp_num;
+	data->my_dest.psn = lrand48() & 0xffffff;
+	if (data->my_dest.lid == 0) {
+		fprintf(stderr, "Local lid 0x0 detected. Is an SM running?\n");
+		return -1;
+	}
+	data->my_dest.rkey = ctx->mr->rkey;
+	/* The peer writes into the second half of our buffer. */
+	data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+	printf(addr_fmt, "local", data->my_dest.lid, data->my_dest.qpn, data->my_dest.psn,
+	       data->my_dest.rkey, data->my_dest.vaddr);
+
+	if (is_client ? pp_client_exch_dest(data) : pp_server_exch_dest(data))
+		return 1;
+
+	printf(addr_fmt, "remote", data->rem_dest->lid, data->rem_dest->qpn,
+	       data->rem_dest->psn, data->rem_dest->rkey,
+	       data->rem_dest->vaddr);
+
+	if (pp_connect_ctx(ctx, data))
+		return 1;
+
+	/* An additional handshake is required *after* moving qp to RTR.
+	   Arbitrarily reuse exch_dest for this purpose. */
+	if (is_client ? pp_client_exch_dest(data) : pp_server_exch_dest(data))
+		return -1;
+
+	/* The terminating NUL is sent along with the text. */
+	if (write(data->sockfd, done_msg, sizeof done_msg) != sizeof done_msg) {
+		perror("write");
+		fprintf(stderr, "Couldn't write to socket\n");
+		return 1;
+	}
+
+	close(data->sockfd);
+
+	return 0;
+}
+
+/*
+ * Post a single one-byte receive (wr_id 0xdeadbeef) on ctx->qp, landing
+ * at the start of ctx->buf.  Failures are reported on stderr only.
+ */
+static void pp_post_recv(struct pingpong_context *ctx)
+{
+	struct ibv_sge sge = {
+		.addr	= (uintptr_t) ctx->buf,
+		.length	= 1,
+		.lkey	= ctx->mr->lkey
+	};
+	struct ibv_recv_wr recv_wr = {
+		.wr_id	 = 0xdeadbeef,
+		.next	 = NULL,
+		.sg_list = &sge,
+		.num_sge = 1
+	};
+	struct ibv_recv_wr *bad_wr;
+	int rc = ibv_post_recv(ctx->qp, &recv_wr, &bad_wr);
+
+	if (rc == 0)
+		return;
+
+	perror("ibv_post_recv");
+	fprintf(stderr, "%d:%s: ibv_post_recv failed %d\n", pid,
+		__func__, rc);
+}
+
+/*
+ * Poll the receive CQ (sleeping 500us between attempts) until the peer's
+ * "done" message completes, then sanity-check the completion.
+ *
+ * Problems are only reported on stderr.  If the poll itself fails the
+ * completion entry is not valid, so it is not inspected.
+ */
+static void pp_wait_for_done(struct pingpong_context *ctx)
+{
+	struct ibv_wc wc;
+	int ne;
+
+	do {
+		usleep(500);
+		ne = ibv_poll_cq(ctx->rcq, 1, &wc);
+	} while (ne == 0);
+
+	/* A negative count means the poll failed and wc was never filled. */
+	if (ne < 0) {
+		fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid, __func__, ne);
+		return;
+	}
+
+	if (wc.status)
+		fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__,
+			wc.status);
+	if (!(wc.opcode & IBV_WC_RECV))
+		fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__,
+			wc.opcode);
+	if (wc.wr_id != 0xdeadbeef)
+		fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__,
+			(int)wc.wr_id);
+}
+
+/*
+ * Send a one-byte signaled SEND (wr_id 0xcafebabe) to tell the peer the
+ * test is finished, then poll the send CQ (sleeping 500us between
+ * attempts) for its completion and sanity-check it.
+ *
+ * Problems are only reported on stderr.  If the poll itself fails the
+ * completion entry is not valid, so it is not inspected.
+ */
+static void pp_send_done(struct pingpong_context *ctx)
+{
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+	int ne;
+
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = 1;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id = 0xcafebabe;
+	ctx->wr.sg_list = &ctx->list;
+	ctx->wr.num_sge = 1;
+	ctx->wr.opcode = IBV_WR_SEND;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next = NULL;
+	if (ibv_post_send(ctx->qp, &ctx->wr, &bad_wr)) {
+		fprintf(stderr, "%d:%s: ibv_post_send failed\n", pid, __func__);
+		return;
+	}
+	do {
+		usleep(500);
+		ne = ibv_poll_cq(ctx->scq, 1, &wc);
+	} while (ne == 0);
+
+	/* A negative count means the poll failed and wc was never filled. */
+	if (ne < 0) {
+		fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid, __func__, ne);
+		return;
+	}
+
+	if (wc.status)
+		fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__,
+			wc.status);
+	if (wc.opcode != IBV_WC_SEND)
+		fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__,
+			wc.opcode);
+	if (wc.wr_id != 0xcafebabe)
+		fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__,
+			(int)wc.wr_id);
+}
+
+/*
+ * Poll the receive CQ (sleeping 500us between attempts) until the peer's
+ * "start" message completes, sanity-check the completion and post a new
+ * receive for the later "done" message.
+ *
+ * Problems are only reported on stderr.  If the poll itself fails the
+ * completion entry is not valid: nothing was consumed, so it is neither
+ * inspected nor is another receive posted.
+ */
+static void pp_wait_for_start(struct pingpong_context *ctx)
+{
+	struct ibv_wc wc;
+	int ne;
+
+	do {
+		usleep(500);
+		ne = ibv_poll_cq(ctx->rcq, 1, &wc);
+	} while (ne == 0);
+
+	/* A negative count means the poll failed and wc was never filled. */
+	if (ne < 0) {
+		fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid, __func__, ne);
+		return;
+	}
+
+	if (wc.status)
+		fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__,
+			wc.status);
+	if (!(wc.opcode & IBV_WC_RECV))
+		fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__,
+			wc.opcode);
+	if (wc.wr_id != 0xdeadbeef)
+		fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__,
+			(int)wc.wr_id);
+	pp_post_recv(ctx);
+}
+
+/*
+ * Send a one-byte signaled SEND (wr_id 0xabbaabba) to tell the peer the
+ * test may begin, then poll the send CQ (sleeping 500us between
+ * attempts) for its completion and sanity-check it.
+ *
+ * Problems are only reported on stderr.  If the poll itself fails the
+ * completion entry is not valid, so it is not inspected.
+ */
+static void pp_send_start(struct pingpong_context *ctx)
+{
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+	int ne;
+
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = 1;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id = 0xabbaabba;
+	ctx->wr.sg_list = &ctx->list;
+	ctx->wr.num_sge = 1;
+	ctx->wr.opcode = IBV_WR_SEND;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next = NULL;
+	if (ibv_post_send(ctx->qp, &ctx->wr, &bad_wr)) {
+		fprintf(stderr, "%d:%s: ibv_post_send failed\n", pid, __func__);
+		return;
+	}
+	do {
+		usleep(500);
+		ne = ibv_poll_cq(ctx->scq, 1, &wc);
+	} while (ne == 0);
+
+	/* A negative count means the poll failed and wc was never filled. */
+	if (ne < 0) {
+		fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid, __func__, ne);
+		return;
+	}
+
+	if (wc.status)
+		fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__,
+			wc.status);
+	if (wc.opcode != IBV_WC_SEND)
+		fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__,
+			wc.opcode);
+	if (wc.wr_id != 0xabbaabba)
+		fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__,
+			(int)wc.wr_id);
+}
+
+/*
+ * Tear down an RDMA CM connection.
+ *
+ * The client side (data.servername set) initiates the disconnect; both
+ * sides then wait for the DISCONNECTED event before destroying the cm_id
+ * and the event channel.  If rdma_disconnect() fails the function returns
+ * without destroying anything.  If no event can be fetched there is
+ * nothing to inspect or acknowledge, but the id and channel are still
+ * destroyed.
+ */
+static void pp_close_cma(struct pp_data data)
+{
+	struct rdma_cm_event *event;
+	int rc;
+
+	if (data.servername) {
+		rc = rdma_disconnect(data.cm_id);
+		if (rc) {
+			perror("rdma_disconnect");
+			fprintf(stderr, "%d:%s: rdma disconnect error\n", pid,
+				__func__);
+			return;
+		}
+	}
+
+	if (rdma_get_cm_event(data.cm_channel, &event)) {
+		/* event is not valid here: do not touch or ack it. */
+		perror("rdma_get_cm_event");
+		fprintf(stderr, "%d:%s: rdma_get_cm_event error\n", pid,
+			__func__);
+	} else {
+		if (event->event != RDMA_CM_EVENT_DISCONNECTED)
+			fprintf(stderr, "%d:%s: unexpected event during disconnect %d\n",
+				pid, __func__, event->event);
+		rdma_ack_cm_event(event);
+	}
+	rdma_destroy_id(data.cm_id);
+	rdma_destroy_event_channel(data.cm_channel);
+}
+
+/*
+ * Print the command-line synopsis and the list of supported options to
+ * stdout.  argv0 is the program name substituted into the two synopsis
+ * lines.
+ */
+static void usage(const char *argv0)
+{
+ printf("Usage:\n");
+ printf(" %s start a server and wait for connection\n", argv0);
+ printf(" %s <host> connect to server at <host>\n", argv0);
+ printf("\n");
+ printf("Options:\n");
+ printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n");
+ printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n");
+ printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n");
+ printf(" -s, --size=<size> size of message to exchange (default 1)\n");
+ printf(" -t, --tx-depth=<dep> size of tx queue (default 50)\n");
+ printf(" -n, --iters=<iters> number of exchanges (at least 2, default 1000)\n");
+ printf(" -S, --sl=<sl> SL (default 0)\n");
+ printf(" -I, --inline_size=<size> max size of message to be sent in inline mode (default 400)\n");
+ printf(" -C, --report-cycles report times in cpu cycle units (default microseconds)\n");
+ printf(" -H, --report-histogram print out all results (default print summary only)\n");
+ printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n");
+ printf(" -c, --cma Use the RDMA CMA to setup the RDMA connection\n");
+}
+
+/*
+ * Median of n samples held in delta[], which the caller has already
+ * sorted:
+ *   - odd n:  the single middle element;
+ *   - even n: the (integer) mean of the two middle elements.
+ */
+static inline cycles_t get_median(int n, cycles_t delta[])
+{
+	const int mid = n / 2;
+	const int n_is_even = (n % 2 == 0);
+
+	return n_is_even ? (delta[mid] + delta[mid - 1]) / 2
+			 : delta[mid];
+}
+
+/*
+ * qsort() comparator ordering cycles_t values ascending.
+ * Returns -1, 0 or 1.
+ */
+static int cycles_compare(const void * aptr, const void * bptr)
+{
+	const cycles_t lhs = *(const cycles_t *)aptr;
+	const cycles_t rhs = *(const cycles_t *)bptr;
+
+	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Print latency statistics derived from iters timestamps.
+ *
+ * options - selects cycle vs. usec units and whether the unsorted and/or
+ *           sorted per-sample lists are printed.
+ * iters   - number of entries in tstamp[]; at least 2 are needed to form
+ *           one delta.
+ * tstamp  - timestamps taken before each post.
+ *
+ * Each delta between consecutive timestamps is halved when printed.
+ * Prints the median ("typical"), the smallest and the largest value.
+ */
+static void print_report(struct report_options * options,
+			 unsigned int iters, cycles_t *tstamp)
+{
+	double cycles_to_units;
+	cycles_t median;
+	unsigned int i;
+	unsigned int n_delta;
+	const char* units;
+	cycles_t *delta;
+
+	/* With fewer than two samples iters - 2 would wrap around and
+	 * index far outside the array. */
+	if (iters < 2) {
+		fprintf(stderr, "%s: need at least 2 samples, got %u\n",
+			__func__, iters);
+		return;
+	}
+	n_delta = iters - 1;
+
+	delta = malloc(n_delta * sizeof *delta);
+	if (!delta) {
+		perror("malloc");
+		return;
+	}
+
+	for (i = 0; i < n_delta; ++i)
+		delta[i] = tstamp[i + 1] - tstamp[i];
+
+	if (options->cycles) {
+		cycles_to_units = 1;
+		units = "cycles";
+	} else {
+		cycles_to_units = get_cpu_mhz(0);
+		units = "usec";
+	}
+
+	/* The unsorted dump has to happen before qsort() reorders delta[]. */
+	if (options->unsorted) {
+		printf("#, %s\n", units);
+		for (i = 0; i < n_delta; ++i)
+			printf("%u, %g\n", i + 1, delta[i] / cycles_to_units / 2);
+	}
+
+	qsort(delta, n_delta, sizeof *delta, cycles_compare);
+
+	if (options->histogram) {
+		printf("#, %s\n", units);
+		for (i = 0; i < n_delta; ++i)
+			printf("%u, %g\n", i + 1, delta[i] / cycles_to_units / 2);
+	}
+
+	median = get_median(n_delta, delta);
+
+	printf("Latency typical: %g %s\n", median / cycles_to_units / 2, units);
+	printf("Latency best : %g %s\n", delta[0] / cycles_to_units / 2, units);
+	printf("Latency worst : %g %s\n", delta[n_delta - 1] / cycles_to_units / 2, units);
+
+	free(delta);
+}
+
+/*
+ * RDMA write latency test entry point.
+ *
+ * Parses the options, connects to the peer (through RDMA CM with -c,
+ * otherwise through a TCP side channel), then runs the ping-pong loop:
+ * each side waits for the peer's write to change the last byte of its
+ * poll buffer, stamps the time, writes its own counter byte to the peer
+ * and reaps the send completion.  Finally the latency report is printed.
+ *
+ * Returns 0 on success and a non-zero code identifying the failing step
+ * otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	const char *ib_devname = NULL;
+	int iters = 1000;
+	struct report_options report = {};
+
+	struct pingpong_context *ctx;
+
+	struct ibv_qp *qp;
+	struct ibv_send_wr *wr;
+	volatile char *poll_buf;
+	volatile char *post_buf;
+
+	int scnt, rcnt, ccnt;
+
+	cycles_t *tstamp;
+
+	struct pp_data data = {
+		.port = 18515,
+		.ib_port = 1,
+		.size = 1,
+		.tx_depth = 50,
+		.use_cma = 0,
+		.servername = NULL,
+		.rem_dest = NULL,
+		.ib_dev = NULL,
+		.cm_channel = NULL,
+		.cm_id = NULL
+	};
+
+	/* Parameter parsing. */
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "tx-depth", .has_arg = 1, .val = 't' },
+			{ .name = "sl", .has_arg = 1, .val = 'S' },
+			{ .name = "inline_size", .has_arg = 1, .val = 'I' },
+			{ .name = "report-cycles", .has_arg = 0, .val = 'C' },
+			{ .name = "report-histogram",.has_arg = 0, .val = 'H' },
+			{ .name = "report-unsorted",.has_arg = 0, .val = 'U' },
+			{ .name = "cma", .has_arg = 0, .val = 'c' },
+			{ 0 }
+		};
+
+		c = getopt_long(argc, argv, "p:d:i:s:n:t:S:I:CHUc", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			data.port = strtol(optarg, NULL, 0);
+			if (data.port < 0 || data.port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+
+		case 'i':
+			data.ib_port = strtol(optarg, NULL, 0);
+			if (data.ib_port < 0) {
+				usage(argv[0]);
+				return 2;
+			}
+			break;
+
+		case 's':
+			data.size = strtol(optarg, NULL, 0);
+			if (data.size < 1) { usage(argv[0]); return 3; }
+			break;
+
+		case 't':
+			data.tx_depth = strtol(optarg, NULL, 0);
+			if (data.tx_depth < 1) { usage(argv[0]); return 4; }
+			break;
+
+		case 'n':
+			iters = strtol(optarg, NULL, 0);
+			if (iters < 2) {
+				usage(argv[0]);
+				return 5;
+			}
+			break;
+
+		case 'S':
+			sl = strtol(optarg, NULL, 0);
+			/* Only 0..15 is accepted; reject negatives too. */
+			if (sl < 0 || sl > 15) { usage(argv[0]); return 6; }
+			break;
+
+		case 'I':
+			inline_size = strtol(optarg, NULL, 0);
+			break;
+
+		case 'C':
+			report.cycles = 1;
+			break;
+
+		case 'H':
+			report.histogram = 1;
+			break;
+
+		case 'U':
+			report.unsorted = 1;
+			break;
+
+		case 'c':
+			data.use_cma = 1;
+			break;
+
+		default:
+			usage(argv[0]);
+			return 7;
+		}
+	}
+
+	/* One positional argument makes us the client; none the server. */
+	if (optind == argc - 1)
+		data.servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 6;
+	}
+
+	/*
+	 * Done with parameter parsing. Perform setup.
+	 */
+	pid = getpid();
+
+	srand48(pid * time(NULL));
+	page_size = sysconf(_SC_PAGESIZE);
+
+	if (data.use_cma) {
+		data.cm_channel = rdma_create_event_channel();
+		if (!data.cm_channel) {
+			fprintf(stderr, "%d:%s: rdma_create_event_channel failed\n",
+				pid, __func__);
+			return 1;
+		}
+		if (rdma_create_id(data.cm_channel, &data.cm_id, NULL, RDMA_PS_TCP)) {
+			fprintf(stderr, "%d:%s: rdma_create_id failed\n",
+				pid, __func__);
+			rdma_destroy_event_channel(data.cm_channel);
+			return 1;
+		}
+
+		if (data.servername) {
+			ctx = pp_client_connect(&data);
+			if (!ctx)
+				return 1;
+		} else {
+			ctx = pp_server_connect(&data);
+			if (!ctx)
+				return 1;
+		}
+
+		printf("%d: Local address: LID %#04x, QPN %#06x, PSN %#06x "
+		       "RKey %#08x VAddr %#016Lx\n", pid,
+		       data.my_dest.lid, data.my_dest.qpn, data.my_dest.psn,
+		       data.my_dest.rkey, data.my_dest.vaddr);
+
+		printf("%d: Remote address: LID %#04x, QPN %#06x, PSN %#06x, "
+		       "RKey %#08x VAddr %#016Lx\n\n", pid,
+		       data.rem_dest->lid, data.rem_dest->qpn, data.rem_dest->psn,
+		       data.rem_dest->rkey, data.rem_dest->vaddr);
+
+		if (data.servername) {
+			pp_send_start(ctx);
+		} else {
+			pp_wait_for_start(ctx);
+		}
+
+	} else {
+		data.ib_dev = pp_find_dev(ib_devname);
+		if (!data.ib_dev)
+			return 7;
+
+		if (data.servername) {
+			ctx = pp_client_connect(&data);
+			if (!ctx)
+				return 8;
+		} else {
+			ctx = pp_server_connect(&data);
+			if (!ctx)
+				return 8;
+		}
+
+		if (pp_open_port(ctx, &data))
+			return 9;
+	}
+
+	/* Prepare the RDMA write work request reused for every iteration. */
+	wr = &ctx->wr;
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = ctx->size;
+	ctx->list.lkey = ctx->mr->lkey;
+	wr->wr.rdma.remote_addr = data.rem_dest->vaddr;
+	wr->wr.rdma.rkey = data.rem_dest->rkey;
+	ctx->wr.wr_id = PINGPONG_RDMA_WRID;
+	ctx->wr.sg_list = &ctx->list;
+	ctx->wr.num_sge = 1;
+	ctx->wr.opcode = IBV_WR_RDMA_WRITE;
+	if (ctx->size > inline_size || ctx->size == 0) {
+		ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	} else {
+		ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+	}
+	ctx->wr.next = NULL;
+
+	scnt = 0;
+	rcnt = 0;
+	ccnt = 0;
+	poll_buf = ctx->poll_buf;
+	post_buf = ctx->post_buf;
+	qp = ctx->qp;
+
+	tstamp = malloc(iters * sizeof *tstamp);
+	if (!tstamp) {
+		perror("malloc");
+		return 10;
+	}
+
+	/* Done with setup. Start the test. */
+
+	while (scnt < iters || ccnt < iters || rcnt < iters) {
+
+		/* Wait till buffer changes. */
+		/* The client sends first, so it skips the wait on its
+		 * very first pass. */
+		if (rcnt < iters && !(scnt < 1 && data.servername)) {
+			++rcnt;
+			while (*poll_buf != (char)rcnt)
+				;
+			/* Here the data is already in the physical memory.
+			   If we wanted to actually use it, we may need
+			   a read memory barrier here. */
+		}
+
+		if (scnt < iters) {
+			struct ibv_send_wr *bad_wr;
+			tstamp[scnt] = get_cycles();
+
+			*post_buf = (char)++scnt;
+			if (ibv_post_send(qp, wr, &bad_wr)) {
+				fprintf(stderr, "Couldn't post send: scnt=%d\n",
+					scnt);
+				return 11;
+			}
+		}
+
+		if (ccnt < iters) {
+			struct ibv_wc wc;
+			int ne;
+			++ccnt;
+			do {
+				ne = ibv_poll_cq(ctx->scq, 1, &wc);
+			} while (ne == 0);
+
+			if (ne < 0) {
+				fprintf(stderr, "poll CQ failed %d\n", ne);
+				return 12;
+			}
+			if (wc.status != IBV_WC_SUCCESS) {
+				/* The role is recorded in data.servername. */
+				fprintf(stderr, "Completion wth error at %s:\n",
+					data.servername ? "client" : "server");
+				fprintf(stderr, "Failed status %d: wr_id %d\n",
+					wc.status, (int) wc.wr_id);
+				fprintf(stderr, "scnt=%d, rcnt=%d, ccnt=%d\n",
+					scnt, rcnt, ccnt);
+				return 13;
+			}
+		}
+	}
+	if (data.use_cma) {
+		pp_send_done(ctx);
+		pp_wait_for_done(ctx);
+		pp_close_cma(data);
+	}
+
+	print_report(&report, iters, tstamp);
+	free(tstamp);
+	return 0;
+}
diff --git a/read_bw.c b/read_bw.c
new file mode 100755
index 0000000..2dd55ef
--- /dev/null
+++ b/read_bw.c
@@ -0,0 +1,1049 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+
+#include <infiniband/verbs.h>
+
+#include "get_clock.h"
+
+/* NOTE(review): the users of these four macros are outside this excerpt;
+ * PINGPONG_READ_WRID is presumably the wr_id of the RDMA read work
+ * requests and ALL/RC values for the `all`/`connection_type` fields
+ * below - confirm against the rest of the file. */
+#define PINGPONG_READ_WRID 1
+#define VERSION 1.1
+#define ALL 1
+#define RC 0
+
+/* Test parameters passed around between the setup functions. */
+struct user_parameters {
+ const char *servername;
+ int connection_type;
+ /* 0 means "not specified"; pp_init_ctx() then picks 1024 or 2048.
+ * pp_connect_ctx() maps 256/512/1024/2048/4096 to IBV_MTU_*. */
+ int mtu;
+ int all; /* run all msg size */
+ int iters;
+ int tx_depth;
+ /* Used as max_dest_rd_atomic when moving the QP to RTR. */
+ int max_out_read;
+ /* Non-zero: pp_init_ctx() creates a completion channel for the CQ. */
+ int use_event;
+ int qp_timeout;
+ int gid_index; /* if value not negative, we use gid AND gid_index=value */
+};
+/* Service level placed in the address handle when LID addressing is used. */
+static int sl = 0;
+/* Alignment handed to memalign() for the work buffer. */
+static int page_size;
+/* NOTE(review): presumably per-iteration post/completion timestamps;
+ * their users are outside this excerpt - confirm. */
+cycles_t *tposted;
+cycles_t *tcompleted;
+/* Verbs resources and buffer of one test endpoint, filled by pp_init_ctx(). */
+struct pingpong_context {
+ struct ibv_context *context; /* from ibv_open_device() */
+ struct ibv_comp_channel *channel; /* NULL unless use_event is set */
+ struct ibv_pd *pd;
+ struct ibv_mr *mr; /* registered over buf, 2 * size bytes */
+ struct ibv_cq *cq; /* used as both send and receive CQ of qp */
+ struct ibv_qp *qp; /* RC queue pair */
+ void *buf; /* page-aligned, zeroed, 2 * size bytes */
+ unsigned size;
+ int tx_depth; /* CQ depth and max_send_wr of qp */
+ /* NOTE(review): list/wr/dgid are not touched by the setup code in
+ * this excerpt; presumably the reusable scatter entry, work request
+ * and local GID - confirm against the rest of the file. */
+ struct ibv_sge list;
+ struct ibv_send_wr wr;
+ union ibv_gid dgid;
+};
+
+/*
+ * Addressing information of one endpoint.  It is exchanged with the peer
+ * as colon-separated hex text in the order lid:qpn:psn:rkey:vaddr
+ * followed by the 16 bytes of dgid.
+ */
+struct pingpong_dest {
+ int lid;
+ int qpn;
+ int psn;
+ unsigned rkey;
+ unsigned long long vaddr;
+ union ibv_gid dgid;
+};
+
+
+/*
+ * Query `port` of the context's device and return its LID.
+ * Returns 0 when the port query fails.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+	struct ibv_port_attr port_attr;
+	const int rc = ibv_query_port(ctx->context, port, &port_attr);
+
+	return rc ? 0 : port_attr.lid;
+}
+
+/*
+ * Open a TCP connection to servername:port, trying every address that
+ * getaddrinfo() returns until one connects.
+ *
+ * Returns the connected socket descriptor, or a negative value on
+ * failure (a message is printed to stderr).
+ */
+static int pp_client_connect(const char *servername, int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int n;
+	int sockfd = -1;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(servername, service, &hints, &res);
+	/* The service string is only needed for the lookup. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+		/* Never hand back a positive code that looks like an fd. */
+		return n < 0 ? n : -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0)
+		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+
+	return sockfd;
+}
+
+/*
+ * Parse a "lid:qpn:psn:rkey:vaddr:g0:...:g15" message (all fields hex)
+ * into *dest.  Every field must be present and, except for the last,
+ * followed by ':'.
+ *
+ * Returns 0 on success, -1 if the message is malformed.
+ */
+static int pp_client_parse_gid_msg(const char *msg, struct pingpong_dest *dest)
+{
+	unsigned long long field[5 + 16];
+	const size_t nfields = sizeof field / sizeof field[0];
+	const char *p = msg;
+	char *end;
+	size_t i;
+
+	for (i = 0; i < nfields; ++i) {
+		field[i] = strtoull(p, &end, 16);
+		if (end == p)
+			return -1;
+		if (i + 1 < nfields) {
+			if (*end != ':')
+				return -1;
+			p = end + 1;
+		}
+	}
+
+	dest->lid   = (int)field[0];
+	dest->qpn   = (int)field[1];
+	dest->psn   = (int)field[2];
+	dest->rkey  = (unsigned)field[3];
+	dest->vaddr = field[4];
+	for (i = 0; i < 16; ++i)
+		dest->dgid.raw[i] = (unsigned char)field[5 + i];
+
+	return 0;
+}
+
+/*
+ * Client half of the address exchange: send our destination as text over
+ * sockfd, then read and parse the server's reply.
+ *
+ * The GID bytes of the reply are only parsed when user_parm->gid_index
+ * is non-negative.
+ *
+ * Returns a malloc()ed remote destination that the caller must free, or
+ * NULL on I/O, allocation or parse failure.
+ */
+struct pingpong_dest * pp_client_exch_dest(int sockfd,
+		const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	struct pingpong_dest *rem_dest = NULL;
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"];
+	int parsed;
+
+	/* snprintf() keeps an out-of-range field from overrunning msg. */
+	snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		 my_dest->lid, my_dest->qpn, my_dest->psn,my_dest->rkey,my_dest->vaddr,
+		 my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2],
+		 my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5],
+		 my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8],
+		 my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+		 my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14],
+		 my_dest->dgid.raw[15]);
+	if (write(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client write");
+		fprintf(stderr, "Couldn't send local address\n");
+		goto out;
+	}
+
+	if (read(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client read");
+		fprintf(stderr, "Couldn't read remote address\n");
+		goto out;
+	}
+	/* The peer's bytes are untrusted: make sure msg is a C string. */
+	msg[sizeof msg - 1] = '\0';
+
+	rem_dest = malloc(sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	if (user_parm->gid_index < 0) {
+		parsed = sscanf(msg, "%x:%x:%x:%x:%llx", &rem_dest->lid, &rem_dest->qpn,
+				&rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr);
+		if (parsed != 5) {
+			fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+			free(rem_dest);
+			rem_dest = NULL;
+			goto out;
+		}
+	} else if (pp_client_parse_gid_msg(msg, rem_dest)) {
+		fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+		free(rem_dest);
+		rem_dest = NULL;
+	}
+out:
+	return rem_dest;
+}
+
+/*
+ * Listen on `port` (first address from getaddrinfo() that can be bound)
+ * and accept exactly one TCP connection.  The listening socket is closed
+ * before returning.
+ *
+ * Returns the connected socket descriptor, or a negative value on
+ * failure (a message is printed to stderr).
+ */
+int pp_server_connect(int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags    = AI_PASSIVE,
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int sockfd = -1, connfd;
+	int n;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+	/* The service string is only needed for the lookup. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		/* Never hand back a positive code that looks like an fd. */
+		return n < 0 ? n : -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			n = 1;
+
+			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return sockfd;
+	}
+
+	if (listen(sockfd, 1) < 0) {
+		perror("server listen");
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		close(sockfd);
+		return -1;
+	}
+
+	connfd = accept(sockfd, NULL, 0);
+	if (connfd < 0) {
+		perror("server accept");
+		fprintf(stderr, "accept() failed\n");
+	}
+
+	/* Only one peer is served, so the listener is no longer needed. */
+	close(sockfd);
+	return connfd;
+}
+
+/*
+ * Parse a "lid:qpn:psn:rkey:vaddr:g0:...:g15" message (all fields hex)
+ * into *dest.  Every field must be present and, except for the last,
+ * followed by ':'.
+ *
+ * Returns 0 on success, -1 if the message is malformed.
+ */
+static int pp_server_parse_gid_msg(const char *msg, struct pingpong_dest *dest)
+{
+	unsigned long long field[5 + 16];
+	const size_t nfields = sizeof field / sizeof field[0];
+	const char *p = msg;
+	char *end;
+	size_t i;
+
+	for (i = 0; i < nfields; ++i) {
+		field[i] = strtoull(p, &end, 16);
+		if (end == p)
+			return -1;
+		if (i + 1 < nfields) {
+			if (*end != ':')
+				return -1;
+			p = end + 1;
+		}
+	}
+
+	dest->lid   = (int)field[0];
+	dest->qpn   = (int)field[1];
+	dest->psn   = (int)field[2];
+	dest->rkey  = (unsigned)field[3];
+	dest->vaddr = field[4];
+	for (i = 0; i < 16; ++i)
+		dest->dgid.raw[i] = (unsigned char)field[5 + i];
+
+	return 0;
+}
+
+/*
+ * Server half of the address exchange: read and parse the client's
+ * destination from connfd, then reply with our own as text.
+ *
+ * The GID bytes of the request are only parsed when
+ * user_parm->gid_index is non-negative.
+ *
+ * Returns a malloc()ed remote destination that the caller must free, or
+ * NULL on I/O, allocation or parse failure.
+ */
+static struct pingpong_dest *pp_server_exch_dest(int connfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"];
+	struct pingpong_dest *rem_dest = NULL;
+	int parsed;
+	int n;
+
+	n = read(connfd, msg, sizeof msg);
+	if (n != sizeof msg) {
+		perror("server read");
+		fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg);
+		goto out;
+	}
+	/* The peer's bytes are untrusted: make sure msg is a C string. */
+	msg[sizeof msg - 1] = '\0';
+
+	rem_dest = malloc(sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	if (user_parm->gid_index < 0) {
+		parsed = sscanf(msg, "%x:%x:%x:%x:%llx", &rem_dest->lid, &rem_dest->qpn,
+				&rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr);
+		if (parsed != 5) {
+			fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+			free(rem_dest);
+			rem_dest = NULL;
+			goto out;
+		}
+	} else if (pp_server_parse_gid_msg(msg, rem_dest)) {
+		fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+
+	/* snprintf() keeps an out-of-range field from overrunning msg. */
+	snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		 my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr,
+		 my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2],
+		 my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5],
+		 my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8],
+		 my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+		 my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14],
+		 my_dest->dgid.raw[15]);
+	if (write(connfd, msg, sizeof msg) != sizeof msg) {
+		perror("server write");
+		fprintf(stderr, "Couldn't send local address\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+out:
+	return rem_dest;
+}
+
+/*
+ * Allocate and initialise a read-bandwidth test context on ib_dev.
+ *
+ * size      - message size; the work buffer is 2 * size bytes.
+ * tx_depth  - CQ depth and send queue depth.
+ * port      - device port the QP is bound to.
+ * user_parm - test parameters; if user_parm->mtu is 0 it is set here to
+ *             1024 (vendor part 23108 or GID addressing) or 2048.
+ *
+ * Creates an optional completion channel (use_event), a PD, an MR with
+ * local-write/remote-write/remote-read access, one CQ shared by send and
+ * receive, and an RC QP moved to INIT.
+ *
+ * Returns the context, or NULL on failure.  On failure everything that
+ * was acquired here is released again.
+ */
+static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev,
+					    unsigned size,
+					    int tx_depth, int port,
+					    struct user_parameters *user_parm)
+{
+	struct pingpong_context *ctx;
+	struct ibv_device_attr device_attr;
+
+	ctx = malloc(sizeof *ctx);
+	if (!ctx)
+		return NULL;
+
+	ctx->size = size;
+	ctx->tx_depth = tx_depth;
+
+	ctx->buf = memalign(page_size, size * 2);
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		goto err_ctx;
+	}
+
+	memset(ctx->buf, 0, size * 2);
+
+	ctx->context = ibv_open_device(ib_dev);
+	if (!ctx->context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		goto err_buf;
+	}
+	if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
+		if (ibv_query_device(ctx->context, &device_attr)) {
+			fprintf(stderr, "Failed to query device props\n");
+			goto err_context;
+		}
+		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1)
+			user_parm->mtu = 1024;
+		else
+			user_parm->mtu = 2048;
+	}
+	if (user_parm->use_event) {
+		ctx->channel = ibv_create_comp_channel(ctx->context);
+		if (!ctx->channel) {
+			fprintf(stderr, "Couldn't create completion channel\n");
+			goto err_context;
+		}
+	} else
+		ctx->channel = NULL;
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		goto err_channel;
+	}
+
+	/* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
+	 * The Consumer is not allowed to assign Remote Write or Remote Atomic to
+	 * a Memory Region that has not been assigned Local Write. */
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ);
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't allocate MR\n");
+		goto err_pd;
+	}
+
+	ctx->cq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0);
+	if (!ctx->cq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		goto err_mr;
+	}
+
+	{
+		struct ibv_qp_init_attr attr;
+		memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
+		attr.send_cq = ctx->cq;
+		attr.recv_cq = ctx->cq;
+		attr.cap.max_send_wr = tx_depth;
+		/* Work around: driver doesnt support
+		 * recv_wr = 0 */
+		attr.cap.max_recv_wr = 1;
+		attr.cap.max_send_sge = 1;
+		attr.cap.max_recv_sge = 1;
+		attr.qp_type = IBV_QPT_RC;
+		ctx->qp = ibv_create_qp(ctx->pd, &attr);
+		if (!ctx->qp) {
+			fprintf(stderr, "Couldn't create QP\n");
+			goto err_cq;
+		}
+	}
+
+	{
+		struct ibv_qp_attr attr;
+
+		attr.qp_state = IBV_QPS_INIT;
+		attr.pkey_index = 0;
+		attr.port_num = port;
+		attr.qp_access_flags = IBV_ACCESS_REMOTE_READ;
+
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_PKEY_INDEX |
+				  IBV_QP_PORT |
+				  IBV_QP_ACCESS_FLAGS)) {
+			fprintf(stderr, "Failed to modify QP to INIT\n");
+			goto err_qp;
+		}
+	}
+
+	return ctx;
+
+	/* Unwind in reverse order of acquisition. */
+err_qp:
+	ibv_destroy_qp(ctx->qp);
+err_cq:
+	ibv_destroy_cq(ctx->cq);
+err_mr:
+	ibv_dereg_mr(ctx->mr);
+err_pd:
+	ibv_dealloc_pd(ctx->pd);
+err_channel:
+	if (ctx->channel)
+		ibv_destroy_comp_channel(ctx->channel);
+err_context:
+	ibv_close_device(ctx->context);
+err_buf:
+	free(ctx->buf);
+err_ctx:
+	free(ctx);
+	return NULL;
+}
+/*
+ * Connect the local QP to the remote one described by dest.
+ *
+ * Moves ctx->qp from INIT to RTR (path MTU from user_parm->mtu, remote QPN
+ * and PSN from dest, LID- or GID-based address vector depending on
+ * user_parm->gid_index) and then to RTS using my_psn as the send PSN.
+ *
+ * Returns 0 on success and 1 on failure, including an MTU value that is
+ * not one of 256, 512, 1024, 2048 or 4096.
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
+			  struct pingpong_dest *dest, struct user_parameters *user_parm)
+{
+	struct ibv_qp_attr attr;
+
+	memset(&attr, 0, sizeof attr);
+
+	attr.qp_state = IBV_QPS_RTR;
+	switch (user_parm->mtu) {
+	case 256 :
+		attr.path_mtu = IBV_MTU_256;
+		break;
+	case 512 :
+		attr.path_mtu = IBV_MTU_512;
+		break;
+	case 1024 :
+		attr.path_mtu = IBV_MTU_1024;
+		break;
+	case 2048 :
+		attr.path_mtu = IBV_MTU_2048;
+		break;
+	case 4096 :
+		attr.path_mtu = IBV_MTU_4096;
+		break;
+	default:
+		/* Previously path_mtu silently stayed 0 and the RTR
+		 * transition failed with an unrelated message. */
+		fprintf(stderr, "Invalid MTU %d: must be 256, 512, 1024, 2048 or 4096\n",
+			user_parm->mtu);
+		return 1;
+	}
+	printf("Mtu : %d\n", user_parm->mtu);
+
+	attr.dest_qp_num        = dest->qpn;
+	attr.rq_psn             = dest->psn;
+	attr.max_dest_rd_atomic = user_parm->max_out_read;
+	attr.min_rnr_timer      = 12;
+	if (user_parm->gid_index < 0) {
+		/* LID routed: the service level comes from the -S option. */
+		attr.ah_attr.is_global = 0;
+		attr.ah_attr.dlid      = dest->lid;
+		attr.ah_attr.sl        = sl;
+	} else {
+		/* GID routed: address the peer by the GID it sent us. */
+		attr.ah_attr.is_global     = 1;
+		attr.ah_attr.grh.dgid      = dest->dgid;
+		attr.ah_attr.grh.hop_limit = 1;
+		attr.ah_attr.sl            = 0;
+	}
+	attr.ah_attr.src_path_bits = 0;
+	attr.ah_attr.port_num      = port;
+
+	if (ibv_modify_qp(ctx->qp, &attr,
+			  IBV_QP_STATE |
+			  IBV_QP_AV |
+			  IBV_QP_PATH_MTU |
+			  IBV_QP_DEST_QPN |
+			  IBV_QP_RQ_PSN |
+			  IBV_QP_MIN_RNR_TIMER |
+			  IBV_QP_MAX_DEST_RD_ATOMIC)) {
+		fprintf(stderr, "Failed to modify RC QP to RTR\n");
+		return 1;
+	}
+
+	attr.timeout       = user_parm->qp_timeout;
+	attr.retry_cnt     = 7;
+	attr.rnr_retry     = 7;
+	attr.qp_state      = IBV_QPS_RTS;
+	attr.sq_psn        = my_psn;
+	attr.max_rd_atomic = user_parm->max_out_read;
+
+	if (ibv_modify_qp(ctx->qp, &attr,
+			  IBV_QP_STATE |
+			  IBV_QP_SQ_PSN |
+			  IBV_QP_TIMEOUT |
+			  IBV_QP_RETRY_CNT |
+			  IBV_QP_RNR_RETRY |
+			  IBV_QP_MAX_QP_RD_ATOMIC)) {
+		fprintf(stderr, "Failed to modify RC QP to RTS\n");
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Print the command-line synopsis and the list of supported options to
+ * stdout.  argv0 is the program name shown in the two synopsis lines.
+ */
+static void usage(const char *argv0)
+{
+	/* One entry per option, emitted verbatim and in this order. */
+	static const char *const option_help[] = {
+		" -p, --port=<port> listen on/connect to port <port> (default 18515)\n",
+		" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n",
+		" -i, --ib-port=<port> use port <port> of IB device (default 1)\n",
+		" -m, --mtu=<mtu> mtu size (256 - 4096. default for hermon is 2048)\n",
+		" -o, --outs=<num> num of outstanding read/atom(default 4)\n",
+		" -s, --size=<size> size of message to exchange (default 65536)\n",
+		" -a, --all Run sizes from 2 till 2^23\n",
+		" -t, --tx-depth=<dep> size of tx queue (default 100)\n",
+		" -n, --iters=<iters> number of exchanges (at least 2, default 1000)\n",
+		" -u, --qp-timeout=<timeout> QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n",
+		" -S, --sl=<sl> SL (default 0)\n",
+		" -x, --gid-index=<index> test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n",
+		" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n",
+		" -V, --version display version number\n",
+		" -e, --events sleep on CQ events (default poll)\n",
+		" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n",
+	};
+	size_t k;
+
+	fputs("Usage:\n", stdout);
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf(" %s <host> connect to server at <host>\n", argv0);
+	fputs("\n", stdout);
+	fputs("Options:\n", stdout);
+
+	for (k = 0; k < sizeof option_help / sizeof option_help[0]; ++k)
+		fputs(option_help[k], stdout);
+}
+
+/*
+ * Print one result line: message size, iteration count, peak bandwidth and
+ * average bandwidth.
+ *
+ * tposted[k] / tcompleted[k] hold the cycle counter sampled when work
+ * request k was posted / when completion k was polled.  The peak figure is
+ * taken from the window [i, j] with the smallest cycles-per-message value;
+ * the average covers tposted[0] .. tcompleted[iters - 1].  Both figures are
+ * divided by 0x100000 after scaling by get_cpu_mhz() * 1000000; the caller
+ * prints the "MB/sec" column header.  duplex doubles the byte count per
+ * iteration.
+ *
+ * The search is O(iters^2), as in the original.
+ */
+static void print_report(unsigned int iters, unsigned size, int duplex,
+			 cycles_t *tposted, cycles_t *tcompleted, int no_cpu_freq_fail)
+{
+	double cycles_to_units;
+	unsigned long tsize;	/* bytes moved per iteration */
+	unsigned int i, j;
+	cycles_t opt_delta;
+	cycles_t t;
+
+	/* Start from the first message on its own, then look for better. */
+	opt_delta = tcompleted[0] - tposted[0];
+
+	/* Find the peak bandwidth */
+	for (i = 0; i < iters; ++i)
+		for (j = i; j < iters; ++j) {
+			t = (tcompleted[j] - tposted[i]) / (j - i + 1);
+			if (t < opt_delta)
+				opt_delta = t;
+		}
+
+	cycles_to_units = get_cpu_mhz(no_cpu_freq_fail) * 1000000;
+
+	tsize = duplex ? 2 : 1;
+	tsize = tsize * size;
+
+	/* size and iters are unsigned, so print them with %u. */
+	printf("%7u %u %7.2f %7.2f\n",
+	       size, iters, tsize * cycles_to_units / opt_delta / 0x100000,
+	       tsize * iters * cycles_to_units / (tcompleted[iters - 1] - tposted[0]) / 0x100000);
+}
+/*
+ * Run one bandwidth measurement with messages of 'size' bytes.
+ *
+ * Posts user_param->iters signaled RDMA READ work requests that read from
+ * rem_dest->vaddr into ctx->buf, keeping at most user_param->tx_depth of
+ * them outstanding, and polls the CQ until the same number of completions
+ * has been seen.  The cycle counter is stored in the file-level tposted[]
+ * array before each post and in tcompleted[] for each completion.
+ *
+ * With user_param->use_event set, the function blocks on the completion
+ * channel before each polling round, acknowledges the event and re-arms the
+ * CQ notification.
+ *
+ * Returns 0 on success and 1 on any verbs error or failed completion.
+ */
+int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param,
+	     struct pingpong_dest *rem_dest, int size)
+{
+	struct ibv_qp *qp;
+	int scnt, ccnt;
+
+	ctx->list.addr   = (uintptr_t) ctx->buf;
+	ctx->list.length = size;
+	ctx->list.lkey   = ctx->mr->lkey;
+
+	ctx->wr.wr.rdma.remote_addr = rem_dest->vaddr;
+	ctx->wr.wr.rdma.rkey        = rem_dest->rkey;
+	ctx->wr.wr_id      = PINGPONG_READ_WRID;
+	ctx->wr.sg_list    = &ctx->list;
+	ctx->wr.num_sge    = 1;
+	ctx->wr.opcode     = IBV_WR_RDMA_READ;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next       = NULL;
+
+	scnt = 0;
+	ccnt = 0;
+
+	qp = ctx->qp;
+
+	/* Done with setup. Start the test. */
+	while (scnt < user_param->iters || ccnt < user_param->iters) {
+		/* Fill the send queue up to tx_depth outstanding requests. */
+		while (scnt < user_param->iters && (scnt - ccnt) < user_param->tx_depth) {
+			struct ibv_send_wr *bad_wr;
+
+			tposted[scnt] = get_cycles();
+			if (ibv_post_send(qp, &ctx->wr, &bad_wr)) {
+				fprintf(stderr, "Couldn't post send: scnt=%d\n",
+					scnt);
+				return 1;
+			}
+			++scnt;
+		}
+
+		if (ccnt < user_param->iters) {
+			struct ibv_wc wc;
+			int ne;
+
+			if (user_param->use_event) {
+				struct ibv_cq *ev_cq;
+				void *ev_ctx;
+
+				if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
+					fprintf(stderr, "Failed to get cq_event\n");
+					return 1;
+				}
+				/* Every event obtained with ibv_get_cq_event()
+				 * has to be acknowledged. */
+				ibv_ack_cq_events(ev_cq, 1);
+				if (ev_cq != ctx->cq) {
+					fprintf(stderr, "CQ event for unknown CQ %p\n", (void *) ev_cq);
+					return 1;
+				}
+				if (ibv_req_notify_cq(ctx->cq, 0)) {
+					fprintf(stderr, "Couldn't request CQ notification\n");
+					return 1;
+				}
+			}
+
+			/* Drain the CQ one completion at a time. */
+			do {
+				ne = ibv_poll_cq(ctx->cq, 1, &wc);
+				if (ne) {
+					tcompleted[ccnt] = get_cycles();
+					if (wc.status != IBV_WC_SUCCESS) {
+						fprintf(stderr, "Completion wth error at %s:\n",
+							user_param->servername ? "client" : "server");
+						fprintf(stderr, "Failed status %d: wr_id %d syndrom 0x%x\n",
+							wc.status, (int) wc.wr_id, wc.vendor_err);
+						fprintf(stderr, "scnt=%d, ccnt=%d\n",
+							scnt, ccnt);
+						return 1;
+					}
+					ccnt = ccnt + ne;
+				}
+			} while (ne > 0);
+
+			if (ne < 0) {
+				fprintf(stderr, "poll CQ failed %d\n", ne);
+				return 1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * read_bw entry point: parse the command line, set up the verbs resources,
+ * exchange addresses with the peer over a TCP socket, run the RDMA read
+ * bandwidth measurement and print the results.
+ *
+ * Without a positional <host> argument the process acts as the server; with
+ * one it connects to that host as the client.  In a unidirectional test the
+ * server only performs the handshakes and then waits for the client.
+ *
+ * Returns 0 on success and a non-zero status on any failure.
+ */
+int main(int argc, char *argv[])
+{
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev;
+	struct pingpong_context *ctx;
+	struct pingpong_dest my_dest;
+	struct pingpong_dest *rem_dest;
+	struct user_parameters user_param;
+	char *ib_devname = NULL;
+	int port = 18515;
+	int ib_port = 1;
+	long long size = 65536;
+	int sockfd;
+	int duplex = 0;
+	int i = 0;
+	int no_cpu_freq_fail = 0;
+	union ibv_gid gid;
+
+	/* gid is copied into my_dest even when no GID index was requested,
+	 * so it must never be indeterminate. */
+	memset(&gid, 0, sizeof gid);
+
+	/* init default values to user's parameters */
+	memset(&user_param, 0, sizeof(struct user_parameters));
+	user_param.mtu = 0;
+	user_param.iters = 1000;
+	user_param.tx_depth = 100;
+	user_param.servername = NULL;
+	user_param.use_event = 0;
+	user_param.max_out_read = 4; /* the device capability on gen2 */
+	user_param.qp_timeout = 14;
+	user_param.gid_index = -1; /*gid will not be used*/
+
+	/* Parameter parsing. */
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "mtu", .has_arg = 1, .val = 'm' },
+			{ .name = "outs", .has_arg = 1, .val = 'o' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "tx-depth", .has_arg = 1, .val = 't' },
+			{ .name = "qp-timeout", .has_arg = 1, .val = 'u' },
+			{ .name = "sl", .has_arg = 1, .val = 'S' },
+			{ .name = "gid-index", .has_arg = 1, .val = 'x' },
+			{ .name = "all", .has_arg = 0, .val = 'a' },
+			{ .name = "bidirectional", .has_arg = 0, .val = 'b' },
+			{ .name = "version", .has_arg = 0, .val = 'V' },
+			{ .name = "events", .has_arg = 0, .val = 'e' },
+			{ .name = "CPU-freq", .has_arg = 0, .val = 'F' },
+			{ 0 }
+		};
+
+		c = getopt_long(argc, argv, "p:d:i:m:o:s:n:t:u:S:x:abVeF", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			port = strtol(optarg, NULL, 0);
+			if (port < 0 || port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+
+		case 'e':
+			++user_param.use_event;
+			break;
+
+		case 'm':
+			user_param.mtu = strtol(optarg, NULL, 0);
+			break;
+
+		case 'o':
+			user_param.max_out_read = strtol(optarg, NULL, 0);
+			break;
+
+		case 'a':
+			user_param.all = ALL;
+			break;
+
+		case 'V':
+			printf("read_bw version : %.2f\n",VERSION);
+			return 0;
+
+		case 'i':
+			ib_port = strtol(optarg, NULL, 0);
+			if (ib_port < 0) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 's':
+			size = strtoll(optarg, NULL, 0);
+			if (size < 1 || size > UINT_MAX / 2) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 't':
+			user_param.tx_depth = strtol(optarg, NULL, 0);
+			if (user_param.tx_depth < 1) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'n':
+			user_param.iters = strtol(optarg, NULL, 0);
+			if (user_param.iters < 2) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'b':
+			duplex = 1;
+			break;
+
+		case 'F':
+			no_cpu_freq_fail = 1;
+			break;
+
+		case 'u':
+			user_param.qp_timeout = strtol(optarg, NULL, 0);
+			break;
+
+		case 'S':
+			sl = strtol(optarg, NULL, 0);
+			/* Reject negative values as well as values above 15. */
+			if (sl < 0 || sl > 15) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'x':
+			user_param.gid_index = strtol(optarg, NULL, 0);
+			/* Negative values mean "no GID" everywhere else in
+			 * this file, so do not accept them as an index. */
+			if (user_param.gid_index < 0 || user_param.gid_index > 63) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (optind == argc - 1)
+		user_param.servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	printf("------------------------------------------------------------------\n");
+	if (duplex == 1)
+		printf(" RDMA_Read Bidirectional BW Test\n");
+	else
+		printf(" RDMA_Read BW Test\n");
+
+	printf("Connection type : RC\n");
+	if (user_param.gid_index > -1) {
+		printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n");
+	}
+
+	/* Done with parameter parsing. Perform setup. */
+	if (user_param.all == ALL)
+		/*since we run all sizes */
+		size = 8388608; /*2^23 */
+
+	srand48(getpid() * time(NULL));
+
+	page_size = sysconf(_SC_PAGESIZE);
+
+	dev_list = ibv_get_device_list(NULL);
+	if (!dev_list) {
+		/* The list pointer itself can be NULL on failure. */
+		fprintf(stderr, "Failed to get IB devices list\n");
+		return 1;
+	}
+
+	if (!ib_devname) {
+		ib_dev = dev_list[0];
+		if (!ib_dev) {
+			fprintf(stderr, "No IB devices found\n");
+			return 1;
+		}
+	} else {
+		for (; (ib_dev = *dev_list); ++dev_list)
+			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+				break;
+		if (!ib_dev) {
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+			return 1;
+		}
+	}
+
+	ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port, &user_param);
+	if (!ctx)
+		return 1;
+
+	if (user_param.gid_index > -1) {
+		int err = 0;
+
+		err = ibv_query_gid(ctx->context, ib_port, user_param.gid_index, &gid);
+		if (err) {
+			fprintf(stderr, "Couldn't query GID index %d on port %d\n",
+				user_param.gid_index, ib_port);
+			return -1;
+		}
+		ctx->dgid = gid;
+	}
+
+	/* Create connection between client and server.
+	 * We do it by exchanging data over a TCP socket connection. */
+
+	my_dest.lid = pp_get_local_lid(ctx, ib_port);
+	my_dest.qpn = ctx->qp->qp_num;
+	my_dest.psn = lrand48() & 0xffffff;
+	if (user_param.gid_index < 0) {/*We do not fail test upon lid in RDMA0E/Eth conf*/
+		if (!my_dest.lid) {
+			fprintf(stderr, "Local lid 0x0 detected. Is an SM running? If you are running on an RMDAoE interface you must use GIDs\n");
+			return 1;
+		}
+	}
+	my_dest.dgid = gid;
+	my_dest.rkey = ctx->mr->rkey;
+	/* The peer reads from the second half of the work buffer. */
+	my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+	printf(" local address: LID %#04x, QPN %#06x, PSN %#06x "
+	       "RKey %#08x VAddr %#016Lx\n",
+	       my_dest.lid, my_dest.qpn, my_dest.psn,
+	       my_dest.rkey, my_dest.vaddr);
+	if (user_param.gid_index > -1) {
+		printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+		       my_dest.dgid.raw[0],my_dest.dgid.raw[1],
+		       my_dest.dgid.raw[2], my_dest.dgid.raw[3], my_dest.dgid.raw[4],
+		       my_dest.dgid.raw[5], my_dest.dgid.raw[6], my_dest.dgid.raw[7],
+		       my_dest.dgid.raw[8], my_dest.dgid.raw[9], my_dest.dgid.raw[10],
+		       my_dest.dgid.raw[11], my_dest.dgid.raw[12], my_dest.dgid.raw[13],
+		       my_dest.dgid.raw[14], my_dest.dgid.raw[15]);
+	}
+
+	if (user_param.servername) {
+		sockfd = pp_client_connect(user_param.servername, port);
+		if (sockfd < 0)
+			return 1;
+		rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param);
+	} else {
+		sockfd = pp_server_connect(port);
+		if (sockfd < 0)
+			return 1;
+		rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param);
+	}
+
+	if (!rem_dest)
+		return 1;
+
+	printf(" remote address: LID %#04x, QPN %#06x, PSN %#06x, "
+	       "RKey %#08x VAddr %#016Lx\n",
+	       rem_dest->lid, rem_dest->qpn, rem_dest->psn,
+	       rem_dest->rkey, rem_dest->vaddr);
+	if (user_param.gid_index > -1) {
+		printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+		       rem_dest->dgid.raw[0],rem_dest->dgid.raw[1],
+		       rem_dest->dgid.raw[2], rem_dest->dgid.raw[3], rem_dest->dgid.raw[4],
+		       rem_dest->dgid.raw[5], rem_dest->dgid.raw[6], rem_dest->dgid.raw[7],
+		       rem_dest->dgid.raw[8], rem_dest->dgid.raw[9], rem_dest->dgid.raw[10],
+		       rem_dest->dgid.raw[11], rem_dest->dgid.raw[12], rem_dest->dgid.raw[13],
+		       rem_dest->dgid.raw[14], rem_dest->dgid.raw[15]);
+	}
+
+	if (pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest, &user_param))
+		return 1;
+
+	/* An additional handshake is required *after* moving qp to RTR.
+	   Arbitrarily reuse exch_dest for this purpose. */
+	if (user_param.servername)
+		rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param);
+	else
+		rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param);
+
+	if (!rem_dest)
+		return 1;
+
+	/* For half duplex tests, server just waits for client to exit */
+
+	if (!user_param.servername && !duplex) {
+		rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param);
+		if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+			perror("server write");
+			fprintf(stderr, "Couldn't write to socket\n");
+			return 1;
+		}
+		close(sockfd);
+		return 0;
+	} else if (user_param.use_event) {
+		printf("Test with events.\n");
+		if (ibv_req_notify_cq(ctx->cq, 0)) {
+			fprintf(stderr, "Couldn't request CQ notification\n");
+			return 1;
+		}
+	}
+
+	printf("------------------------------------------------------------------\n");
+	printf(" #bytes #iterations BW peak[MB/sec] BW average[MB/sec] \n");
+
+	tposted = malloc(user_param.iters * sizeof *tposted);
+
+	if (!tposted) {
+		perror("malloc");
+		return 1;
+	}
+
+	tcompleted = malloc(user_param.iters * sizeof *tcompleted);
+
+	if (!tcompleted) {
+		perror("malloc");
+		return 1;
+	}
+
+	if (user_param.all == ALL) {
+		/* Message sizes 2, 4, ... 2^23. */
+		for (i = 1; i < 24 ; ++i) {
+			size = 1 << i;
+			if (run_iter(ctx, &user_param, rem_dest, size))
+				return 17;
+			print_report(user_param.iters, size, duplex, tposted, tcompleted, no_cpu_freq_fail);
+		}
+	} else {
+		if (run_iter(ctx, &user_param, rem_dest, size))
+			return 18;
+		print_report(user_param.iters, size, duplex, tposted, tcompleted, no_cpu_freq_fail);
+	}
+
+	/* Final handshake so that neither side exits while the other is
+	 * still reading. */
+	if (user_param.servername)
+		rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param);
+	else
+		rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param);
+
+	if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+		perror("server write");
+		fprintf(stderr, "Couldn't write to socket\n");
+		return 1;
+	}
+	close(sockfd);
+
+	free(tposted);
+	free(tcompleted);
+
+	printf("------------------------------------------------------------------\n");
+	return 0;
+}
diff --git a/read_lat.c b/read_lat.c
new file mode 100755
index 0000000..73d9eb1
--- /dev/null
+++ b/read_lat.c
@@ -0,0 +1,1108 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler)
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+
+#include <infiniband/verbs.h>
+
+#include "get_clock.h"
+
+#define PINGPONG_READ_WRID 1 /* wr_id that pp_init_ctx stores in the RDMA READ work request */
+#define VERSION 1.1 /* NOTE(review): presumably the value shown by a version option - its use is outside this excerpt */
+#define ALL 1 /* NOTE(review): presumably the value of user_parameters.all that selects every message size - confirm in main() */
+static int sl = 0; /* service level; pp_connect_ctx copies it into ah_attr.sl when no GID is used */
+static int page_size; /* alignment passed to memalign() in pp_init_ctx; assigned outside this excerpt */
+cycles_t *tstamp; /* NOTE(review): presumably the per-iteration timestamp array - allocation and use are outside this excerpt */
+struct pingpong_dest my_dest; /* local connection parameters, filled in by pp_open_port */
+struct user_parameters { /* test options consulted by the setup and connection helpers */
+ const char *servername; /* NOTE(review): presumably the peer host on the client and NULL on the server - confirm in main() */
+ int connection_type; /* 1 makes pp_init_ctx create a UC QP, any other value an RC QP */
+ int mtu; /* path MTU in bytes; 0 lets pp_init_ctx choose 1024 or 2048 */
+ int all; /* run all msg size */
+ int iters; /* NOTE(review): presumably the number of iterations - not used in this excerpt */
+ int tx_depth; /* NOTE(review): presumably the send queue depth; pp_init_ctx takes it as a separate argument */
+ int sockfd; /* NOTE(review): not used in this excerpt */
+ int max_out_read; /* used for both max_dest_rd_atomic and max_rd_atomic of the QP */
+ int use_event; /* non-zero: pp_init_ctx creates a completion channel for the CQ */
+ int qp_timeout; /* copied into the QP 'timeout' attribute before the RTS transition */
+ int gid_index; /* if value not negative, we use gid AND gid_index=value */
+
+};
+struct report_options { /* NOTE(review): presumably flags that shape the latency report - the consumer is outside this excerpt */
+ int unsorted; /* NOTE(review): presumably requests the unsorted sample list - confirm */
+ int histogram; /* NOTE(review): presumably requests a histogram of the samples - confirm */
+ int cycles; /* report delta's in cycles, not microsec's */
+};
+
+struct pingpong_context { /* all verbs objects and buffers of one test endpoint, built by pp_init_ctx */
+ struct ibv_context *context; /* device context returned by ibv_open_device */
+ struct ibv_comp_channel *channel; /* completion channel, or NULL when use_event is 0 */
+ struct ibv_pd *pd; /* protection domain owning mr and qp */
+ struct ibv_mr *mr; /* registration of buf (2 * size bytes) */
+ struct ibv_cq *cq; /* CQ used for both send and receive completions of qp */
+ struct ibv_qp *qp; /* the queue pair under test */
+ void *buf; /* page-aligned, zeroed work buffer of 2 * size bytes */
+ volatile char *post_buf; /* last byte of the first half of buf (buf + size - 1) */
+ volatile char *poll_buf; /* last byte of the second half of buf (buf + 2 * size - 1) */
+ int size; /* size of one half of buf, in bytes */
+ int tx_depth; /* value given to the CQ size and max_send_wr */
+ struct ibv_sge list; /* scatter/gather entry referenced by wr.sg_list */
+ struct ibv_send_wr wr; /* signaled RDMA READ work request prepared by pp_init_ctx */
+ union ibv_gid dgid; /* local GID obtained with ibv_query_gid when a GID index is used */
+};
+
+struct pingpong_dest { /* connection parameters of one side, exchanged as text by pp_write_keys/pp_read_keys */
+ int lid; /* port LID (0 if ibv_query_port failed, see pp_get_local_lid) */
+ int qpn; /* queue pair number */
+ int psn; /* initial packet sequence number (24-bit random value on the local side) */
+ unsigned rkey; /* remote key of the registered buffer */
+ unsigned long long vaddr; /* address inside the registered buffer that the peer targets */
+ union ibv_gid dgid; /* GID, only transmitted when a GID index is in use */
+};
+
+
+/*
+ * Look up the LID of the given port on ctx's device.
+ * Returns the LID, or 0 when ibv_query_port() reports an error.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+	struct ibv_port_attr port_info;
+	int query_rc = ibv_query_port(ctx->context, port, &port_info);
+
+	return query_rc ? 0 : port_info.lid;
+}
+
+/*
+ * Select the IB device to use.
+ *
+ * With ib_devname == NULL the first device in the list is chosen, otherwise
+ * the device whose name equals ib_devname.  Returns NULL, after printing a
+ * message, when the device list cannot be obtained, is empty, or holds no
+ * device of that name.
+ *
+ * The device list is intentionally not freed: the returned pointer refers
+ * to an entry of that list.
+ */
+static struct ibv_device *pp_find_dev(const char *ib_devname) {
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev = NULL;
+
+	dev_list = ibv_get_device_list(NULL);
+	if (!dev_list) {
+		/* The list pointer itself can be NULL on failure. */
+		fprintf(stderr, "No IB devices found\n");
+		return NULL;
+	}
+
+	if (!ib_devname) {
+		ib_dev = dev_list[0];
+		if (!ib_dev)
+			fprintf(stderr, "No IB devices found\n");
+	} else {
+		for (; (ib_dev = *dev_list); ++dev_list)
+			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+				break;
+		if (!ib_dev)
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+	}
+	return ib_dev;
+}
+
+#define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000") /* bytes on the wire without GID, terminating NUL included */
+#define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016Lx" /* lid:qpn:psn:rkey:vaddr; NOTE(review): 'L' with an integer conversion is a glibc extension, ISO C spells it 'll' */
+#define KEY_MSG_SIZE_GID (sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00") /* same message followed by the 16 GID bytes, NUL included */
+#define KEY_PRINT_FMT_GID "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x" /* KEY_PRINT_FMT plus one two-digit hex field per GID byte */
+
+/*
+ * Send the local connection parameters to the peer as one fixed-size text
+ * message.
+ *
+ * Without a GID index (user_parm->gid_index < 0) the message is
+ * KEY_MSG_SIZE bytes in KEY_PRINT_FMT; otherwise it is KEY_MSG_SIZE_GID
+ * bytes in KEY_PRINT_FMT_GID and also carries the 16 GID bytes.  Both sizes
+ * include the terminating NUL.
+ *
+ * Returns 0 on success and -1 when the values do not fit the message or the
+ * write is incomplete.
+ */
+static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	/* Large enough for either layout; zeroed so that no stale stack
+	 * bytes can ever be sent. */
+	char msg[KEY_MSG_SIZE_GID] = { 0 };
+	size_t msg_len;
+	int n;
+
+	if (user_parm->gid_index < 0) {
+		msg_len = KEY_MSG_SIZE;
+		n = snprintf(msg, msg_len, KEY_PRINT_FMT, my_dest->lid, my_dest->qpn,
+			     my_dest->psn, my_dest->rkey, my_dest->vaddr);
+	} else {
+		msg_len = KEY_MSG_SIZE_GID;
+		n = snprintf(msg, msg_len, KEY_PRINT_FMT_GID, my_dest->lid, my_dest->qpn,
+			     my_dest->psn, my_dest->rkey, my_dest->vaddr,
+			     my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3],
+			     my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7],
+			     my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+			     my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]);
+	}
+
+	/* The buffers are sized for the minimum field widths; a wider value
+	 * used to overflow the stack buffer through sprintf(). */
+	if (n < 0 || (size_t)n >= msg_len) {
+		fprintf(stderr, "Local address does not fit the key message\n");
+		return -1;
+	}
+
+	if (write(sockfd, msg, msg_len) != (ssize_t)msg_len) {
+		perror("client write");
+		fprintf(stderr, "Couldn't send local address\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse one hexadecimal field of a key message.
+ *
+ * The field starts at *cursor and must be followed by 'delim' (pass '\0'
+ * for the last field of the message).  On success the value is stored in
+ * *value, *cursor is moved past the delimiter and 0 is returned; -1 is
+ * returned when no digits are present or the delimiter does not match.
+ */
+static int pp_parse_hex_field(const char **cursor, char delim,
+			      unsigned long long *value)
+{
+	char *end;
+
+	*value = strtoull(*cursor, &end, 16);
+	if (end == *cursor || *end != delim)
+		return -1;
+
+	*cursor = delim ? end + 1 : end;
+	return 0;
+}
+
+/*
+ * Receive the peer's connection parameters and store them in rem_dest.
+ *
+ * Reads KEY_MSG_SIZE bytes (no GID index) or KEY_MSG_SIZE_GID bytes (GID
+ * index in use) from sockfd and parses the colon-separated hexadecimal
+ * fields lid:qpn:psn:rkey:vaddr, followed in the GID case by 16 GID bytes.
+ * my_dest is not used.
+ *
+ * The message comes from the network, so it is NUL-terminated locally and
+ * every field is validated; rem_dest is only modified when the whole
+ * message parsed.  Returns 0 on success and -1 on a short read or a
+ * malformed message.
+ */
+static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest,
+			struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
+{
+	char msg[KEY_MSG_SIZE_GID];
+	const int use_gid = user_parm->gid_index >= 0;
+	const size_t msg_len = use_gid ? KEY_MSG_SIZE_GID : KEY_MSG_SIZE;
+	unsigned long long field[5];	/* lid, qpn, psn, rkey, vaddr */
+	unsigned char gid_raw[16];
+	const char *cursor = msg;
+	int i;
+
+	(void)my_dest;
+
+	if (read(sockfd, msg, msg_len) != (ssize_t)msg_len) {
+		perror("pp_read_keys");
+		fprintf(stderr, "Couldn't read remote address\n");
+		return -1;
+	}
+	/* Never trust the peer to have sent the terminating NUL. */
+	msg[msg_len - 1] = '\0';
+
+	for (i = 0; i < 5; ++i) {
+		/* vaddr is the last field unless GID bytes follow. */
+		char delim = (i < 4 || use_gid) ? ':' : '\0';
+
+		if (pp_parse_hex_field(&cursor, delim, &field[i]))
+			goto bad_line;
+	}
+
+	if (use_gid) {
+		for (i = 0; i < 16; ++i) {
+			unsigned long long byte;
+
+			if (pp_parse_hex_field(&cursor, i < 15 ? ':' : '\0', &byte) ||
+			    byte > 0xff)
+				goto bad_line;
+			gid_raw[i] = (unsigned char)byte;
+		}
+	}
+
+	rem_dest->lid   = (int)field[0];
+	rem_dest->qpn   = (int)field[1];
+	rem_dest->psn   = (int)field[2];
+	rem_dest->rkey  = (unsigned)field[3];
+	rem_dest->vaddr = field[4];
+	if (use_gid)
+		memcpy(rem_dest->dgid.raw, gid_raw, sizeof gid_raw);
+
+	return 0;
+
+bad_line:
+	fprintf(stderr, "Couldn't parse line <%.*s>\n", (int)msg_len, msg);
+	return -1;
+}
+
+/*
+ * Open a TCP connection to servername:port.
+ *
+ * Tries every address returned by getaddrinfo() until one connects.
+ * Returns the connected socket descriptor, or a negative value after
+ * printing a message when resolution or connection fails.
+ */
+static int pp_client_connect(const char *servername, int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char service[16];	/* decimal port number; fits any int */
+	int n;
+	int sockfd = -1;
+
+	/* A stack buffer instead of asprintf(): nothing to free. */
+	snprintf(service, sizeof service, "%d", port);
+
+	n = getaddrinfo(servername, service, &hints, &res);
+
+	/* getaddrinfo() reports failure with any non-zero code, and that
+	 * code must not be returned where callers expect a descriptor. */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+		return -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0)
+		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+
+	return sockfd;
+}
+
+/*
+ * Client half of the address exchange: send our parameters first, then
+ * read the server's into rem_dest.  Returns -1 when sending fails,
+ * otherwise whatever pp_read_keys() returns.
+ */
+static int pp_client_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
+			       struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
+{
+	int rc;
+
+	if (pp_write_keys(sockfd, my_dest, user_parm) != 0)
+		rc = -1;
+	else
+		rc = pp_read_keys(sockfd, my_dest, rem_dest, user_parm);
+
+	return rc;
+}
+
+/*
+ * Wait for one TCP client on the given port.
+ *
+ * Binds a listening socket (SO_REUSEADDR set) to the first usable address
+ * returned by getaddrinfo(), accepts a single connection and closes the
+ * listening socket again.  Returns the connected descriptor, or a negative
+ * value after printing a message on failure.
+ */
+static int pp_server_connect(int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags    = AI_PASSIVE,
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char service[16];	/* decimal port number; fits any int */
+	int sockfd = -1, connfd;
+	int n;
+
+	/* A stack buffer instead of asprintf(): nothing to free. */
+	snprintf(service, sizeof service, "%d", port);
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+
+	/* getaddrinfo() reports failure with any non-zero code, and that
+	 * code must not be returned where callers expect a descriptor. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		return -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			n = 1;
+
+			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return sockfd;
+	}
+
+	if (listen(sockfd, 1) < 0) {
+		perror("server listen");
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		close(sockfd);
+		return -1;
+	}
+
+	connfd = accept(sockfd, NULL, 0);
+	if (connfd < 0) {
+		perror("server accept");
+		fprintf(stderr, "accept() failed\n");
+	}
+
+	/* The listening socket is no longer needed either way. */
+	close(sockfd);
+	return connfd;
+}
+
+/*
+ * Server half of the address exchange: read the client's parameters into
+ * rem_dest first, then send ours.  Returns -1 when reading fails,
+ * otherwise whatever pp_write_keys() returns.
+ */
+static int pp_server_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
+			       struct pingpong_dest* rem_dest, struct user_parameters *user_parm)
+{
+	int rc;
+
+	if (pp_read_keys(sockfd, my_dest, rem_dest, user_parm) != 0)
+		rc = -1;
+	else
+		rc = pp_write_keys(sockfd, my_dest, user_parm);
+
+	return rc;
+}
+
+/*
+ * Build the verbs resources used by the RDMA read latency test.
+ *
+ * Allocates a zeroed, page-aligned work buffer of 2 * size bytes, opens
+ * ib_dev, optionally creates a completion channel (user_parm->use_event),
+ * allocates a PD, registers the buffer, creates a CQ with tx_depth entries
+ * and a QP (UC when user_parm->connection_type is 1, RC otherwise), moves
+ * the QP to INIT on the given port and presets ctx->wr as a signaled
+ * RDMA READ work request.
+ *
+ * When user_parm->mtu is 0 it is filled in here: 1024 for vendor part id
+ * 23108 or when a GID index is in use, 2048 otherwise.
+ *
+ * Returns the new context, or NULL on failure.  On failure everything that
+ * had been acquired up to that point is released again.
+ */
+static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
+					    int tx_depth, int port,
+					    struct user_parameters *user_parm)
+{
+	struct pingpong_context *ctx;
+	struct ibv_device_attr device_attr;
+	/* Computed in size_t so that doubling cannot overflow 'int'. */
+	size_t buf_len = (size_t)size * 2;
+
+	/* calloc: every handle starts out NULL, which keeps cleanup simple. */
+	ctx = calloc(1, sizeof *ctx);
+	if (!ctx)
+		return NULL;
+
+	ctx->size     = size;
+	ctx->tx_depth = tx_depth;
+
+	ctx->buf = memalign(page_size, buf_len);
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		goto err_ctx;
+	}
+
+	memset(ctx->buf, 0, buf_len);
+
+	/* Last byte of the first and of the second half of the buffer. */
+	ctx->post_buf = (char*)ctx->buf + (size - 1);
+	ctx->poll_buf = (char*)ctx->buf + (2 * size - 1);
+
+	ctx->context = ibv_open_device(ib_dev);
+	if (!ctx->context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		goto err_buf;
+	}
+
+	if (user_parm->mtu == 0) {	/* user did not ask for specific mtu */
+		if (ibv_query_device(ctx->context, &device_attr)) {
+			fprintf(stderr, "Failed to query device props\n");
+			goto err_context;
+		}
+		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) {
+			user_parm->mtu = 1024;
+		} else {
+			user_parm->mtu = 2048;
+		}
+	}
+
+	if (user_parm->use_event) {
+		ctx->channel = ibv_create_comp_channel(ctx->context);
+		if (!ctx->channel) {
+			fprintf(stderr, "Couldn't create completion channel\n");
+			goto err_context;
+		}
+	} else {
+		ctx->channel = NULL;
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		goto err_channel;
+	}
+
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, buf_len,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ);
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't allocate MR\n");
+		goto err_pd;
+	}
+
+	ctx->cq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0);
+	if (!ctx->cq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		goto err_mr;
+	}
+
+	{
+		struct ibv_qp_init_attr attr;
+
+		memset(&attr, 0, sizeof attr);
+		attr.send_cq = ctx->cq;
+		attr.recv_cq = ctx->cq;
+		attr.cap.max_send_wr = tx_depth;
+		/* Work around: driver doesnt support
+		 * recv_wr = 0 */
+		attr.cap.max_recv_wr = 1;
+		attr.cap.max_send_sge = 1;
+		attr.cap.max_recv_sge = 1;
+		if (user_parm->connection_type==1) {
+			attr.qp_type = IBV_QPT_UC;
+		} else {
+			attr.qp_type = IBV_QPT_RC;
+		}
+
+		ctx->qp = ibv_create_qp(ctx->pd, &attr);
+		if (!ctx->qp) {
+			fprintf(stderr, "Couldn't create QP\n");
+			goto err_cq;
+		}
+	}
+
+	{
+		struct ibv_qp_attr attr = {
+			.qp_state        = IBV_QPS_INIT,
+			.pkey_index      = 0,
+			.port_num        = port,
+			.qp_access_flags = IBV_ACCESS_REMOTE_READ
+		};
+
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_PKEY_INDEX |
+				  IBV_QP_PORT |
+				  IBV_QP_ACCESS_FLAGS)) {
+			fprintf(stderr, "Failed to modify QP to INIT\n");
+			goto err_qp;
+		}
+	}
+
+	/* Preset the parts of the work request that never change. */
+	ctx->wr.wr_id      = PINGPONG_READ_WRID;
+	ctx->wr.sg_list    = &ctx->list;
+	ctx->wr.num_sge    = 1;
+	ctx->wr.opcode     = IBV_WR_RDMA_READ;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next       = NULL;
+
+	return ctx;
+
+	/* Unwind in reverse order of acquisition. */
+err_qp:
+	ibv_destroy_qp(ctx->qp);
+err_cq:
+	ibv_destroy_cq(ctx->cq);
+err_mr:
+	ibv_dereg_mr(ctx->mr);
+err_pd:
+	ibv_dealloc_pd(ctx->pd);
+err_channel:
+	if (ctx->channel)
+		ibv_destroy_comp_channel(ctx->channel);
+err_context:
+	ibv_close_device(ctx->context);
+err_buf:
+	free(ctx->buf);
+err_ctx:
+	free(ctx);
+	return NULL;
+}
+
+/*
+ * Connect the local QP to the remote one described by dest.
+ *
+ * Moves ctx->qp from INIT to RTR (path MTU from user_parm->mtu, remote QPN
+ * and PSN from dest, LID- or GID-based address vector depending on
+ * user_parm->gid_index) and then to RTS using my_psn as the send PSN.  The
+ * RTS transition sets timeout, retry counts and max_rd_atomic only when
+ * user_parm->connection_type is 0; otherwise only state and send PSN.
+ *
+ * Returns 0 on success and 1 on failure, including an MTU value that is
+ * not one of 256, 512, 1024, 2048 or 4096.
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
+			  struct pingpong_dest *dest,struct user_parameters *user_parm)
+{
+	struct ibv_qp_attr attr;
+
+	memset(&attr, 0, sizeof(struct ibv_qp_attr));
+	attr.qp_state = IBV_QPS_RTR;
+	switch (user_parm->mtu) {
+	case 256 :
+		attr.path_mtu = IBV_MTU_256;
+		break;
+	case 512 :
+		attr.path_mtu = IBV_MTU_512;
+		break;
+	case 1024 :
+		attr.path_mtu = IBV_MTU_1024;
+		break;
+	case 2048 :
+		attr.path_mtu = IBV_MTU_2048;
+		break;
+	case 4096 :
+		attr.path_mtu = IBV_MTU_4096;
+		break;
+	default:
+		/* Previously path_mtu silently stayed 0 and the RTR
+		 * transition failed with an unrelated message. */
+		fprintf(stderr, "Invalid MTU %d: must be 256, 512, 1024, 2048 or 4096\n",
+			user_parm->mtu);
+		return 1;
+	}
+	printf("Mtu : %d\n", user_parm->mtu);
+
+	attr.dest_qp_num        = dest->qpn;
+	attr.rq_psn             = dest->psn;
+	attr.max_dest_rd_atomic = user_parm->max_out_read;
+	attr.min_rnr_timer      = 12;
+	if (user_parm->gid_index < 0) {
+		/* LID routed: use the file-level service level 'sl'. */
+		attr.ah_attr.is_global = 0;
+		attr.ah_attr.dlid      = dest->lid;
+		attr.ah_attr.sl        = sl;
+	} else {
+		/* GID routed: address the peer by the GID it sent us. */
+		attr.ah_attr.is_global     = 1;
+		attr.ah_attr.grh.dgid      = dest->dgid;
+		attr.ah_attr.grh.hop_limit = 1;
+		attr.ah_attr.sl            = 0;
+	}
+	attr.ah_attr.src_path_bits = 0;
+	attr.ah_attr.port_num      = port;
+
+	if (ibv_modify_qp(ctx->qp, &attr,
+			  IBV_QP_STATE |
+			  IBV_QP_AV |
+			  IBV_QP_PATH_MTU |
+			  IBV_QP_DEST_QPN |
+			  IBV_QP_RQ_PSN |
+			  IBV_QP_MIN_RNR_TIMER |
+			  IBV_QP_MAX_DEST_RD_ATOMIC)) {
+		fprintf(stderr, "Failed to modify RC QP to RTR\n");
+		return 1;
+	}
+
+	attr.timeout   = user_parm->qp_timeout;
+	attr.retry_cnt = 7;
+	attr.rnr_retry = 7;
+	attr.qp_state  = IBV_QPS_RTS;
+	attr.sq_psn    = my_psn;
+
+	if (user_parm->connection_type==0) {
+		attr.max_rd_atomic = user_parm->max_out_read;
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_SQ_PSN |
+				  IBV_QP_TIMEOUT |
+				  IBV_QP_RETRY_CNT |
+				  IBV_QP_RNR_RETRY |
+				  IBV_QP_MAX_QP_RD_ATOMIC)) {
+			fprintf(stderr, "Failed to modify RC QP to RTS\n");
+			return 1;
+		}
+	} else {
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_SQ_PSN)) {
+			fprintf(stderr, "Failed to modify UC QP to RTS\n");
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Exchange connection parameters with the peer over TCP and connect the QP.
+ *
+ * Fills the file-scope my_dest with the local LID, QPN, a random PSN, the
+ * rkey and the remote-visible buffer address (plus the GID selected by
+ * user_parm->gid_index when one is requested), opens the out-of-band socket
+ * (client connect when servername is set, server accept otherwise), swaps
+ * destinations with the peer, calls pp_connect_ctx() and finally repeats the
+ * exchange as a synchronisation step.
+ *
+ * Returns the connected socket descriptor on success and -1 on any failure.
+ * main() compares the result against -1 only, so every error path is folded
+ * to that value; positive status codes would be mistaken for a descriptor.
+ */
+static int pp_open_port(struct pingpong_context *ctx, const char * servername,
+			int ib_port, int port, struct pingpong_dest *rem_dest,
+			struct user_parameters *user_parm)
+{
+	char addr_fmt[] = "%8s address: LID %#04x QPN %#06x PSN %#06x RKey %#08x VAddr %#016Lx\n";
+	int sockfd;
+	int rc;
+	union ibv_gid gid;
+
+	/* No GID is queried without -x; keep the copy sent to the peer defined. */
+	memset(&gid, 0, sizeof gid);
+
+	/* Create connection between client and server.
+	 * We do it by exchanging data over a TCP socket connection. */
+
+	if (user_parm->gid_index >= 0) {
+		if (ibv_query_gid(ctx->context, ib_port, user_parm->gid_index, &gid))
+			return -1;
+		ctx->dgid = gid;
+	}
+
+	my_dest.lid = pp_get_local_lid(ctx, ib_port);
+	my_dest.dgid = gid;
+	my_dest.qpn = ctx->qp->qp_num;
+	my_dest.psn = lrand48() & 0xffffff;
+	if (user_parm->gid_index < 0) {/*We do not fail test upon lid in RDMAoE/Eth conf*/
+		if (!my_dest.lid) {
+			fprintf(stderr, "Local lid 0x0 detected. Is an SM running? If you are running on an RMDAoE interface you must use GIDs\n");
+			return -1;
+		}
+	}
+	my_dest.rkey = ctx->mr->rkey;
+	my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+	printf(addr_fmt, "local", my_dest.lid, my_dest.qpn, my_dest.psn,
+	       my_dest.rkey, my_dest.vaddr);
+	if (user_parm->gid_index > -1) {
+		printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+		       my_dest.dgid.raw[0],my_dest.dgid.raw[1],
+		       my_dest.dgid.raw[2], my_dest.dgid.raw[3], my_dest.dgid.raw[4],
+		       my_dest.dgid.raw[5], my_dest.dgid.raw[6], my_dest.dgid.raw[7],
+		       my_dest.dgid.raw[8], my_dest.dgid.raw[9], my_dest.dgid.raw[10],
+		       my_dest.dgid.raw[11], my_dest.dgid.raw[12], my_dest.dgid.raw[13],
+		       my_dest.dgid.raw[14], my_dest.dgid.raw[15]);
+	}
+
+	sockfd = servername ? pp_client_connect(servername, port) :
+		pp_server_connect(port);
+
+	if (sockfd < 0) {
+		/* servername is NULL on the server side; never hand NULL to %s. */
+		printf("pp_connect_sock(%s,%d) failed (%d)!\n",
+		       servername ? servername : "(null)", port, sockfd);
+		return -1;
+	}
+
+	rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) :
+		pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm);
+	if (rc)
+		goto fail;
+
+	printf(addr_fmt, "remote", rem_dest->lid, rem_dest->qpn, rem_dest->psn,
+	       rem_dest->rkey, rem_dest->vaddr);
+	if (user_parm->gid_index > -1) {
+		printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+		       rem_dest->dgid.raw[0],rem_dest->dgid.raw[1],
+		       rem_dest->dgid.raw[2], rem_dest->dgid.raw[3], rem_dest->dgid.raw[4],
+		       rem_dest->dgid.raw[5], rem_dest->dgid.raw[6], rem_dest->dgid.raw[7],
+		       rem_dest->dgid.raw[8], rem_dest->dgid.raw[9], rem_dest->dgid.raw[10],
+		       rem_dest->dgid.raw[11], rem_dest->dgid.raw[12], rem_dest->dgid.raw[13],
+		       rem_dest->dgid.raw[14], rem_dest->dgid.raw[15]);
+	}
+
+	if (pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest, user_parm))
+		goto fail;
+
+	/* An additional handshake is required *after* moving qp to RTR.
+	 * Arbitrarily reuse exch_dest for this purpose.
+	 */
+
+	rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) :
+		pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm);
+	if (rc)
+		goto fail;
+
+	return sockfd;
+
+fail:
+	/* Do not leak the out-of-band socket when setup is abandoned. */
+	close(sockfd);
+	return -1;
+}
+
+/*
+ * Print the command-line help for the RDMA read latency test to stdout.
+ *
+ * argv0 is the program name shown in the two invocation examples.
+ * The defaults quoted here mirror the initial values assigned in main().
+ */
+static void usage(const char *argv0)
+{
+	printf("Usage:\n");
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf(" %s <host> connect to server at <host>\n", argv0);
+	printf("\n");
+	printf("Options:\n");
+	printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n");
+	printf(" -c, --connection=<RC/UC> connection type RC/UC (default RC)\n");
+	printf(" -m, --mtu=<mtu> mtu size (256 - 4096. default for hermon is 2048)\n");
+	printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n");
+	printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n");
+	/* main() starts with size = 2, so that is the value to advertise. */
+	printf(" -s, --size=<size> size of message to exchange (default 2)\n");
+	printf(" -t, --tx-depth=<dep> size of tx queue (default 50)\n");
+	printf(" -n, --iters=<iters> number of exchanges (at least 2, default 1000)\n");
+	printf(" -o, --outs=<num> num of outstanding read/atom(default 4)\n");
+	printf(" -u, --qp-timeout=<timeout> QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n");
+	printf(" -S, --sl=<sl> SL (default 0)\n");
+	printf(" -x, --gid-index=<index> test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n");
+	printf(" -a, --all Run sizes from 2 till 2^23\n");
+	printf(" -C, --report-cycles report times in cpu cycle units (default microseconds)\n");
+	printf(" -H, --report-histogram print out all results (default print summary only)\n");
+	printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n");
+	printf(" -V, --version display version number\n");
+	printf(" -e, --events sleep on CQ events (default poll)\n");
+	printf(" -F, --CPU-freq do not fail test on different cpu frequencies\n");
+}
+
+/*
+ * Median of the n samples in delta[].
+ *
+ * The array is indexed positionally, so the caller is expected to have
+ * sorted it beforehand.
+ *   - odd n:  the single middle element is returned;
+ *   - even n: the mean of the two middle elements is returned.
+ */
+static inline cycles_t get_median(int n, cycles_t delta[])
+{
+	const int mid = n / 2;
+
+	if (n % 2 != 0)
+		return delta[mid];
+
+	return (delta[mid] + delta[mid - 1]) / 2;
+}
+
+/*
+ * qsort() comparator ordering cycles_t samples ascending.
+ * Yields -1, 0 or 1 when *aptr is below, equal to or above *bptr.
+ */
+static int cycles_compare(const void * aptr, const void * bptr)
+{
+	const cycles_t lhs = *(const cycles_t *)aptr;
+	const cycles_t rhs = *(const cycles_t *)bptr;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Turn the raw timestamps of one run into a latency report on stdout.
+ *
+ * options           selects the unit (cycles or usec) and the optional
+ *                   unsorted / sorted per-sample dumps
+ * iters             number of entries in tstamp[]; iters - 1 deltas result
+ * tstamp            timestamps taken before each post
+ * size              message size, echoed in the summary line
+ * no_cpu_freq_fail  forwarded to get_cpu_mhz()
+ *
+ * The summary line prints size, iteration count, minimum, maximum and
+ * median delta. Nothing is printed when fewer than two timestamps exist or
+ * when the scratch array cannot be allocated.
+ */
+static void print_report(struct report_options * options,
+			 unsigned int iters, cycles_t *tstamp,int size, int no_cpu_freq_fail)
+{
+	double cycles_to_units;
+	cycles_t median;
+	unsigned int i;
+	unsigned int samples;
+	const char* units;
+	cycles_t *delta;
+
+	/* iters is unsigned: iters - 1 would wrap and delta[iters - 2] would
+	 * index far outside the array. */
+	if (iters < 2) {
+		fprintf(stderr, "Need at least 2 iterations to report latency\n");
+		return;
+	}
+	samples = iters - 1;
+
+	delta = malloc(samples * sizeof *delta);
+	if (!delta) {
+		perror("malloc");
+		return;
+	}
+
+	for (i = 0; i < samples; ++i)
+		delta[i] = tstamp[i + 1] - tstamp[i];
+
+	if (options->cycles) {
+		cycles_to_units = 1;
+		units = "cycles";
+	} else {
+		cycles_to_units = get_cpu_mhz(no_cpu_freq_fail);
+		units = "usec";
+	}
+
+	/* Raw dump must happen before sorting destroys the arrival order. */
+	if (options->unsorted) {
+		printf("#, %s\n", units);
+		for (i = 0; i < samples; ++i)
+			printf("%u, %g\n", i + 1, delta[i] / cycles_to_units );
+	}
+
+	qsort(delta, samples, sizeof *delta, cycles_compare);
+
+	if (options->histogram) {
+		printf("#, %s\n", units);
+		for (i = 0; i < samples; ++i)
+			printf("%u, %g\n", i + 1, delta[i] / cycles_to_units );
+	}
+
+	median = get_median(samples, delta);
+	/* iters is unsigned, hence %u rather than %d. */
+	printf("%7d %u %7.2f %7.2f %7.2f\n",
+	       size,iters,delta[0] / cycles_to_units ,
+	       delta[samples - 1] / cycles_to_units ,median / cycles_to_units );
+
+	free(delta);
+}
+
+/*
+ * Run one latency measurement of user_param->iters operations of `size` bytes.
+ *
+ * Only the client (servername set) does any work; the server returns 0 at
+ * once. For every iteration the timestamp is stored in the file-scope
+ * tstamp[] array, the prepared work request ctx->wr is posted and the
+ * function waits for its completion, either by polling the CQ or, with
+ * use_event, by first blocking on the completion channel.
+ *
+ * Returns 0 on success, 11 if posting failed, 1 on completion-channel
+ * errors, 12 if polling the CQ failed and 13 on a completion with error
+ * status.
+ */
+int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param,
+	     struct pingpong_dest *rem_dest, int size)
+{
+	struct ibv_qp *qp;
+	struct ibv_send_wr *wr;
+	volatile char *post_buf;
+	int scnt = 0;	/* work requests posted so far */
+	int ccnt = 0;	/* completions reaped so far */
+	struct ibv_wc wc;
+	int ne;
+
+	if (!user_param->servername)
+		return 0;
+
+	wr = &ctx->wr;
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = size;
+	ctx->list.lkey = ctx->mr->lkey;
+	wr->wr.rdma.remote_addr = rem_dest->vaddr;
+	wr->wr.rdma.rkey = rem_dest->rkey;
+	post_buf = ctx->post_buf;
+	qp = ctx->qp;
+
+	/* Done with setup. Start the test. */
+
+	while (scnt < user_param->iters ) {
+		struct ibv_send_wr *bad_wr;
+
+		*post_buf = (char)++scnt;
+		tstamp[scnt - 1] = get_cycles();
+		if (ibv_post_send(qp, wr, &bad_wr)) {
+			fprintf(stderr, "Couldn't post send: scnt=%d\n",
+				scnt);
+			return 11;
+		}
+		if (user_param->use_event) {
+			struct ibv_cq *ev_cq;
+			void *ev_ctx;
+
+			if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
+				fprintf(stderr, "Failed to get cq_event\n");
+				return 1;
+			}
+
+			if (ev_cq != ctx->cq) {
+				fprintf(stderr, "CQ event for unknown RCQ %p\n", ev_cq);
+				return 1;
+			}
+
+			if (ibv_req_notify_cq(ctx->cq, 0)) {
+				fprintf(stderr, "Couldn't request CQ notification\n");
+				return 1;
+			}
+		}
+
+		/* Spin only while the CQ is empty. A negative result is an
+		 * error and must leave the loop, and wc may only be inspected
+		 * once an entry has actually been returned. */
+		do {
+			ne = ibv_poll_cq(ctx->cq, 1, &wc);
+		} while (ne == 0);
+
+		if (ne < 0) {
+			fprintf(stderr, "poll CQ failed %d\n", ne);
+			return 12;
+		}
+		if (wc.status != IBV_WC_SUCCESS) {
+			fprintf(stderr, "Completion wth error at %s:\n",
+				user_param->servername ? "client" : "server");
+			fprintf(stderr, "Failed status %d: wr_id %d\n",
+				wc.status, (int) wc.wr_id);
+			fprintf(stderr, "scnt=%d, ccnt=%d\n",
+				scnt, ccnt);
+			return 13;
+		}
+		++ccnt;
+	}
+	return 0;
+}
+/*
+ * RDMA read latency test entry point.
+ *
+ * Parses the command line, allocates the timestamp array, opens the device
+ * and creates the verbs context, connects to the peer through
+ * pp_open_port(), runs the measurement (one size, or every power of two
+ * from 2 to 2^23 with -a), prints the report on the client side and finally
+ * performs a closing handshake over the TCP socket.
+ *
+ * Returns 0 on success and a non-zero code identifying the failing step
+ * otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	const char *ib_devname = NULL;
+	int port = 18515;
+	int ib_port = 1;
+	int size = 2;
+	int tmp_size;
+	int i = 0;
+	struct report_options report = {};
+
+	struct pingpong_context *ctx;
+	struct pingpong_dest rem_dest;
+	struct ibv_device *ib_dev;
+	struct user_parameters user_param;
+	int no_cpu_freq_fail = 0;
+
+	/* init default values to user's parameters */
+	memset(&user_param, 0, sizeof(struct user_parameters));
+	user_param.mtu = 0;
+	user_param.iters = 1000;
+	user_param.tx_depth = 50;
+	user_param.servername = NULL;
+	user_param.use_event = 0;
+	user_param.max_out_read = 4; /* the device capability on gen2 */
+	user_param.qp_timeout = 14;
+	user_param.gid_index = -1; /*gid will not be used*/
+	/* Parameter parsing. */
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "connection", .has_arg = 1, .val = 'c' },
+			{ .name = "mtu", .has_arg = 1, .val = 'm' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "outs", .has_arg = 1, .val = 'o' },
+			{ .name = "tx-depth", .has_arg = 1, .val = 't' },
+			{ .name = "qp-timeout", .has_arg = 1, .val = 'u' },
+			{ .name = "sl", .has_arg = 1, .val = 'S' },
+			{ .name = "gid-index", .has_arg = 1, .val = 'x' },
+			{ .name = "all", .has_arg = 0, .val = 'a' },
+			{ .name = "report-cycles", .has_arg = 0, .val = 'C' },
+			{ .name = "report-histogram",.has_arg = 0, .val = 'H' },
+			{ .name = "report-unsorted",.has_arg = 0, .val = 'U' },
+			{ .name = "version", .has_arg = 0, .val = 'V' },
+			{ .name = "events", .has_arg = 0, .val = 'e' },
+			{ .name = "CPU-freq", .has_arg = 0, .val = 'F' },
+			{ 0 }
+		};
+
+		/* 'C' must be listed here too, otherwise the documented short
+		 * form -C is rejected as an unknown option. */
+		c = getopt_long(argc, argv, "p:c:m:d:i:s:o:n:t:u:S:x:aeCHUVF", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			port = strtol(optarg, NULL, 0);
+			if (port < 0 || port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+		case 'c':
+			if (strcmp("UC",optarg)==0)
+				user_param.connection_type=1;
+			/* default is 0 for any other option RC*/
+			break;
+		case 'e':
+			++user_param.use_event;
+			break;
+
+		case 'm':
+			user_param.mtu = strtol(optarg, NULL, 0);
+			break;
+		case 'o':
+			user_param.max_out_read = strtol(optarg, NULL, 0);
+			break;
+		case 'a':
+			user_param.all = ALL;
+			break;
+		case 'V':
+			printf("perftest version : %.2f\n",VERSION);
+			return 0;
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+
+		case 'i':
+			ib_port = strtol(optarg, NULL, 0);
+			if (ib_port < 0) {
+				usage(argv[0]);
+				return 2;
+			}
+			break;
+
+		case 's':
+			size = strtol(optarg, NULL, 0);
+			if (size < 1) {
+				usage(argv[0]); return 3;
+			}
+			break;
+
+		case 't':
+			user_param.tx_depth = strtol(optarg, NULL, 0);
+			if (user_param.tx_depth < 1) {
+				usage(argv[0]); return 4;
+			}
+			break;
+
+		case 'n':
+			user_param.iters = strtol(optarg, NULL, 0);
+			if (user_param.iters < 2) {
+				usage(argv[0]);
+				return 5;
+			}
+
+			break;
+
+		case 'C':
+			report.cycles = 1;
+			break;
+
+		case 'H':
+			report.histogram = 1;
+			break;
+
+		case 'U':
+			report.unsorted = 1;
+			break;
+
+		case 'F':
+			no_cpu_freq_fail = 1;
+			break;
+
+		case 'u':
+			user_param.qp_timeout = strtol(optarg, NULL, 0);
+			break;
+
+		case 'S':
+			sl = strtol(optarg, NULL, 0);
+			/* Reject negative values as well as those above 15. */
+			if (sl < 0 || sl > 15) { usage(argv[0]); return 5; }
+			break;
+
+		case 'x':
+			user_param.gid_index = strtol(optarg, NULL, 0);
+			if (user_param.gid_index > 63) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		default:
+			usage(argv[0]);
+			return 6;
+		}
+	}
+
+	if (optind == argc - 1)
+		user_param.servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 6;
+	}
+
+	/*
+	 * Done with parameter parsing. Perform setup.
+	 */
+	tstamp = malloc(user_param.iters * sizeof *tstamp);
+	if (!tstamp) {
+		perror("malloc");
+		return 10;
+	}
+	printf("------------------------------------------------------------------\n");
+	printf(" RDMA_Read Latency Test\n");
+	printf("Connection type : RC\n");
+	/* anyway make sure the connection is RC */
+	if (user_param.gid_index > -1) {
+		printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n");
+	}
+	tmp_size = size;
+	if (user_param.all == ALL) {
+		/*since we run all sizes */
+		size = 8388608; /*2^23 */
+	} else if (size < 128) {
+		/* can cut up to 70 nsec probably related to cache line size */
+		size = 128;
+	}
+	user_param.connection_type = 0;
+	srand48(getpid() * time(NULL));
+	page_size = sysconf(_SC_PAGESIZE);
+
+	ib_dev = pp_find_dev(ib_devname);
+	if (!ib_dev)
+		return 7;
+
+	ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port,&user_param);
+	if (!ctx)
+		return 8;
+
+	user_param.sockfd=pp_open_port(ctx, user_param.servername, ib_port, port, &rem_dest,&user_param);
+	/* Any negative value is a failure, not only -1. */
+	if (user_param.sockfd < 0) {
+		return 9;
+	}
+	/* fix for true size in small msg size */
+	if (tmp_size < 128) {
+		size = tmp_size ;
+	}
+	if (user_param.use_event) {
+		printf("Test with events.\n");
+		if (ibv_req_notify_cq(ctx->cq, 0)) {
+			fprintf(stderr, "Couldn't request RCQ notification\n");
+			return 1;
+		}
+	}
+	printf("------------------------------------------------------------------\n");
+	printf(" #bytes #iterations t_min[usec] t_max[usec] t_typical[usec]\n");
+	if (user_param.all == ALL) {
+		for (i = 1; i < 24 ; ++i) {
+			size = 1 << i;
+			if(run_iter(ctx, &user_param, &rem_dest, size))
+				return 17;
+			if(user_param.servername) {
+				print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail);
+			}
+		}
+	} else {
+		if(run_iter(ctx, &user_param, &rem_dest, size))
+			return 18;
+		if(user_param.servername) {
+			print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail);
+		}
+	}
+
+	/* done close sockets */
+	if(user_param.servername) {
+		/*Signal client is finished */
+		pp_client_exch_dest(user_param.sockfd, &my_dest, &rem_dest, &user_param);
+		if (write(user_param.sockfd, "done", sizeof "done") != sizeof "done"){
+			perror("client write");
+			fprintf(stderr, "Couldn't write to socket\n");
+			return 1;
+		}
+		close(user_param.sockfd);
+	} else {
+		/*Server is finished wait for client */
+		pp_server_exch_dest(user_param.sockfd, &my_dest, &rem_dest, &user_param);
+		if (write(user_param.sockfd, "done", sizeof "done") != sizeof "done"){
+			perror("server write");
+			fprintf(stderr, "Couldn't write to socket\n");
+			return 1;
+		}
+		close(user_param.sockfd);
+	}
+	printf("------------------------------------------------------------------\n");
+	free(tstamp);
+	return 0;
+}
diff --git a/runme b/runme
new file mode 100755
index 0000000..8d5ae6c
--- /dev/null
+++ b/runme
@@ -0,0 +1,19 @@
+#!/bin/sh
+# trivial script to launch a server/client test with ssh
+# must be launched from client
+# example: runme 10.0.0.1 /home/perftest/rdma_lat -s 10
+
+# Both the server host and the test command are required; with the host
+# alone the script would try to execute the host name as a command.
+if [ $# -lt 2 ] ; then
+	echo "Usage: runme <server> <test> <test options>"
+	exit 3
+fi
+
+server=$1
+shift
+# "$@" keeps each test argument intact instead of re-splitting on whitespace
+ssh "$server" "$@" &
+#give server time to start
+sleep 2
+"$@" "$server"
+status=$?
+wait
+exit $status
diff --git a/send_bw.c b/send_bw.c
new file mode 100755
index 0000000..f842fb9
--- /dev/null
+++ b/send_bw.c
@@ -0,0 +1,1489 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+#include <errno.h>
+
+#include <infiniband/verbs.h>
+
+#include "get_clock.h"
+
+#define PINGPONG_SEND_WRID 1 /* presumably the wr_id given to send work requests -- confirm at the post site */
+#define PINGPONG_RECV_WRID 2 /* wr_id stored in the receive work request template */
+#define RC 0 /* connection_type: reliable connection QP */
+#define UC 1 /* connection_type: unreliable connection QP */
+#define UD 3 /* connection_type: unreliable datagram QP */
+#define VERSION 1.1 /* presumably the version reported by -V -- confirm */
+#define SIGNAL 1 /* NOTE(review): use not visible in this part of the file */
+#define MAX_INLINE 400 /* NOTE(review): looks like an inline-size limit -- confirm against its user */
+#define ALL 1 /* NOTE(review): presumably the value of user_parameters.all that selects all sizes */
+#define MCG_LID 0xc001 /* LID used when attaching to / addressing the multicast group */
+#define MCG_GID {255,1,0,0,0,2,201,133,0,0,0,0,0,0,0,0} /* base multicast GID; bytes 11..15 are overwritten per QP */
+
+struct user_parameters { /* test configuration handed to the setup helpers */
+ const char *servername; /* NULL when running as server; tested to tell client from server */
+ int connection_type; /* RC, UC or UD */
+ int mtu; /* path MTU in bytes; 0 lets pp_init_ctx pick 1024 or 2048 */
+ int all; /* run all msg size */
+ int signal_comp; /* NOTE(review): use not visible here -- presumably completion signalling */
+ int iters; /* NOTE(review): presumably the iteration count -- confirm */
+ int tx_depth; /* NOTE(review): presumably the requested send queue depth -- confirm */
+ int rx_depth; /* NOTE(review): presumably the requested receive queue depth -- confirm */
+ int duplex; /* when set, the client side also attaches to a multicast group */
+ int use_event; /* non-zero: create a completion channel for the CQ */
+ int use_mcg; /* non-zero: UD traffic is addressed to a multicast group */
+ int inline_size; /* passed to the QP as cap.max_inline_data */
+ int qp_timeout; /* copied into ibv_qp_attr.timeout for RC QPs */
+ int gid_index; /* if value not negative, we use gid AND gid_index=value */
+};
+static int sl = 0; /* service level written to ah_attr.sl when no GID is used */
+static int page_size; /* alignment passed to memalign() for the work buffer */
+cycles_t *tposted; /* NOTE(review): presumably per-request post timestamps -- confirm */
+cycles_t *tcompleted; /* NOTE(review): presumably per-request completion timestamps -- confirm */
+int post_recv; /* set to ctx->rx_depth once the receive queue has been filled */
+struct pingpong_context { /* verbs objects and buffers of one test endpoint */
+ struct ibv_context *context; /* device handle from ibv_open_device() */
+ struct ibv_comp_channel *channel; /* completion channel, NULL unless use_event is set */
+ struct ibv_pd *pd; /* protection domain owning mr and qp */
+ struct ibv_mr *mr; /* registration covering buf */
+ struct ibv_cq *cq; /* single CQ used as both send and receive CQ */
+ struct ibv_qp *qp; /* the queue pair under test */
+ void *buf; /* page-aligned work buffer, 2 * size bytes (2 * (size + 40) for UD) */
+ unsigned size; /* message size the buffer was dimensioned for */
+ int tx_depth; /* send queue depth */
+ int rx_depth; /* requested rx_depth plus tx_depth; also the CQ size */
+ struct ibv_sge list; /* NOTE(review): presumably the send scatter/gather entry -- confirm */
+ struct ibv_sge recv_list; /* scatter/gather entry of the receive template */
+ struct ibv_send_wr wr; /* NOTE(review): presumably the send work request template -- confirm */
+ struct ibv_recv_wr rwr; /* receive work request template posted rx_depth times */
+ struct ibv_ah *ah; /* address handle, created for UD only */
+ union ibv_gid dgid; /* NOTE(review): use not visible in this part of the file */
+};
+
+struct pingpong_dest { /* addressing data exchanged with the peer over TCP */
+ int lid; /* port LID; sent as 4 hex digits */
+ int qpn; /* queue pair number; sent as 6 hex digits */
+ int psn; /* packet sequence number; sent as 6 hex digits */
+ unsigned rkey; /* memory region key; sent as 8 hex digits */
+ unsigned long long vaddr; /* buffer address; sent as 16 hex digits */
+ union ibv_gid dgid; /* 16-byte GID; sent as 16 colon-separated bytes */
+};
+
+/*
+ * Look up the LID of the given port on the context's device.
+ * Yields 0 when the port attributes cannot be queried.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+	struct ibv_port_attr port_info;
+	int failed = ibv_query_port(ctx->context, port, &port_info);
+
+	return failed ? 0 : port_info.lid;
+}
+
+/*
+ * Open a TCP connection to servername:port.
+ *
+ * Every address returned by getaddrinfo() is tried in turn until one
+ * connect() succeeds.
+ *
+ * Returns the connected socket descriptor, or a negative value when the
+ * name cannot be resolved or no address accepts the connection.
+ */
+static int pp_client_connect(const char *servername, int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int n;
+	int sockfd = -1;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(servername, service, &hints, &res);
+	/* The service string is only needed for the lookup itself. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+		return n < 0 ? n : -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0)
+		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+
+	return sockfd;
+}
+
+/*
+ * Client half of the address exchange: send my_dest, then read the peer's.
+ *
+ * The wire format is one fixed-size, colon-separated hex record
+ * (LID:QPN:PSN:RKey:VAddr followed by the 16 GID bytes).
+ *
+ * Returns a heap-allocated pingpong_dest describing the peer, which the
+ * caller must free(), or NULL on I/O, parse or allocation failure. The GID
+ * bytes are required only when user_parm->gid_index is non-negative.
+ *
+ * The record comes from the network, so it is NUL-terminated and parsed
+ * with bounded conversions before anything is copied out of it.
+ */
+struct pingpong_dest * pp_client_exch_dest(int sockfd,
+		const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"];
+	struct pingpong_dest *rem_dest;
+	unsigned int lid, qpn, psn, rkey;
+	unsigned long long vaddr;
+	unsigned int gid[16] = { 0 };
+	int wanted = (user_parm->gid_index < 0) ? 5 : 21;
+	int parsed;
+	int len;
+	int i;
+
+	len = snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		       my_dest->lid, my_dest->qpn, my_dest->psn,my_dest->rkey,my_dest->vaddr,
+		       my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2],
+		       my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5],
+		       my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8],
+		       my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+		       my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14],
+		       my_dest->dgid.raw[15]);
+	if (len < 0 || (size_t)len >= sizeof msg) {
+		fprintf(stderr, "Local address does not fit the exchange record\n");
+		return NULL;
+	}
+	if (write(sockfd, msg, sizeof msg) != (ssize_t)sizeof msg) {
+		perror("client write");
+		fprintf(stderr, "Couldn't send local address\n");
+		return NULL;
+	}
+
+	if (read(sockfd, msg, sizeof msg) != (ssize_t)sizeof msg) {
+		perror("client read");
+		fprintf(stderr, "Couldn't read remote address\n");
+		return NULL;
+	}
+	/* Never trust the peer to have terminated the record. */
+	msg[sizeof msg - 1] = '\0';
+
+	parsed = sscanf(msg,
+			"%x:%x:%x:%x:%llx"
+			":%x:%x:%x:%x:%x:%x:%x:%x"
+			":%x:%x:%x:%x:%x:%x:%x:%x",
+			&lid, &qpn, &psn, &rkey, &vaddr,
+			&gid[0], &gid[1], &gid[2], &gid[3],
+			&gid[4], &gid[5], &gid[6], &gid[7],
+			&gid[8], &gid[9], &gid[10], &gid[11],
+			&gid[12], &gid[13], &gid[14], &gid[15]);
+	if (parsed < wanted) {
+		fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+		return NULL;
+	}
+
+	/* calloc so that no field is left indeterminate. */
+	rem_dest = calloc(1, sizeof *rem_dest);
+	if (!rem_dest)
+		return NULL;
+
+	rem_dest->lid   = (int)lid;
+	rem_dest->qpn   = (int)qpn;
+	rem_dest->psn   = (int)psn;
+	rem_dest->rkey  = rkey;
+	rem_dest->vaddr = vaddr;
+	for (i = 0; i < 16; ++i)
+		rem_dest->dgid.raw[i] = (unsigned char)gid[i];
+
+	return rem_dest;
+}
+
+/*
+ * Listen on the given TCP port and accept exactly one client.
+ *
+ * The first address from getaddrinfo() that can be bound is used. The
+ * listening socket is closed once a connection has been accepted.
+ *
+ * Returns the connected socket descriptor, or a negative value on failure.
+ */
+int pp_server_connect(int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags    = AI_PASSIVE,
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int sockfd = -1, connfd;
+	int n;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+	/* The service string is only needed for the lookup itself. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		return n < 0 ? n : -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			n = 1;
+
+			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return sockfd;
+	}
+
+	if (listen(sockfd, 1) < 0) {
+		perror("server listen");
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		close(sockfd);
+		return -1;
+	}
+
+	connfd = accept(sockfd, NULL, NULL);
+	if (connfd < 0) {
+		perror("server accept");
+		fprintf(stderr, "accept() failed\n");
+	}
+
+	close(sockfd);
+	return connfd;
+}
+
+/*
+ * Server half of the address exchange: read the peer's record, then send
+ * my_dest back.
+ *
+ * The wire format is one fixed-size, colon-separated hex record
+ * (LID:QPN:PSN:RKey:VAddr followed by the 16 GID bytes).
+ *
+ * Returns a heap-allocated pingpong_dest describing the peer, which the
+ * caller must free(), or NULL on I/O, parse or allocation failure. The GID
+ * bytes are required only when user_parm->gid_index is non-negative.
+ *
+ * The record comes from the network, so it is NUL-terminated and parsed
+ * with bounded conversions before anything is copied out of it.
+ */
+static struct pingpong_dest *pp_server_exch_dest(int connfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"];
+	struct pingpong_dest *rem_dest = NULL;
+	unsigned int lid, qpn, psn, rkey;
+	unsigned long long vaddr;
+	unsigned int gid[16] = { 0 };
+	int wanted = (user_parm->gid_index < 0) ? 5 : 21;
+	int parsed;
+	int len;
+	int n;
+	int i;
+
+	n = read(connfd, msg, sizeof msg);
+	if (n != (int)sizeof msg) {
+		perror("server read");
+		fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg);
+		goto out;
+	}
+	/* Never trust the peer to have terminated the record. */
+	msg[sizeof msg - 1] = '\0';
+
+	parsed = sscanf(msg,
+			"%x:%x:%x:%x:%llx"
+			":%x:%x:%x:%x:%x:%x:%x:%x"
+			":%x:%x:%x:%x:%x:%x:%x:%x",
+			&lid, &qpn, &psn, &rkey, &vaddr,
+			&gid[0], &gid[1], &gid[2], &gid[3],
+			&gid[4], &gid[5], &gid[6], &gid[7],
+			&gid[8], &gid[9], &gid[10], &gid[11],
+			&gid[12], &gid[13], &gid[14], &gid[15]);
+	if (parsed < wanted) {
+		fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+		goto out;
+	}
+
+	/* calloc so that no field is left indeterminate. */
+	rem_dest = calloc(1, sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	rem_dest->lid   = (int)lid;
+	rem_dest->qpn   = (int)qpn;
+	rem_dest->psn   = (int)psn;
+	rem_dest->rkey  = rkey;
+	rem_dest->vaddr = vaddr;
+	for (i = 0; i < 16; ++i)
+		rem_dest->dgid.raw[i] = (unsigned char)gid[i];
+
+	len = snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		       my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr,
+		       my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2],
+		       my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5],
+		       my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8],
+		       my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+		       my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14],
+		       my_dest->dgid.raw[15]);
+	if (len < 0 || (size_t)len >= sizeof msg) {
+		fprintf(stderr, "Local address does not fit the exchange record\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+	if (write(connfd, msg, sizeof msg) != (ssize_t)sizeof msg) {
+		perror("server write");
+		fprintf(stderr, "Couldn't send local address\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+out:
+	return rem_dest;
+}
+
+/*
+ * Allocate and initialise everything one endpoint needs before connecting.
+ *
+ * Allocates the page-aligned work buffer (two messages, each with 40 extra
+ * bytes for the GRH in UD mode), opens the device, chooses a default MTU
+ * when user_parm->mtu is 0, creates the optional completion channel, the
+ * PD, MR, CQ and QP, and moves the QP to INIT. For UD with multicast the QP
+ * is also attached to its multicast group.
+ *
+ * ib_dev     device to open
+ * size       message size in bytes
+ * tx_depth   send queue depth
+ * rx_depth   extra receive depth; the effective depth is rx_depth + tx_depth
+ * port       device port the QP is bound to
+ * user_parm  test configuration; mtu is written back when it was 0
+ *
+ * Returns the new context, or NULL on failure after releasing whatever had
+ * been set up so far.
+ */
+static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev,
+					    unsigned size,
+					    int tx_depth, int rx_depth, int port,
+					    struct user_parameters *user_parm)
+{
+	struct pingpong_context *ctx;
+	struct ibv_device_attr device_attr;
+	size_t buf_size;
+
+	/* calloc: the cleanup path relies on unset members being NULL. */
+	ctx = calloc(1, sizeof *ctx);
+	if (!ctx)
+		return NULL;
+
+	ctx->size     = size;
+	ctx->tx_depth = tx_depth;
+	ctx->rx_depth = rx_depth + tx_depth;
+
+	/* in case of UD need space for the GRH */
+	if (user_parm->connection_type == UD)
+		buf_size = ((size_t)size + 40) * 2;
+	else
+		buf_size = (size_t)size * 2;
+
+	ctx->buf = memalign(page_size, buf_size);
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		goto fail;
+	}
+	memset(ctx->buf, 0, buf_size);
+
+	ctx->context = ibv_open_device(ib_dev);
+	if (!ctx->context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		goto fail;
+	}
+	if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
+		if (ibv_query_device(ctx->context, &device_attr)) {
+			fprintf(stderr, "Failed to query device props");
+			goto fail;
+		}
+		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) {
+			user_parm->mtu = 1024;
+		} else {
+			user_parm->mtu = 2048;
+		}
+	}
+	if (user_parm->use_event) {
+		ctx->channel = ibv_create_comp_channel(ctx->context);
+		if (!ctx->channel) {
+			fprintf(stderr, "Couldn't create completion channel\n");
+			goto fail;
+		}
+	}
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		goto fail;
+	}
+
+	/* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
+	 * The Consumer is not allowed to assign Remote Write or Remote Atomic to
+	 * a Memory Region that has not been assigned Local Write. */
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, buf_size,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't allocate MR\n");
+		goto fail;
+	}
+
+	ctx->cq = ibv_create_cq(ctx->context, ctx->rx_depth, NULL, ctx->channel, 0);
+	if (!ctx->cq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		goto fail;
+	}
+	{
+		struct ibv_qp_init_attr attr;
+
+		memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
+		attr.send_cq = ctx->cq;
+		attr.recv_cq = ctx->cq;
+		attr.cap.max_send_wr  = tx_depth;
+		/* Work around: driver doesnt support
+		 * recv_wr = 0 */
+		attr.cap.max_recv_wr  = ctx->rx_depth;
+		attr.cap.max_send_sge = 1;
+		attr.cap.max_recv_sge = 1;
+		attr.cap.max_inline_data = user_parm->inline_size;
+		switch (user_parm->connection_type) {
+		case RC :
+			attr.qp_type = IBV_QPT_RC;
+			break;
+		case UC :
+			attr.qp_type = IBV_QPT_UC;
+			break;
+		case UD :
+			attr.qp_type = IBV_QPT_UD;
+			break;
+		default:
+			fprintf(stderr, "Unknown connection type %d \n",user_parm->connection_type);
+			goto fail;
+		}
+		/*attr.sq_sig_all = 0;*/
+
+		ctx->qp = ibv_create_qp(ctx->pd, &attr);
+		if (!ctx->qp) {
+			fprintf(stderr, "Couldn't create QP\n");
+			goto fail;
+		}
+	}
+
+	{
+		struct ibv_qp_attr attr;
+
+		memset(&attr, 0, sizeof attr);
+		attr.qp_state        = IBV_QPS_INIT;
+		attr.pkey_index      = 0;
+		attr.port_num        = port;
+		if (user_parm->connection_type==UD)
+			attr.qkey            = 0x11111111;
+		else
+			attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+
+		if (user_parm->connection_type==UD) {
+			if (ibv_modify_qp(ctx->qp, &attr,
+					  IBV_QP_STATE              |
+					  IBV_QP_PKEY_INDEX         |
+					  IBV_QP_PORT               |
+					  IBV_QP_QKEY)) {
+				fprintf(stderr, "Failed to modify UD QP to INIT\n");
+				goto fail;
+			}
+
+			if ((user_parm->use_mcg) && (!user_parm->servername || user_parm->duplex)) {
+				union ibv_gid gid;
+				uint8_t mcg_gid[16] = MCG_GID;
+				uint32_t qp_num = ctx->qp->qp_num;
+
+				/* use the local QP number as part of the mcg */
+				mcg_gid[11] = (user_parm->servername) ? 0 : 1;
+				/* memcpy instead of a uint32_t store through a
+				 * byte array: same bytes, no alignment or
+				 * aliasing assumptions. */
+				memcpy(&mcg_gid[12], &qp_num, sizeof qp_num);
+				memcpy(gid.raw, mcg_gid, 16);
+
+				if (ibv_attach_mcast(ctx->qp, &gid, MCG_LID)) {
+					fprintf(stderr, "Couldn't attach QP to mcg\n");
+					goto fail;
+				}
+			}
+		} else if (ibv_modify_qp(ctx->qp, &attr,
+					 IBV_QP_STATE              |
+					 IBV_QP_PKEY_INDEX         |
+					 IBV_QP_PORT               |
+					 IBV_QP_ACCESS_FLAGS)) {
+			fprintf(stderr, "Failed to modify QP to INIT\n");
+			goto fail;
+		}
+	}
+	return ctx;
+
+fail:
+	/* Tear down in reverse order of creation; members never reached are
+	 * still NULL thanks to calloc. */
+	if (ctx->qp)
+		ibv_destroy_qp(ctx->qp);
+	if (ctx->cq)
+		ibv_destroy_cq(ctx->cq);
+	if (ctx->mr)
+		ibv_dereg_mr(ctx->mr);
+	if (ctx->pd)
+		ibv_dealloc_pd(ctx->pd);
+	if (ctx->channel)
+		ibv_destroy_comp_channel(ctx->channel);
+	if (ctx->context)
+		ibv_close_device(ctx->context);
+	free(ctx->buf);
+	free(ctx);
+	return NULL;
+}
+
+/*
+ * Connect the local QP to the peer described by dest.
+ *
+ * Moves the QP to RTR and then RTS with the attribute set appropriate for
+ * the connection type (RC, UC or UD), creates the address handle for UD and
+ * finally fills the receive queue with ctx->rx_depth receive requests.
+ *
+ * port       local device port
+ * my_psn     initial send PSN of this side
+ * dest       peer LID/GID, QPN and PSN
+ * user_parm  connection type, MTU, timeout, GID index and multicast flags
+ *
+ * Returns 0 on success, 1 if a QP transition or the AH creation failed and
+ * 14 if a receive request could not be posted.
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
+			  struct pingpong_dest *dest, struct user_parameters *user_parm)
+{
+	struct ibv_qp_attr attr;
+	memset(&attr, 0, sizeof attr);
+
+	attr.qp_state = IBV_QPS_RTR;
+	switch (user_parm->mtu) {
+	case 256 :
+		attr.path_mtu = IBV_MTU_256;
+		break;
+	case 512 :
+		attr.path_mtu = IBV_MTU_512;
+		break;
+	case 1024 :
+		attr.path_mtu = IBV_MTU_1024;
+		break;
+	case 2048 :
+		attr.path_mtu = IBV_MTU_2048;
+		break;
+	case 4096 :
+		attr.path_mtu = IBV_MTU_4096;
+		break;
+	}
+	printf("Mtu : %d\n", user_parm->mtu);
+	attr.dest_qp_num = dest->qpn;
+	attr.rq_psn = dest->psn;
+	if (user_parm->connection_type == RC) {
+		attr.max_dest_rd_atomic = 1;
+		attr.min_rnr_timer = 12;
+	}
+	if (user_parm->gid_index < 0) {
+		attr.ah_attr.is_global = 0;
+		attr.ah_attr.dlid = dest->lid;
+		attr.ah_attr.sl = sl;
+	} else {
+		attr.ah_attr.is_global = 1;
+		attr.ah_attr.grh.dgid = dest->dgid;
+		/* Send from the GID the user selected; it is the one that was
+		 * advertised to the peer. */
+		attr.ah_attr.grh.sgid_index = user_parm->gid_index;
+		attr.ah_attr.grh.hop_limit = 1;
+		attr.ah_attr.sl = 0;
+	}
+	attr.ah_attr.src_path_bits = 0;
+	attr.ah_attr.port_num = port;
+	if ((user_parm->connection_type==UD) && (user_parm->use_mcg)) {
+		uint8_t mcg_gid[16] = MCG_GID;
+		uint32_t peer_qpn = dest->qpn;
+
+		/* send the message to the mcg of the other side */
+		mcg_gid[11] = (user_parm->servername) ? 1 : 0;
+		/* memcpy instead of a uint32_t store through a byte array:
+		 * same bytes, no alignment or aliasing assumptions. */
+		memcpy(&mcg_gid[12], &peer_qpn, sizeof peer_qpn);
+		attr.ah_attr.dlid = MCG_LID;
+		attr.ah_attr.is_global = 1;
+		attr.ah_attr.grh.sgid_index = 0;
+		memcpy(attr.ah_attr.grh.dgid.raw, mcg_gid, 16);
+	}
+	if (user_parm->connection_type == RC) {
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_AV |
+				  IBV_QP_PATH_MTU |
+				  IBV_QP_DEST_QPN |
+				  IBV_QP_RQ_PSN |
+				  IBV_QP_MIN_RNR_TIMER |
+				  IBV_QP_MAX_DEST_RD_ATOMIC)) {
+			fprintf(stderr, "Failed to modify RC QP to RTR\n");
+			return 1;
+		}
+		attr.timeout = user_parm->qp_timeout;
+		attr.retry_cnt = 7;
+		attr.rnr_retry = 7;
+	} else if (user_parm->connection_type == UC) {
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_AV |
+				  IBV_QP_PATH_MTU |
+				  IBV_QP_DEST_QPN |
+				  IBV_QP_RQ_PSN)) {
+			fprintf(stderr, "Failed to modify UC QP to RTR\n");
+			return 1;
+		}
+	} else {
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE )) {
+			fprintf(stderr, "Failed to modify UD QP to RTR\n");
+			return 1;
+		}
+	}
+	attr.qp_state = IBV_QPS_RTS;
+	attr.sq_psn = my_psn;
+	attr.max_rd_atomic = 1;
+	if (user_parm->connection_type == RC) {
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_SQ_PSN |
+				  IBV_QP_TIMEOUT |
+				  IBV_QP_RETRY_CNT |
+				  IBV_QP_RNR_RETRY |
+				  IBV_QP_MAX_QP_RD_ATOMIC)) {
+			fprintf(stderr, "Failed to modify RC QP to RTS\n");
+			return 1;
+		}
+	} else { /*both UC and UD */
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_SQ_PSN)) {
+			fprintf(stderr, "Failed to modify %s QP to RTS\n",
+				user_parm->connection_type == UD ? "UD" : "UC");
+			return 1;
+		}
+
+	}
+	if (user_parm->connection_type==UD) {
+		ctx->ah = ibv_create_ah(ctx->pd, &attr.ah_attr);
+		if (!ctx->ah) {
+			fprintf(stderr, "Failed to create AH for UD\n");
+			return 1;
+		}
+	}
+	/* post receives sized for the largest message */
+	{
+		int i;
+		struct ibv_recv_wr *bad_wr_recv;
+
+		/* receive work request template, reused for every post */
+		ctx->rwr.wr_id = PINGPONG_RECV_WRID;
+		ctx->rwr.sg_list = &ctx->recv_list;
+		ctx->rwr.num_sge = 1;
+		ctx->rwr.next = NULL;
+		ctx->recv_list.addr = (uintptr_t) ctx->buf;
+		if (user_parm->connection_type==UD) {
+			/* room for the 40 extra bytes reserved for the GRH */
+			ctx->recv_list.length = ctx->size + 40;
+		} else {
+			ctx->recv_list.length = ctx->size;
+		}
+		ctx->recv_list.lkey = ctx->mr->lkey;
+		for (i = 0; i < ctx->rx_depth; ++i)
+			if (ibv_post_recv(ctx->qp, &ctx->rwr, &bad_wr_recv)) {
+				fprintf(stderr, "Couldn't post recv: counter=%d\n", i);
+				return 14;
+			}
+	}
+	post_recv = ctx->rx_depth;
+	return 0;
+}
+
+/*
+ * Print the command-line help to stdout.
+ *
+ * argv0 - program name shown in the two invocation examples
+ */
+static void usage(const char *argv0)
+{
+	printf("Usage:\n");
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf(" %s <host> connect to server at <host>\n", argv0);
+	printf("\n");
+	printf("Options:\n");
+	printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n");
+	printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n");
+	printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n");
+	printf(" -c, --connection=<RC/UC/UD> connection type RC/UC/UD (default RC)\n");
+	printf(" -m, --mtu=<mtu> mtu size (256 - 4096. default for hermon is 2048)\n");
+	printf(" -s, --size=<size> size of message to exchange (default 65536)\n");
+	printf(" -a, --all Run sizes from 2 till 2^23\n");
+	printf(" -t, --tx-depth=<dep> size of tx queue (default 300)\n");
+	printf(" -g, --mcg send messages to multicast group (only available in UD connection)\n");
+	printf(" -r, --rx-depth=<dep> make rx queue bigger than tx (default 600)\n");
+	printf(" -n, --iters=<iters> number of exchanges (at least 2, default 1000)\n");
+	printf(" -I, --inline_size=<size> max size of message to be sent in inline mode (default 400)\n");
+	printf(" -u, --qp-timeout=<timeout> QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n");
+	printf(" -S, --sl=<sl> SL (default 0)\n");
+	printf(" -x, --gid-index=<index> test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n");
+	printf(" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n");
+	printf(" -V, --version display version number\n");
+	printf(" -e, --events sleep on CQ events (default poll)\n");
+	/* the long option registered with getopt_long is "noPeak" */
+	printf(" -N, --noPeak cancel peak-bw calculation (default with peak-bw)\n");
+	printf(" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n");
+}
+
+/*
+ * Print one result line: message size, iteration count, peak bandwidth and
+ * average bandwidth, both in MB/sec (1 MB = 0x100000 bytes).
+ *
+ * iters            - number of entries in tposted / tcompleted
+ * size             - message size in bytes
+ * duplex           - non-zero doubles the byte count per iteration
+ * tposted          - cycle stamp taken when each send was posted
+ * tcompleted       - cycle stamp taken when each send completed
+ * noPeak           - non-zero skips the peak search and prints 0 for the peak
+ * no_cpu_freq_fail - forwarded to get_cpu_mhz()
+ */
+static void print_report(unsigned int iters, unsigned size, int duplex,
+			 cycles_t *tposted, cycles_t *tcompleted, int noPeak, int no_cpu_freq_fail)
+{
+	double cycles_to_units;
+	double tsize;	/* bytes transferred per iteration */
+	unsigned int i, j;
+	cycles_t opt_delta;
+	cycles_t t;
+
+	opt_delta = tcompleted[0] - tposted[0];
+
+	if (!noPeak) {
+		/*
+		 * Find the peak bandwidth, unless asked not to in command line:
+		 * the smallest per-message cycle count over every window of
+		 * consecutive sends [i, j].
+		 */
+		for (i = 0; i < iters; ++i)
+			for (j = i; j < iters; ++j) {
+				t = (tcompleted[j] - tposted[i]) / (j - i + 1);
+				if (t < opt_delta)
+					opt_delta = t;
+			}
+	}
+
+	/* cycles per second */
+	cycles_to_units = get_cpu_mhz(no_cpu_freq_fail) * 1000000;
+
+	/* kept in double so tsize * iters cannot wrap a 32-bit unsigned long */
+	tsize = duplex ? 2 : 1;
+	tsize = tsize * size;
+	printf("%7u %u %7.2f %7.2f\n",
+	       size, iters, !(noPeak) * tsize * cycles_to_units / opt_delta / 0x100000,
+	       tsize * iters * cycles_to_units / (tcompleted[iters - 1] - tposted[0]) / 0x100000);
+}
+/*
+ * Run one bidirectional pass: post user_param->iters sends of `size` bytes
+ * while also consuming user_param->iters receive completions on the same CQ.
+ *
+ * Fills the global tposted[] / tcompleted[] arrays for the sends and keeps
+ * the receive queue topped up through the global post_recv counter.
+ * rem_dest is not used in this function.
+ *
+ * Returns 0 on success, 15 if re-posting a receive fails, 1 on any other
+ * failure.
+ */
+int run_iter_bi(struct pingpong_context *ctx, struct user_parameters *user_param,
+ struct pingpong_dest *rem_dest, int size)
+{
+ struct ibv_qp *qp;
+ int scnt, ccnt, rcnt;
+ struct ibv_recv_wr *bad_wr_recv;
+ /* UD messages are capped at 2048 bytes, or 1024 when a GID index is set */
+ if (user_param->connection_type == UD) {
+ if (size > 2048) {
+ if (user_param->gid_index < 0) {
+ size = 2048;
+ } else {
+ size = 1024;
+ }
+ }
+ }
+ /*********************************************
+ * Important note:
+ * For UD/UC this is NOT the right way to measure bandwidth, since
+ * the loop is driven from the send side, while it should be driven
+ * from the receive side (or retry should be done in software).
+ * The sender may be faster than the receiver, so the posted
+ * receives may run out; the test then deadlocks with both sides
+ * stuck polling the CQ.
+ * This is not solved here for the general case; a separate test is
+ * needed for UC/UD. If tx_depth is about 1/3 of the number of
+ * iterations this should be OK.
+ * Also note that the sender is limited in the number of sends,
+ * and the code tries to keep the receive queue full.
+ *********************************************/
+
+ if (user_param->connection_type == UD)
+ ctx->recv_list.length = ctx->size + 40;
+ else
+ ctx->recv_list.length = ctx->size;
+ /* messages no larger than inline_size are sent inline */
+ if (size > user_param->inline_size) /*complaince to perf_main */
+ ctx->wr.send_flags = IBV_SEND_SIGNALED;
+ else
+ ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+
+ ctx->list.length = size;
+ scnt = 0; /* sends posted */
+ ccnt = 0; /* send completions seen */
+ rcnt = 0; /* receive completions seen */
+ qp = ctx->qp;
+
+ while (ccnt < user_param->iters || rcnt < user_param->iters ) {
+ struct ibv_wc wc;
+ int ne;
+ /* keep fewer than tx_depth / 2 sends outstanding */
+ while (scnt < user_param->iters &&
+ (scnt - ccnt) < user_param->tx_depth / 2) {
+ struct ibv_send_wr *bad_wr;
+ tposted[scnt] = get_cycles();
+ if (ibv_post_send(qp, &ctx->wr, &bad_wr)) {
+ fprintf(stderr, "Couldn't post send: scnt=%d\n",
+ scnt);
+ return 1;
+ }
+ ++scnt;
+ }
+ /* event mode: block for a CQ event, then re-arm the notification */
+ /* NOTE(review): no ibv_ack_cq_events() call is visible here - confirm */
+ if (user_param->use_event) {
+ struct ibv_cq *ev_cq;
+ void *ev_ctx;
+ if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
+ fprintf(stderr, "Failed to get cq_event\n");
+ return 1;
+ }
+ if (ev_cq != ctx->cq) {
+ fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq);
+ return 1;
+ }
+ if (ibv_req_notify_cq(ctx->cq, 0)) {
+ fprintf(stderr, "Couldn't request CQ notification\n");
+ return 1;
+ }
+ }
+ /* drain the CQ one completion at a time */
+ for (;;) {
+ ne = ibv_poll_cq(ctx->cq, 1, &wc);
+ if (ne <= 0)
+ break;
+
+ if (wc.status != IBV_WC_SUCCESS) {
+ fprintf(stderr, "Completion wth error at %s:\n",
+ user_param->servername ? "client" : "server");
+ fprintf(stderr, "Failed status %d: wr_id %d syndrom 0x%x\n",
+ wc.status, (int) wc.wr_id, wc.vendor_err);
+ fprintf(stderr, "scnt=%d, ccnt=%d\n",
+ scnt, ccnt);
+ return 1;
+ }
+ switch ((int) wc.wr_id) {
+ case PINGPONG_SEND_WRID:
+ tcompleted[ccnt] = get_cycles();
+ ccnt += 1;
+ break;
+ case PINGPONG_RECV_WRID:
+ /* once two or more receives are consumed, refill up to rx_depth */
+ if (--post_recv <= ctx->rx_depth - 2) {
+ while (rcnt < user_param->iters &&
+ (ctx->rx_depth - post_recv) > 0 ) {
+ ++post_recv;
+ if (ibv_post_recv(qp, &ctx->rwr, &bad_wr_recv)) {
+ fprintf(stderr, "Couldn't post recv: rcnt=%d\n",
+ rcnt);
+ return 15;
+ }
+ }
+ }
+ rcnt += 1;
+ break;
+ default:
+ fprintf(stderr, "Completion for unknown wr_id %d\n",
+ (int) wc.wr_id);
+ break;
+ }
+ }
+
+ /* a negative count from ibv_poll_cq is a polling failure */
+ if (ne < 0) {
+ fprintf(stderr, "poll CQ failed %d\n", ne);
+ return 1;
+ }
+ }
+
+ return(0);
+}
+/*
+ * Run one unidirectional pass. The side started without a server name only
+ * receives: it polls user_param->iters receive completions and re-posts a
+ * receive after each one. The side started with a server name only sends:
+ * it posts user_param->iters sends of `size` bytes and records the global
+ * tposted[] / tcompleted[] stamps.
+ *
+ * rem_dest is not used in this function.
+ *
+ * Returns 0 on success, 15 if re-posting a receive fails, 12 if polling
+ * fails on the receiving side, 1 on any other failure.
+ */
+int run_iter_uni(struct pingpong_context *ctx, struct user_parameters *user_param,
+ struct pingpong_dest *rem_dest, int size)
+{
+ struct ibv_qp *qp;
+ int scnt, ccnt, rcnt;
+ struct ibv_recv_wr *bad_wr_recv;
+
+ /* UD messages are capped at 2048 bytes, or 1024 when a GID index is set */
+ if (user_param->connection_type == UD) {
+ if (size > 2048) {
+ if (user_param->gid_index < 0) {
+ size = 2048;
+ } else {
+ size = 1024;
+ }
+ }
+ }
+
+ if (user_param->connection_type == UD)
+ ctx->recv_list.length = ctx->size + 40;
+ else
+ ctx->recv_list.length = ctx->size;
+
+ /* messages no larger than inline_size are sent inline */
+ if (size > user_param->inline_size) { /*complaince to perf_main */
+ ctx->wr.send_flags = IBV_SEND_SIGNALED;
+ } else {
+ ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+ }
+ ctx->list.length = size;
+ scnt = 0; /* sends posted */
+ ccnt = 0; /* send completions seen */
+ rcnt = 0; /* receive completions seen */
+ qp = ctx->qp;
+ if (!user_param->servername) {
+ while (rcnt < user_param->iters) {
+ int ne;
+ struct ibv_wc wc;
+ /* Server is polling on receive first */
+ /* NOTE(review): no ibv_ack_cq_events() call is visible here - confirm */
+ if (user_param->use_event) {
+ struct ibv_cq *ev_cq;
+ void *ev_ctx;
+ if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
+ fprintf(stderr, "Failed to get cq_event\n");
+ return 1;
+ }
+ if (ev_cq != ctx->cq) {
+ fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq);
+ return 1;
+ }
+ if (ibv_req_notify_cq(ctx->cq, 0)) {
+ fprintf(stderr, "Couldn't request CQ notification\n");
+ return 1;
+ }
+ }
+ do {
+ ne = ibv_poll_cq(ctx->cq, 1, &wc);
+ if (ne) {
+ /* ccnt is never advanced on this path, so slot 0 is overwritten */
+ tcompleted[ccnt] = get_cycles();
+ if (wc.status != IBV_WC_SUCCESS) {
+ fprintf(stderr, "Completion wth error at %s:\n",
+ user_param->servername ? "client" : "server");
+ fprintf(stderr, "Failed status %d: wr_id %d syndrom 0x%x\n",
+ wc.status, (int) wc.wr_id, wc.vendor_err);
+ fprintf(stderr, "scnt=%d, ccnt=%d\n",
+ scnt, ccnt);
+ return 1;
+ }
+ ++rcnt;
+ /* replace the receive that was just consumed */
+ if (ibv_post_recv(qp, &ctx->rwr, &bad_wr_recv)) {
+ fprintf(stderr, "Couldn't post recv: rcnt=%d\n",
+ rcnt);
+ return 15;
+ }
+
+ }
+ } while (ne > 0 );
+
+ if (ne < 0) {
+ fprintf(stderr, "Poll Recieve CQ failed %d\n", ne);
+ return 12;
+ }
+ }
+ } else {
+ /* client is posting and not receiving. */
+ while (scnt < user_param->iters || ccnt < user_param->iters) {
+ /* keep fewer than tx_depth sends outstanding */
+ while (scnt < user_param->iters && (scnt - ccnt) < user_param->tx_depth ) {
+ struct ibv_send_wr *bad_wr;
+ tposted[scnt] = get_cycles();
+ if (ibv_post_send(qp, &ctx->wr, &bad_wr)) {
+ fprintf(stderr, "Couldn't post send: scnt=%d\n",
+ scnt);
+ return 1;
+ }
+ ++scnt;
+ }
+ if (ccnt < user_param->iters) {
+ struct ibv_wc wc;
+ int ne;
+ /* NOTE(review): no ibv_ack_cq_events() call is visible here - confirm */
+ if (user_param->use_event) {
+ struct ibv_cq *ev_cq;
+ void *ev_ctx;
+ if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
+ fprintf(stderr, "Failed to get cq_event\n");
+ return 1;
+ }
+ if (ev_cq != ctx->cq) {
+ fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq);
+ return 1;
+ }
+ if (ibv_req_notify_cq(ctx->cq, 0)) {
+ fprintf(stderr, "Couldn't request CQ notification\n");
+ return 1;
+ }
+ }
+ /* drain the CQ; at most one completion is requested per poll */
+ for (;;) {
+ ne = ibv_poll_cq(ctx->cq, 1, &wc);
+ if (ne <= 0)
+ break;
+
+ tcompleted[ccnt] = get_cycles();
+ if (wc.status != IBV_WC_SUCCESS) {
+ fprintf(stderr, "Completion wth error at %s:\n",
+ user_param->servername ? "client" : "server");
+ fprintf(stderr, "Failed status %d: wr_id %d syndrom 0x%x\n",
+ wc.status, (int) wc.wr_id, wc.vendor_err);
+ fprintf(stderr, "scnt=%d, ccnt=%d\n",
+ scnt, ccnt);
+ return 1;
+ }
+ ccnt += ne;
+ }
+
+ if (ne < 0) {
+ fprintf(stderr, "poll CQ failed %d\n", ne);
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * send_bw entry point: parse the command line, open the IB device, exchange
+ * connection data with the peer over a TCP socket, connect the QP and run
+ * the bandwidth test for one message size, or for every power of two when
+ * --all is given. Only the side started with a server name prints results.
+ *
+ * Returns 0 on success and a non-zero code on any failure.
+ */
+int main(int argc, char *argv[])
+{
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev;
+	struct pingpong_context *ctx;
+	struct pingpong_dest my_dest;
+	struct pingpong_dest *rem_dest;
+	struct user_parameters user_param;
+	struct ibv_device_attr device_attribute;
+	char *ib_devname = NULL;
+	int port = 18515;
+	int ib_port = 1;
+	long long size = 65536;
+	int sockfd;
+	int i = 0;
+	int size_max_pow = 24;
+	int noPeak = 0;/*noPeak == 0: regular peak-bw calculation done*/
+	int inline_given_in_cmd = 0;
+	struct ibv_context *context;
+	int no_cpu_freq_fail = 0;
+	union ibv_gid gid;
+	/* init default values to user's parameters */
+	memset(&user_param, 0, sizeof(struct user_parameters));
+	/* gid is copied into my_dest even when no GID index was requested */
+	memset(&gid, 0, sizeof gid);
+	user_param.mtu = 0;
+	user_param.iters = 1000;
+	user_param.tx_depth = 300;
+	user_param.servername = NULL;
+	user_param.use_event = 0;
+	user_param.duplex = 0;
+	user_param.inline_size = MAX_INLINE;
+	user_param.qp_timeout = 14;
+	user_param.gid_index = -1; /*gid will not be used*/
+	/* Parameter parsing. */
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "mtu", .has_arg = 1, .val = 'm' },
+			{ .name = "connection", .has_arg = 1, .val = 'c' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "tx-depth", .has_arg = 1, .val = 't' },
+			{ .name = "inline_size", .has_arg = 1, .val = 'I' },
+			{ .name = "rx-depth", .has_arg = 1, .val = 'r' },
+			{ .name = "qp-timeout", .has_arg = 1, .val = 'u' },
+			{ .name = "sl", .has_arg = 1, .val = 'S' },
+			{ .name = "gid-index", .has_arg = 1, .val = 'x' },
+			{ .name = "all", .has_arg = 0, .val = 'a' },
+			{ .name = "bidirectional", .has_arg = 0, .val = 'b' },
+			{ .name = "version", .has_arg = 0, .val = 'V' },
+			{ .name = "events", .has_arg = 0, .val = 'e' },
+			{ .name = "mcg", .has_arg = 0, .val = 'g' },
+			{ .name = "noPeak", .has_arg = 0, .val = 'N' },
+			{ .name = "CPU-freq", .has_arg = 0, .val = 'F' },
+			{ 0 }
+		};
+
+		c = getopt_long(argc, argv, "p:d:i:m:c:s:n:t:I:r:u:S:x:ebaVgNF", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			port = strtol(optarg, NULL, 0);
+			if (port < 0 || port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+		case 'e':
+			++user_param.use_event;
+			break;
+		case 'g':
+			++user_param.use_mcg;
+			break;
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+		case 'c':
+			/* any string other than "UC" or "UD" keeps the RC default */
+			if (strcmp("UC",optarg)==0)
+				user_param.connection_type=UC;
+			if (strcmp("UD",optarg)==0)
+				user_param.connection_type=UD;
+			break;
+		case 'm':
+			user_param.mtu = strtol(optarg, NULL, 0);
+			break;
+		case 'a':
+			user_param.all = ALL;
+			break;
+		case 'V':
+			printf("send_bw version : %.2f\n",VERSION);
+			return 0;
+			break;
+		case 'i':
+			ib_port = strtol(optarg, NULL, 0);
+			if (ib_port < 0) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 's':
+			size = strtoll(optarg, NULL, 0);
+			if (size < 1 || size > UINT_MAX / 2) {
+				usage(argv[0]);
+				return 1;
+			}
+
+			break;
+
+		case 'x':
+			user_param.gid_index = strtol(optarg, NULL, 0);
+			if (user_param.gid_index > 63) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 't':
+			user_param.tx_depth = strtol(optarg, NULL, 0);
+			if (user_param.tx_depth < 1) { usage(argv[0]); return 1; }
+			break;
+
+		case 'I':
+			user_param.inline_size = strtol(optarg, NULL, 0);
+			inline_given_in_cmd =1;
+			if (user_param.inline_size > MAX_INLINE) {
+				usage(argv[0]);
+				return 7;
+			}
+			/* without this break -I also overwrote rx_depth via case 'r' */
+			break;
+
+		case 'r':
+			errno = 0;
+			user_param.rx_depth = strtol(optarg, NULL, 0);
+			if (errno) { usage(argv[0]); return 1; }
+			break;
+
+		case 'n':
+			user_param.iters = strtol(optarg, NULL, 0);
+			if (user_param.iters < 2) {
+				usage(argv[0]);
+				return 1;
+			}
+
+			break;
+
+		case 'b':
+			user_param.duplex = 1;
+			break;
+
+		case 'N':
+			noPeak = 1;
+			break;
+
+		case 'F':
+			no_cpu_freq_fail = 1;
+			break;
+
+		case 'u':
+			user_param.qp_timeout = strtol(optarg, NULL, 0);
+			break;
+
+		case 'S':
+			sl = strtol(optarg, NULL, 0);
+			if (sl > 15) { usage(argv[0]); return 1; }
+			break;
+
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	/* one trailing argument is the server name and selects the client role */
+	if (optind == argc - 1)
+		user_param.servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	printf("------------------------------------------------------------------\n");
+	if (user_param.duplex == 1 && (!user_param.use_mcg || !(user_param.connection_type == UD)))
+		printf(" Send Bidirectional BW Test\n");
+	else if (user_param.duplex == 1 && user_param.use_mcg && (user_param.connection_type == UD))
+		printf(" Send Bidirectional BW Multicast Test\n");
+	else if (!user_param.duplex && user_param.use_mcg && (user_param.connection_type == UD))
+		printf(" Send BW Multicast Test\n");
+	else
+		printf(" Send BW Test\n");
+
+	if (user_param.connection_type == RC)
+		printf("Connection type : RC\n");
+	else if (user_param.connection_type == UC)
+		printf("Connection type : UC\n");
+	else{
+		printf("Connection type : UD\n");
+	}
+	if (user_param.gid_index > -1) {
+		printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n");
+	}
+
+	/* Done with parameter parsing. Perform setup. */
+	if (user_param.all == ALL)
+		/*since we run all sizes */
+		size = 8388608; /*2^23 */
+	else if (user_param.connection_type == UD && size > 2048) {
+		printf("Max msg size in UD is 2048 changing to 2048\n");
+		size = 2048;
+	}
+	if (user_param.connection_type == UD && user_param.gid_index > -1 && size > 1024) {
+		printf("Max msg size in UD RDMAoE is 1024. changing to 1024\n");
+		size = 1024;
+	}
+
+	srand48(getpid() * time(NULL));
+
+	page_size = sysconf(_SC_PAGESIZE);
+
+	dev_list = ibv_get_device_list(NULL);
+	if (!dev_list) {
+		/* the list is indexed below, so a NULL result cannot be used */
+		fprintf(stderr, "No IB devices found\n");
+		return 1;
+	}
+
+	if (!ib_devname) {
+		ib_dev = dev_list[0];
+		if (!ib_dev) {
+			fprintf(stderr, "No IB devices found\n");
+			return 1;
+		}
+	} else {
+		for (; (ib_dev = *dev_list); ++dev_list)
+			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+				break;
+		if (!ib_dev) {
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+			return 1;
+		}
+	}
+
+	context = ibv_open_device(ib_dev);
+	if (!context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		return 1;
+	}
+	if (ibv_query_device(context, &device_attribute)) {
+		fprintf(stderr, "Failed to query device props");
+		return 1;
+	}
+	/* for these part ids inline defaults to 1 byte unless -I was given */
+	if ((device_attribute.vendor_part_id == 25408 ||
+	     device_attribute.vendor_part_id == 25418 ||
+	     device_attribute.vendor_part_id == 26408 ||
+	     device_attribute.vendor_part_id == 26418 ||
+	     device_attribute.vendor_part_id == 26428) && (!inline_given_in_cmd)) {
+		user_param.inline_size = 1;
+	}
+	printf("Inline data is used up to %d bytes message\n", user_param.inline_size);
+
+	ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, user_param.rx_depth,
+			  ib_port, &user_param);
+	if (!ctx)
+		return 1;
+
+	/* Create connection between client and server.
+	 * We do it by exchanging data over a TCP socket connection. */
+
+	my_dest.lid = pp_get_local_lid(ctx, ib_port);
+	my_dest.qpn = ctx->qp->qp_num;
+	my_dest.psn = lrand48() & 0xffffff;
+	if (user_param.gid_index != -1) {
+		int err=0;
+		err = ibv_query_gid (ctx->context, ib_port, user_param.gid_index, &gid);
+		if (err) {
+			fprintf(stderr, "Failed to query GID index %d\n",
+				user_param.gid_index);
+			return -1;
+		}
+		ctx->dgid=gid;
+	}
+
+	if (user_param.gid_index < 0) {/*We do not fail test upon lid in RDMA0E/Eth conf*/
+		if (!my_dest.lid) {
+			fprintf(stderr, "Local lid 0x0 detected. Is an SM running? If you are running on an RMDAoE interface you must use GIDs\n");
+			return 1;
+		}
+	}
+	my_dest.dgid = gid;
+	my_dest.rkey = ctx->mr->rkey;
+	my_dest.vaddr = (uintptr_t)ctx->buf + size;
+	printf(" local address: LID %#04x, QPN %#06x, PSN %#06x\n",
+	       my_dest.lid, my_dest.qpn, my_dest.psn);
+	if (user_param.gid_index > -1) {
+		printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+		       my_dest.dgid.raw[0],my_dest.dgid.raw[1],
+		       my_dest.dgid.raw[2], my_dest.dgid.raw[3], my_dest.dgid.raw[4],
+		       my_dest.dgid.raw[5], my_dest.dgid.raw[6], my_dest.dgid.raw[7],
+		       my_dest.dgid.raw[8], my_dest.dgid.raw[9], my_dest.dgid.raw[10],
+		       my_dest.dgid.raw[11], my_dest.dgid.raw[12], my_dest.dgid.raw[13],
+		       my_dest.dgid.raw[14], my_dest.dgid.raw[15]);
+	}
+
+	if (user_param.servername) {
+		sockfd = pp_client_connect(user_param.servername, port);
+		if (sockfd < 0)
+			return 1;
+		rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param);
+	} else {
+		sockfd = pp_server_connect(port);
+		if (sockfd < 0)
+			return 1;
+		rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param);
+	}
+
+	if (!rem_dest)
+		return 1;
+
+	printf(" remote address: LID %#04x, QPN %#06x, PSN %#06x\n",
+	       rem_dest->lid, rem_dest->qpn, rem_dest->psn);
+	if (user_param.gid_index > -1) {
+		printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+		       rem_dest->dgid.raw[0],rem_dest->dgid.raw[1],
+		       rem_dest->dgid.raw[2], rem_dest->dgid.raw[3], rem_dest->dgid.raw[4],
+		       rem_dest->dgid.raw[5], rem_dest->dgid.raw[6], rem_dest->dgid.raw[7],
+		       rem_dest->dgid.raw[8], rem_dest->dgid.raw[9], rem_dest->dgid.raw[10],
+		       rem_dest->dgid.raw[11], rem_dest->dgid.raw[12], rem_dest->dgid.raw[13],
+		       rem_dest->dgid.raw[14], rem_dest->dgid.raw[15]);
+	}
+
+	if (pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest, &user_param))
+		return 1;
+
+	/* An additional handshake is required *after* moving qp to RTR.
+	   Arbitrarily reuse exch_dest for this purpose. */
+	if (user_param.servername) {
+		rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param);
+	} else {
+		rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param);
+	}
+	/* rem_dest->qpn is read below for UD, so a failed exchange must stop here */
+	if (!rem_dest)
+		return 1;
+	if (user_param.use_event) {
+		printf("Test with events.\n");
+		if (ibv_req_notify_cq(ctx->cq, 0)) {
+			fprintf(stderr, "Couldn't request CQ notification\n");
+			return 1;
+		}
+	}
+	printf("------------------------------------------------------------------\n");
+	printf(" #bytes #iterations BW peak[MB/sec] BW average[MB/sec] \n");
+
+	tposted = malloc(user_param.iters * sizeof *tposted);
+
+	if (!tposted) {
+		perror("malloc");
+		return 1;
+	}
+
+	tcompleted = malloc(user_param.iters * sizeof *tcompleted);
+
+	if (!tcompleted) {
+		perror("malloc");
+		return 1;
+	}
+	/* send */
+	if (user_param.connection_type == UD) {
+		/* UD sends start 40 bytes into the buffer */
+		ctx->list.addr = (uintptr_t) ctx->buf + 40;
+		ctx->wr.wr.ud.ah = ctx->ah;
+		ctx->wr.wr.ud.remote_qpn = rem_dest->qpn;
+		ctx->wr.wr.ud.remote_qkey = 0x11111111;
+		if (user_param.use_mcg) {
+			ctx->wr.wr.ud.remote_qpn = 0xffffff;
+		} else {
+			ctx->wr.wr.ud.remote_qpn = rem_dest->qpn;
+		}
+	} else
+		ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id = PINGPONG_SEND_WRID;
+	ctx->wr.sg_list = &ctx->list;
+	ctx->wr.num_sge = 1;
+	ctx->wr.opcode = IBV_WR_SEND;
+	ctx->wr.next = NULL;
+
+	/* receive */
+	ctx->rwr.wr_id = PINGPONG_RECV_WRID;
+	ctx->rwr.sg_list = &ctx->recv_list;
+	ctx->rwr.num_sge = 1;
+	ctx->rwr.next = NULL;
+	ctx->recv_list.addr = (uintptr_t) ctx->buf;
+	ctx->recv_list.lkey = ctx->mr->lkey;
+
+	if (user_param.all == ALL) {
+		/* UD stops at 2^11 bytes, or 2^10 when a GID index is set */
+		if (user_param.connection_type == UD) {
+			if (user_param.gid_index < 0) {
+				size_max_pow = 12;
+			} else {
+				size_max_pow = 11;
+			}
+		}
+
+		for (i = 1; i < size_max_pow ; ++i) {
+			size = 1 << i;
+			if (user_param.duplex) {
+				if(run_iter_bi(ctx, &user_param, rem_dest, size))
+					return 17;
+			} else {
+				if(run_iter_uni(ctx, &user_param, rem_dest, size))
+					return 17;
+			}
+			if (user_param.servername) {
+				print_report(user_param.iters, size, user_param.duplex, tposted, tcompleted, noPeak, no_cpu_freq_fail);
+				/* sync again for the sake of UC/UC */
+				rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param);
+			} else
+				rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param);
+		}
+	} else {
+		if (user_param.duplex) {
+			if (run_iter_bi(ctx, &user_param, rem_dest, size))
+				return 18;
+		}
+		else {
+			if(run_iter_uni(ctx, &user_param, rem_dest, size))
+				return 18;
+		}
+
+		if (user_param.servername)
+			print_report(user_param.iters, size, user_param.duplex, tposted, tcompleted, noPeak, no_cpu_freq_fail);
+	}
+
+	/* close sockets */
+	if (user_param.servername)
+		rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param);
+	else
+		rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param);
+
+	if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+		perror("write");
+		fprintf(stderr, "Couldn't write to socket\n");
+		return 1;
+	}
+	close(sockfd);
+
+	free(tposted);
+	free(tcompleted);
+
+	printf("------------------------------------------------------------------\n");
+	return 0;
+}
diff --git a/send_lat.c b/send_lat.c
new file mode 100755
index 0000000..82493b2
--- /dev/null
+++ b/send_lat.c
@@ -0,0 +1,1375 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler)
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+
+#include <infiniband/verbs.h>
+
+#include "get_clock.h"
+
+/* Work request ids for send and receive work requests. */
+#define PINGPONG_SEND_WRID 1
+#define PINGPONG_RECV_WRID 2
+/* Values stored in user_parameters.connection_type. */
+#define RC 0
+#define UC 1
+#define UD 3
+#define VERSION 1.1
+#define SIGNAL 1
+/* Inline size limit - presumably the upper bound for inline_size; confirm against main(). */
+#define MAX_INLINE 400
+/* Multicast group LID and GID initializer (16 raw GID bytes). */
+#define MCG_LID 0xc001
+#define MCG_GID {255,1,0,0,0,2,201,133,0,0,0,0,0,0,0,0}
+/* Service level - presumably set from the command line; the parser is outside this chunk. */
+static int sl = 0;
+static int page_size;
+/* Cycle stamps - presumably one per measured iteration; allocation is outside this chunk. */
+cycles_t *tstamp;
+/*
+ * Test options. The key-exchange helpers below read gid_index to choose
+ * the message format; the meaning of the other fields is inferred from
+ * their names - TODO confirm against the option parser.
+ */
+struct user_parameters {
+ /* peer host name - presumably NULL on the server side; confirm */
+ const char *servername;
+ /* one of RC / UC / UD */
+ int connection_type;
+ int mtu;
+ int signal_comp;
+ int all; /* run all msg size */
+ int iters;
+ int tx_depth;
+ int use_event;
+ int inline_size;
+ int use_mcg;
+ int qp_timeout;
+ int gid_index; /* if value not negative, we use gid AND gid_index=value */
+};
+
+/* Output flags for the latency report; the consumer is outside this chunk. */
+struct report_options {
+ int unsorted;
+ int histogram;
+ int cycles; /* report delta's in cycles, not microsec's */
+};
+
+
+/*
+ * Per-test verbs state: work requests with their scatter/gather entries,
+ * the verbs objects, and the data buffer. Field roles below are inferred
+ * from names and types; the initialisation code is outside this chunk.
+ */
+struct pingpong_context {
+ /* scatter/gather entries - presumably for wr and rwr respectively */
+ struct ibv_sge list;
+ struct ibv_sge recv_list;
+ /* send and receive work requests */
+ struct ibv_send_wr wr;
+ struct ibv_recv_wr rwr;
+ /* device context; read by pp_get_local_lid() */
+ struct ibv_context *context;
+ struct ibv_comp_channel *channel;
+ struct ibv_pd *pd;
+ struct ibv_mr *mr;
+ /* separate completion queues - presumably send (scq) and receive (rcq) */
+ struct ibv_cq *scq;
+ struct ibv_cq *rcq;
+ struct ibv_qp *qp;
+ struct ibv_ah *ah;
+ void *buf;
+ volatile char *post_buf;
+ volatile char *poll_buf;
+ int size;
+ int tx_depth;
+ union ibv_gid dgid;
+};
+
+/*
+ * Connection data of one side, exchanged as text by pp_write_keys() and
+ * pp_read_keys(). dgid is only transmitted when user_parameters.gid_index
+ * is not negative.
+ */
+struct pingpong_dest {
+ unsigned long long vaddr;
+ int lid;
+ int qpn;
+ int psn;
+ unsigned int rkey;
+ union ibv_gid dgid;
+};
+
+
+/*
+ * Look up the LID of `port` on the device behind ctx->context.
+ * Returns the LID, or 0 when the port query fails.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+	struct ibv_port_attr port_info;
+	int query_failed = ibv_query_port(ctx->context, port, &port_info);
+
+	return query_failed ? 0 : port_info.lid;
+}
+
+/*
+ * Find an IB device by name.
+ *
+ * ib_devname - device name to look for, or NULL to take the first device
+ *
+ * Returns the device, or NULL (after printing a message to stderr) when
+ * the device list is unavailable, empty, or holds no device of that name.
+ * The device list is not freed here because the returned pointer refers
+ * into it.
+ */
+static struct ibv_device *pp_find_dev(const char *ib_devname) {
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev = NULL;
+
+	dev_list = ibv_get_device_list(NULL);
+	if (!dev_list) {
+		/* the list is indexed below, so a NULL result cannot be used */
+		fprintf(stderr, "No IB devices found\n");
+		return NULL;
+	}
+
+	if (!ib_devname) {
+		ib_dev = dev_list[0];
+		if (!ib_dev)
+			fprintf(stderr, "No IB devices found\n");
+	} else {
+		/* the list is NULL-terminated, so ib_dev ends up NULL on no match */
+		for (; (ib_dev = *dev_list); ++dev_list)
+			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+				break;
+		if (!ib_dev)
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+	}
+	return ib_dev;
+}
+
+/*
+ * Wire format of the connection data sent over the TCP socket, as
+ * colon-separated hex fields: lid:qpn:psn:rkey:vaddr, followed by the 16
+ * GID bytes in the _GID variant. The *_SIZE macros are the message
+ * lengths including the terminating NUL.
+ * The vaddr field uses the "ll" length modifier: "L" is only defined for
+ * floating-point conversions in ISO C.
+ */
+#define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000")
+#define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016llx"
+#define KEY_MSG_SIZE_GID (sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00")
+#define KEY_PRINT_FMT_GID "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x"
+
+/*
+ * Format my_dest as a fixed-size text message and write it to sockfd.
+ * The GID variant of the format is used when user_parm->gid_index is not
+ * negative.
+ *
+ * Returns 0 on success, -1 when the formatted data does not fit the
+ * message or the write is short or fails.
+ */
+static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	char msg[KEY_MSG_SIZE_GID] = { 0 };	/* large enough for both formats */
+	size_t msg_len;
+	int n;
+
+	if (user_parm->gid_index < 0) {
+		msg_len = KEY_MSG_SIZE;
+		n = snprintf(msg, msg_len, KEY_PRINT_FMT, my_dest->lid, my_dest->qpn,
+			     my_dest->psn, my_dest->rkey, my_dest->vaddr);
+	} else {
+		msg_len = KEY_MSG_SIZE_GID;
+		n = snprintf(msg, msg_len, KEY_PRINT_FMT_GID, my_dest->lid, my_dest->qpn,
+			     my_dest->psn, my_dest->rkey, my_dest->vaddr,
+			     my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3],
+			     my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7],
+			     my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+			     my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]);
+	}
+
+	/*
+	 * The widths in the format are minimums: a value needing more hex
+	 * digits than its field used to overflow the buffer with sprintf.
+	 */
+	if (n < 0 || (size_t)n >= msg_len) {
+		fprintf(stderr, "Local address does not fit the key message\n");
+		return -1;
+	}
+
+	if (write(sockfd, msg, msg_len) != (ssize_t)msg_len) {
+		perror("client write");
+		fprintf(stderr, "Couldn't send local address\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse one hexadecimal field starting at *cursor. The field must be
+ * followed by `delim` (':' between fields, '\0' after the last one); on
+ * success *cursor is moved past a ':' delimiter.
+ *
+ * Returns 0 and stores the value in *value, or -1 when the field is empty
+ * or not followed by the expected delimiter.
+ */
+static int pp_parse_hex_field(const char **cursor, char delim, unsigned long long *value)
+{
+	char *end;
+
+	*value = strtoull(*cursor, &end, 16);
+	if (end == *cursor || *end != delim)
+		return -1;
+	*cursor = (delim != '\0') ? end + 1 : end;
+	return 0;
+}
+
+/*
+ * Read the peer's connection data from sockfd into rem_dest. The GID
+ * variant of the message is expected when user_parm->gid_index is not
+ * negative. my_dest is not used.
+ *
+ * The message comes from the network, so it is NUL-terminated locally and
+ * every field is checked before it is stored.
+ *
+ * Returns 0 on success, -1 on a short read or a malformed message.
+ */
+static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest,
+			struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
+{
+	if (user_parm->gid_index < 0) {
+		int parsed;
+		char msg[KEY_MSG_SIZE];
+
+		if (read(sockfd, msg, sizeof msg) != sizeof msg) {
+			perror("pp_read_keys");
+			fprintf(stderr, "Couldn't read remote address\n");
+			return -1;
+		}
+		/* a well-formed message already ends in NUL; enforce it for sscanf */
+		msg[sizeof msg - 1] = '\0';
+
+		parsed = sscanf(msg, KEY_PRINT_FMT, &rem_dest->lid, &rem_dest->qpn,
+				&rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr);
+
+		if (parsed != 5) {
+			fprintf(stderr, "Couldn't parse line <%.*s>\n",
+				(int)sizeof msg, msg);
+			return -1;
+		}
+
+		return 0;
+	} else {
+		char msg[KEY_MSG_SIZE_GID];
+		const char *cursor = msg;
+		unsigned long long header[5];	/* lid, qpn, psn, rkey, vaddr */
+		unsigned long long gid_byte;
+		int i;
+
+		if (read(sockfd, msg, sizeof msg) != sizeof msg) {
+			perror("pp_read_keys");
+			fprintf(stderr, "Couldn't read remote address\n");
+			return -1;
+		}
+		msg[sizeof msg - 1] = '\0';
+
+		for (i = 0; i < 5; ++i) {
+			if (pp_parse_hex_field(&cursor, ':', &header[i])) {
+				fprintf(stderr, "Couldn't parse line <%.*s>\n",
+					(int)sizeof msg, msg);
+				return -1;
+			}
+		}
+		rem_dest->lid = (int)header[0];		// LID
+		rem_dest->qpn = (int)header[1];		// QPN
+		rem_dest->psn = (int)header[2];		// PSN
+		rem_dest->rkey = (unsigned)header[3];	// RKEY
+		rem_dest->vaddr = header[4];		// VA
+
+		/* 16 GID bytes: the first 15 end in ':', the last ends the string */
+		for (i = 0; i < 16; ++i) {
+			if (pp_parse_hex_field(&cursor, (i < 15) ? ':' : '\0', &gid_byte)) {
+				fprintf(stderr, "Couldn't parse line <%.*s>\n",
+					(int)sizeof msg, msg);
+				return -1;
+			}
+			rem_dest->dgid.raw[i] = (unsigned char)gid_byte;
+		}
+		return 0;
+	}
+}
+
+/*
+ * Open a TCP connection to servername:port, trying each address returned
+ * by getaddrinfo() in turn.
+ *
+ * Returns the connected socket descriptor, or a negative value on failure
+ * (a message is printed to stderr for resolver and connect failures).
+ */
+static int pp_client_connect(const char *servername, int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints =
+	{
+		.ai_family = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int n;
+	int sockfd = -1;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(servername, service, &hints, &res);
+	/* the port string is only needed for the lookup */
+	free(service);
+
+	/* POSIX only promises a non-zero code on failure, not a negative one */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+		/* never hand a positive error code back as if it were a descriptor */
+		return (n < 0) ? n : -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+		return sockfd;
+	}
+	return sockfd;
+}
+
+/*
+ * Client side of the parameter exchange: send our own destination record
+ * first, then read the peer's record into rem_dest.
+ *
+ * Returns -1 if the write fails, otherwise the result of pp_read_keys().
+ */
+static int pp_client_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
+			       struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
+{
+	return pp_write_keys(sockfd, my_dest, user_parm)
+		? -1
+		: pp_read_keys(sockfd, my_dest, rem_dest, user_parm);
+}
+
+/*
+ * Listen on TCP `port` and accept exactly one client connection for the
+ * out-of-band exchange of connection parameters.
+ *
+ * The listening socket is closed before returning; only the accepted
+ * connection is handed back.
+ *
+ * Returns the connected socket descriptor (>= 0) on success, or a
+ * negative value on failure.
+ */
+static int pp_server_connect(int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags    = AI_PASSIVE,
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int sockfd = -1, connfd;
+	int n;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+	/* The port string is only needed for the lookup itself. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any nonzero value, and the
+	 * caller treats non-negative results as a valid descriptor. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		return n < 0 ? n : -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd < 0)
+			continue;
+
+		n = 1;
+		setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+		if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+			break;
+		close(sockfd);
+		sockfd = -1;
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return sockfd;
+	}
+
+	if (listen(sockfd, 1) < 0) {
+		perror("server listen");
+		close(sockfd);
+		return -1;
+	}
+
+	connfd = accept(sockfd, NULL, NULL);
+	if (connfd < 0) {
+		perror("server accept");
+		fprintf(stderr, "accept() failed\n");
+	}
+
+	/* The listening socket is no longer needed either way. */
+	close(sockfd);
+	return connfd;
+}
+
+/*
+ * Server side of the parameter exchange: read the peer's destination
+ * record into rem_dest first, then answer with our own.
+ *
+ * Returns -1 if the read fails, otherwise the result of pp_write_keys().
+ */
+static int pp_server_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
+			       struct pingpong_dest* rem_dest, struct user_parameters *user_parm)
+{
+	return pp_read_keys(sockfd, my_dest, rem_dest, user_parm)
+		? -1
+		: pp_write_keys(sockfd, my_dest, user_parm);
+}
+
+/*
+ * Allocate a pingpong_context and create every verbs object the test
+ * needs: work buffer, device context, optional completion channel, PD,
+ * MR, send and receive CQs, and a QP moved to the INIT state.
+ *
+ *   ib_dev     device to open
+ *   size       message size in bytes; the work buffer holds two messages
+ *              (plus 40 bytes each for the GRH when the connection is UD)
+ *   tx_depth   depth used for both CQs and both QP work queues
+ *   port       IB port number the QP is bound to
+ *   user_parm  test parameters; mtu is filled in here when it is 0
+ *
+ * Returns the new context, or NULL on failure. On failure everything
+ * created so far is released again.
+ */
+static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
+					    int tx_depth, int port,struct user_parameters *user_parm) {
+	struct pingpong_context *ctx;
+	struct ibv_device_attr device_attr;
+	size_t buf_len;
+
+	ctx = malloc(sizeof *ctx);
+	if (!ctx)
+		return NULL;
+
+	ctx->size     = size;
+	ctx->tx_depth = tx_depth;
+
+	/* in case of UD need space for the GRH */
+	if (user_parm->connection_type == UD)
+		buf_len = ((size_t)size + 40) * 2;
+	else
+		buf_len = (size_t)size * 2;
+
+	ctx->buf = memalign(page_size, buf_len);
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		goto err_ctx;
+	}
+	memset(ctx->buf, 0, buf_len);
+
+	/* Last byte of the first and of the second message slot. */
+	ctx->post_buf = (char*)ctx->buf + (size - 1);
+	ctx->poll_buf = (char*)ctx->buf + (2 * size - 1);
+
+	ctx->context = ibv_open_device(ib_dev);
+	if (!ctx->context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		goto err_buf;
+	}
+
+	if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
+		if (ibv_query_device(ctx->context, &device_attr)) {
+			fprintf(stderr, "Failed to query device props\n");
+			goto err_context;
+		}
+		/* NOTE(review): part id 23108 is presumably an older HCA that
+		 * needs the smaller MTU; confirm against the vendor id list. */
+		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1)
+			user_parm->mtu = 1024;
+		else
+			user_parm->mtu = 2048;
+	}
+
+	if (user_parm->use_event) {
+		ctx->channel = ibv_create_comp_channel(ctx->context);
+		if (!ctx->channel) {
+			fprintf(stderr, "Couldn't create completion channel\n");
+			goto err_context;
+		}
+	} else {
+		ctx->channel = NULL;
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		goto err_channel;
+	}
+
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, buf_len,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't allocate MR\n");
+		goto err_pd;
+	}
+
+	ctx->scq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0);
+	if (!ctx->scq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		goto err_mr;
+	}
+	ctx->rcq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0);
+	if (!ctx->rcq) {
+		fprintf(stderr, "Couldn't create Recieve CQ\n");
+		goto err_scq;
+	}
+
+	{
+		struct ibv_qp_init_attr init_attr;
+
+		memset(&init_attr, 0, sizeof init_attr);
+		init_attr.send_cq = ctx->scq;
+		init_attr.recv_cq = ctx->rcq;
+		init_attr.cap.max_send_wr  = tx_depth;
+		/* Work around:  driver doesnt support
+		 * recv_wr = 0 */
+		init_attr.cap.max_recv_wr  = tx_depth;
+		init_attr.cap.max_send_sge = 1;
+		init_attr.cap.max_recv_sge = 1;
+		init_attr.cap.max_inline_data = user_parm->inline_size;
+		switch (user_parm->connection_type) {
+		case RC :
+			init_attr.qp_type = IBV_QPT_RC;
+			break;
+		case UC :
+			init_attr.qp_type = IBV_QPT_UC;
+			break;
+		case UD :
+			init_attr.qp_type = IBV_QPT_UD;
+			break;
+		default:
+			fprintf(stderr, "Unknown connection type %d \n",user_parm->connection_type);
+			goto err_rcq;
+		}
+		init_attr.sq_sig_all = 0;
+		ctx->qp = ibv_create_qp(ctx->pd, &init_attr);
+		if (!ctx->qp)  {
+			fprintf(stderr, "Couldn't create QP\n");
+			goto err_rcq;
+		}
+	}
+
+	{
+		struct ibv_qp_attr qp_attr;
+
+		/* Size taken from the object itself so the whole struct is
+		 * cleared, not just a prefix of it. */
+		memset(&qp_attr, 0, sizeof qp_attr);
+		qp_attr.qp_state   = IBV_QPS_INIT;
+		qp_attr.pkey_index = 0;
+		qp_attr.port_num   = port;
+
+		if (user_parm->connection_type == UD) {
+			qp_attr.qkey = 0x11111111;
+
+			if (ibv_modify_qp(ctx->qp, &qp_attr,
+					  IBV_QP_STATE              |
+					  IBV_QP_PKEY_INDEX         |
+					  IBV_QP_PORT               |
+					  IBV_QP_QKEY)) {
+				fprintf(stderr, "Failed to modify UD QP to INIT\n");
+				goto err_qp;
+			}
+
+			if (user_parm->use_mcg) {
+				union ibv_gid gid;
+				uint8_t mcg_gid[16] = MCG_GID;
+				uint32_t qp_num = ctx->qp->qp_num;
+
+				/* use the local QP number as part of the mcg */
+				mcg_gid[11] = (user_parm->servername) ? 0 : 1;
+				/* memcpy avoids a misaligned, type-punned store. */
+				memcpy(&mcg_gid[12], &qp_num, sizeof qp_num);
+				memcpy(gid.raw, mcg_gid, 16);
+
+				if (ibv_attach_mcast(ctx->qp, &gid, MCG_LID)) {
+					fprintf(stderr, "Couldn't attach QP to mcg\n");
+					goto err_qp;
+				}
+			}
+		} else {
+			qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+
+			if (ibv_modify_qp(ctx->qp, &qp_attr,
+					  IBV_QP_STATE              |
+					  IBV_QP_PKEY_INDEX         |
+					  IBV_QP_PORT               |
+					  IBV_QP_ACCESS_FLAGS)) {
+				fprintf(stderr, "Failed to modify QP to INIT\n");
+				goto err_qp;
+			}
+		}
+	}
+
+	/* Template send work request; sg_list contents are set per run. */
+	ctx->wr.wr_id      = PINGPONG_SEND_WRID;
+	ctx->wr.sg_list    = &ctx->list;
+	ctx->wr.num_sge    = 1;
+	ctx->wr.opcode     = IBV_WR_SEND;
+	ctx->wr.next       = NULL;
+	/* Template receive work request. */
+	ctx->rwr.wr_id      = PINGPONG_RECV_WRID;
+	ctx->rwr.sg_list    = &ctx->recv_list;
+	ctx->rwr.num_sge    = 1;
+	ctx->rwr.next       = NULL;
+	return ctx;
+
+	/* Unwind in reverse order of creation. */
+err_qp:
+	ibv_destroy_qp(ctx->qp);
+err_rcq:
+	ibv_destroy_cq(ctx->rcq);
+err_scq:
+	ibv_destroy_cq(ctx->scq);
+err_mr:
+	ibv_dereg_mr(ctx->mr);
+err_pd:
+	ibv_dealloc_pd(ctx->pd);
+err_channel:
+	if (ctx->channel)
+		ibv_destroy_comp_channel(ctx->channel);
+err_context:
+	ibv_close_device(ctx->context);
+err_buf:
+	free(ctx->buf);
+err_ctx:
+	free(ctx);
+	return NULL;
+}
+
+/*
+ * Move the QP from INIT through RTR to RTS using the peer's parameters,
+ * create the address handle for UD, and pre-post receive buffers.
+ *
+ *   ctx        context created by pp_init_ctx()
+ *   port       local IB port number
+ *   my_psn     initial send PSN for this side
+ *   dest       peer's LID/QPN/PSN/GID as received over the socket
+ *   user_parm  test parameters (connection type, mtu, gid index, ...)
+ *
+ * Returns 0 on success, 1 if a QP transition, MTU check or AH creation
+ * fails, 14 if posting a receive fails.
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
+			  struct pingpong_dest *dest,struct user_parameters *user_parm)
+{
+	struct ibv_qp_attr attr;
+
+	memset(&attr, 0, sizeof attr);
+	attr.qp_state = IBV_QPS_RTR;
+
+	if (user_parm->connection_type != UD) {
+		switch (user_parm->mtu) {
+		case 256 :
+			attr.path_mtu = IBV_MTU_256;
+			break;
+		case 512 :
+			attr.path_mtu = IBV_MTU_512;
+			break;
+		case 1024 :
+			attr.path_mtu = IBV_MTU_1024;
+			break;
+		case 2048 :
+			attr.path_mtu = IBV_MTU_2048;
+			break;
+		case 4096 :
+			attr.path_mtu = IBV_MTU_4096;
+			break;
+		default:
+			/* Without this, path_mtu stays 0 and the RTR
+			 * transition fails with an unrelated message. */
+			fprintf(stderr, "Invalid MTU %d (must be 256, 512, 1024, 2048 or 4096)\n",
+				user_parm->mtu);
+			return 1;
+		}
+		printf("Mtu : %d\n", user_parm->mtu);
+		attr.dest_qp_num = dest->qpn;
+		attr.rq_psn      = dest->psn;
+	}
+	if (user_parm->connection_type == RC) {
+		attr.max_dest_rd_atomic = 1;
+		attr.min_rnr_timer      = 12;
+	}
+
+	if (user_parm->gid_index < 0) {
+		/* LID-routed path. */
+		attr.ah_attr.is_global = 0;
+		attr.ah_attr.dlid      = dest->lid;
+		attr.ah_attr.sl        = sl;
+	} else {
+		/* GID-addressed path. */
+		attr.ah_attr.is_global     = 1;
+		attr.ah_attr.grh.dgid      = dest->dgid;
+		attr.ah_attr.grh.hop_limit = 1;
+		attr.ah_attr.sl            = 0;
+	}
+	attr.ah_attr.src_path_bits = 0;
+	attr.ah_attr.port_num      = port;
+
+	if ((user_parm->connection_type == UD) && (user_parm->use_mcg)) {
+		uint8_t mcg_gid[16] = MCG_GID;
+		uint32_t peer_qpn = dest->qpn;
+
+		/* send the message to the mcg of the other side */
+		mcg_gid[11] = (user_parm->servername) ? 1 : 0;
+		/* memcpy avoids a misaligned, type-punned store. */
+		memcpy(&mcg_gid[12], &peer_qpn, sizeof peer_qpn);
+
+		attr.ah_attr.dlid           = MCG_LID;
+		attr.ah_attr.is_global      = 1;
+		attr.ah_attr.grh.sgid_index = 0;
+		memcpy(attr.ah_attr.grh.dgid.raw, mcg_gid, 16);
+	}
+
+	/* INIT -> RTR */
+	if (user_parm->connection_type == RC) {
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_AV                 |
+				  IBV_QP_PATH_MTU           |
+				  IBV_QP_DEST_QPN           |
+				  IBV_QP_RQ_PSN             |
+				  IBV_QP_MIN_RNR_TIMER      |
+				  IBV_QP_MAX_DEST_RD_ATOMIC)) {
+			fprintf(stderr, "Failed to modify RC QP to RTR\n");
+			return 1;
+		}
+	} else if (user_parm->connection_type == UC) {
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_AV                 |
+				  IBV_QP_PATH_MTU           |
+				  IBV_QP_DEST_QPN           |
+				  IBV_QP_RQ_PSN)) {
+			fprintf(stderr, "Failed to modify UC QP to RTR\n");
+			return 1;
+		}
+	} else {
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE )) {
+			fprintf(stderr, "Failed to modify UD QP to RTR\n");
+			return 1;
+		}
+	}
+
+	/* RTR -> RTS */
+	attr.qp_state = IBV_QPS_RTS;
+	attr.sq_psn   = my_psn;
+	if (user_parm->connection_type == RC) {
+		attr.timeout       = user_parm->qp_timeout;
+		attr.retry_cnt     = 7;
+		attr.rnr_retry     = 7;
+		attr.max_rd_atomic = 1;
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_SQ_PSN             |
+				  IBV_QP_TIMEOUT            |
+				  IBV_QP_RETRY_CNT          |
+				  IBV_QP_RNR_RETRY          |
+				  IBV_QP_MAX_QP_RD_ATOMIC)) {
+			fprintf(stderr, "Failed to modify RC QP to RTS\n");
+			return 1;
+		}
+	} else { /*both UC and UD */
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_SQ_PSN)) {
+			fprintf(stderr, "Failed to modify UC/UD QP to RTS\n");
+			return 1;
+		}
+	}
+
+	if (user_parm->connection_type == UD) {
+		ctx->ah = ibv_create_ah(ctx->pd, &attr.ah_attr);
+		if (!ctx->ah) {
+			fprintf(stderr, "Failed to create AH for UD\n");
+			return 1;
+		}
+	}
+
+	/* post recieve max msg size*/
+	{
+		int i;
+		struct ibv_recv_wr *bad_wr_recv;
+
+		ctx->recv_list.addr = (uintptr_t) ctx->buf;
+		/* UD receives carry a 40-byte GRH in front of the payload. */
+		ctx->recv_list.length = (user_parm->connection_type == UD)
+			? ctx->size + 40
+			: ctx->size;
+		ctx->recv_list.lkey = ctx->mr->lkey;
+
+		for (i = 0; i < user_parm->tx_depth / 2; ++i) {
+			if (ibv_post_recv(ctx->qp, &ctx->rwr, &bad_wr_recv)) {
+				fprintf(stderr, "Couldn't post recv: counter=%d\n",
+					i);
+				return 14;
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * Establish the connection between client and server.
+ *
+ * Local parameters (LID, QPN, PSN, rkey, buffer address and, when a GID
+ * index is configured, the GID) are exchanged with the peer over a TCP
+ * socket, the QP is connected with pp_connect_ctx(), and a second
+ * exchange acts as a barrier once both sides have moved their QP.
+ *
+ *   servername  peer host name on the client, NULL on the server
+ *   ib_port     IB port number
+ *   port        TCP port used for the exchange
+ *   rem_dest    filled with the peer's parameters
+ *
+ * Returns 0 on success and a nonzero value on failure. The socket is
+ * closed on every path.
+ */
+static int pp_open_port(struct pingpong_context *ctx, const char * servername,
+			int ib_port, int port, struct pingpong_dest *rem_dest,struct user_parameters *user_parm)
+{
+	char addr_fmt[] = "%8s address: LID %#04x QPN %#06x PSN %#06x\n";
+	struct pingpong_dest my_dest;
+	int sockfd;
+	int rc;
+	int i;
+	union ibv_gid gid;
+
+	/* Keep the GID well-defined even when no GID index is in use; it is
+	 * still copied into my_dest below. */
+	memset(&gid, 0, sizeof gid);
+
+	/* Create connection between client and server.
+	 * We do it by exchanging data over a TCP socket connection. */
+
+	my_dest.lid = pp_get_local_lid(ctx, ib_port);
+	my_dest.qpn = ctx->qp->qp_num;
+	my_dest.psn = lrand48() & 0xffffff;
+
+	if (user_parm->gid_index < 0) {/*We do not fail test upon lid in RDMA0E/Eth conf*/
+		if (!my_dest.lid) {
+			fprintf(stderr, "Local lid 0x0 detected. Is an SM running?\n");
+			return -1;
+		}
+	} else {
+		/* Same "index >= 0 means GID in use" test as everywhere else. */
+		if (ibv_query_gid(ctx->context, ib_port, user_parm->gid_index, &gid)) {
+			fprintf(stderr, "Couldn't query GID index %d on port %d\n",
+				user_parm->gid_index, ib_port);
+			return -1;
+		}
+		ctx->dgid = gid;
+	}
+	my_dest.dgid  = gid;
+	my_dest.rkey  = ctx->mr->rkey;
+	my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+	printf(addr_fmt, "local", my_dest.lid, my_dest.qpn, my_dest.psn);
+	if (user_parm->gid_index > -1) {
+		printf(" GID: ");
+		for (i = 0; i < 16; ++i)
+			printf("%02x%s", my_dest.dgid.raw[i], i < 15 ? ":" : "\n");
+	}
+
+	sockfd = servername ? pp_client_connect(servername, port) :
+		pp_server_connect(port);
+
+	if (sockfd < 0) {
+		/* servername is NULL on the server; never hand NULL to %s. */
+		printf("pp_connect_sock(%s,%d) failed (%d)!\n",
+		       servername ? servername : "(null)", port, sockfd);
+		return sockfd;
+	}
+
+	rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) :
+		pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm);
+	if (rc)
+		goto out;
+
+	printf(addr_fmt, "remote", rem_dest->lid, rem_dest->qpn, rem_dest->psn);
+	if (user_parm->gid_index > -1) {
+		printf(" GID: ");
+		for (i = 0; i < 16; ++i)
+			printf("%02x%s", rem_dest->dgid.raw[i], i < 15 ? ":" : "\n");
+	}
+
+	if ((rc = pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest,user_parm)))
+		goto out;
+
+	/* An additional handshake is required *after* moving qp to RTR.
+	 * Arbitrarily reuse exch_dest for this purpose.
+	 */
+
+	rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) :
+		pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm);
+	if (rc)
+		goto out;
+
+	if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+		perror("write");
+		fprintf(stderr, "Couldn't write to socket\n");
+		rc = 1;
+	}
+
+out:
+	close(sockfd);
+	return rc;
+}
+
+/*
+ * Print the command-line synopsis and option summary to stdout.
+ * argv0 is the program name shown in the two usage lines.
+ */
+static void usage(const char *argv0)
+{
+	printf("Usage:\n");
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf(" %s <host> connect to server at <host>\n", argv0);
+	printf("\n");
+	printf("Options:\n");
+	printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n");
+	printf(" -c, --connection=<RC/UC/UD> connection type RC/UC/UD (default RC)\n");
+	printf(" -m, --mtu=<mtu> mtu size (256 - 4096. default for hermon is 2048)\n");
+	printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n");
+	printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n");
+	/* main() starts with a message size of 2, so say so. */
+	printf(" -s, --size=<size> size of message to exchange (default 2)\n");
+	printf(" -t, --tx-depth=<dep> size of tx queue (default 50)\n");
+	printf(" -l, --signal signal completion on each msg\n");
+	printf(" -a, --all Run sizes from 2 till 2^23\n");
+	printf(" -n, --iters=<iters> number of exchanges (at least 2, default 1000)\n");
+	printf(" -I, --inline_size=<size> max size of message to be sent in inline mode (default 400)\n");
+	printf(" -u, --qp-timeout=<timeout> QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n");
+	printf(" -S, --sl=<sl> SL (default 0)\n");
+	printf(" -x, --gid-index=<index> test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n");
+	printf(" -C, --report-cycles report times in cpu cycle units (default microseconds)\n");
+	printf(" -H, --report-histogram print out all results (default print summary only)\n");
+	printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n");
+	printf(" -V, --version display version number\n");
+	printf(" -e, --events sleep on CQ events (default poll)\n");
+	printf(" -g, --mcg send messages to multicast group (only available in UD connection)\n");
+	printf(" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n");
+}
+
+/*
+ * Median of n samples in delta[], which must already be sorted.
+ *
+ * With an odd number of samples the median is the middle element.
+ * With an even number it is the mean of the two middle elements.
+ */
+static inline cycles_t get_median(int n, cycles_t delta[])
+{
+	const int mid = n / 2;
+
+	if (n % 2 == 0)
+		return (delta[mid] + delta[mid - 1]) / 2;
+
+	return delta[mid];
+}
+
+/*
+ * qsort() comparator ordering cycles_t values ascending.
+ * Returns -1, 0 or 1 when *aptr is less than, equal to or greater
+ * than *bptr.
+ */
+static int cycles_compare(const void * aptr, const void * bptr)
+{
+	const cycles_t lhs = *(const cycles_t *)aptr;
+	const cycles_t rhs = *(const cycles_t *)bptr;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Turn the timestamps of one run into latency figures and print them.
+ *
+ * Differences between consecutive timestamps are halved when printed,
+ * and shown either in cycles or divided by the value returned by
+ * get_cpu_mhz(), depending on options->cycles.
+ *
+ *   options           which listings to print (unsorted, histogram)
+ *   iters             number of valid entries in tstamp[]; must be >= 2
+ *   tstamp            timestamps taken before each send
+ *   size              message size, printed in the summary line
+ *   no_cpu_freq_fail  passed through to get_cpu_mhz()
+ *
+ * Prints a summary line: size, iterations, minimum, maximum, median.
+ */
+static void print_report(struct report_options * options,
+			 unsigned int iters, cycles_t *tstamp,int size, int no_cpu_freq_fail)
+{
+	double cycles_to_units;
+	cycles_t median;
+	unsigned int i;
+	unsigned int n_delta;
+	const char* units;
+	cycles_t *delta;
+
+	/* iters - 1 would wrap around for fewer than two samples. */
+	if (iters < 2) {
+		fprintf(stderr, "Need at least 2 iterations to report latency\n");
+		return;
+	}
+	n_delta = iters - 1;
+
+	delta = malloc(n_delta * sizeof *delta);
+	if (!delta) {
+		perror("malloc");
+		return;
+	}
+
+	for (i = 0; i < n_delta; ++i)
+		delta[i] = tstamp[i + 1] - tstamp[i];
+
+	if (options->cycles) {
+		cycles_to_units = 1;
+		units = "cycles";
+	} else {
+		cycles_to_units = get_cpu_mhz(no_cpu_freq_fail);
+		units = "usec";
+	}
+
+	/* Raw listing in measurement order, before sorting destroys it. */
+	if (options->unsorted) {
+		printf("#, %s\n", units);
+		for (i = 0; i < n_delta; ++i)
+			printf("%u, %g\n", i + 1, delta[i] / cycles_to_units / 2);
+	}
+
+	qsort(delta, n_delta, sizeof *delta, cycles_compare);
+
+	if (options->histogram) {
+		printf("#, %s\n", units);
+		for (i = 0; i < n_delta; ++i)
+			printf("%u, %g\n", i + 1, delta[i] / cycles_to_units / 2);
+	}
+
+	median = get_median(n_delta, delta);
+	printf("%7d %u %7.2f %7.2f %7.2f\n",
+	       size,iters,delta[0] / cycles_to_units / 2,
+	       delta[n_delta - 1] / cycles_to_units / 2,median / cycles_to_units / 2);
+	free(delta);
+}
+
+/*
+ * Run one ping-pong measurement for messages of `size` bytes.
+ *
+ * The loop runs until user_param->iters sends have been posted and the
+ * same number of receive completions have been handled. A cycle count is
+ * stored in tstamp[] (declared outside this function) just before each
+ * send is posted.
+ *
+ * Returns 0 on success, otherwise a code for the step that failed:
+ * 1 completion-event handling, 11 post send, 12 CQ poll,
+ * 13 completion with error status, 15 post receive.
+ */
+int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param,
+ struct pingpong_dest *rem_dest, int size)
+{
+ struct ibv_qp *qp;
+ struct ibv_send_wr *wr;
+ struct ibv_recv_wr rwr;
+ struct ibv_recv_wr *bad_wr_recv;
+ volatile char *poll_buf;
+ volatile char *post_buf;
+
+ /* scnt: sends posted, rcnt: receives handled, ccnt: loop passes since
+ * the last signaled send, poll: set when a send completion is awaited */
+ int scnt, rcnt, ccnt, poll;
+ int iters;
+ int tx_depth;
+ iters = user_param->iters;
+ tx_depth = user_param->tx_depth;
+
+
+ /* Clamp oversized UD messages: 2048 without a GID index, 1024 with one. */
+ if (user_param->connection_type==UD) {
+ if (size > 2048) {
+ if (user_param->gid_index < 0) {
+ size = 2048;
+ } else {
+ size = 1024;
+ }
+ }
+ }
+
+ ///send //
+ wr = &ctx->wr;
+ /* For UD the send data starts 40 bytes into the buffer. */
+ if (user_param->connection_type==UD) {
+ ctx->list.addr = (uintptr_t) ctx->buf + 40;
+ } else {
+ ctx->list.addr = (uintptr_t) ctx->buf;
+ }
+ ctx->list.length = size;
+ ctx->list.lkey = ctx->mr->lkey;
+ if (user_param->connection_type==UD) {
+ ctx->wr.wr.ud.ah = ctx->ah;
+ ctx->wr.wr.ud.remote_qpn = rem_dest->qpn;
+ ctx->wr.wr.ud.remote_qkey = 0x11111111;
+ /* remote_qpn set above is overwritten here in both branches. */
+ if (user_param->use_mcg) {
+ ctx->wr.wr.ud.remote_qpn = 0xffffff;
+ } else {
+ ctx->wr.wr.ud.remote_qpn = rem_dest->qpn;
+ }
+ }
+ /// receive //
+ /* Local copy of the receive template; its sg_list still points at
+ * ctx->recv_list, which is filled in below. */
+ rwr = ctx->rwr;
+ ctx->recv_list.addr = (uintptr_t) ctx->buf;
+ if (user_param->connection_type==UD) {
+ ctx->recv_list.length = ctx->size + 40;
+ } else {
+ ctx->recv_list.length = ctx->size;
+ }
+
+ ctx->recv_list.lkey = ctx->mr->lkey;
+
+ scnt = 0;
+ rcnt = 0;
+ ccnt = 0;
+ poll = 0;
+ poll_buf = ctx->poll_buf;
+ post_buf = ctx->post_buf;
+ qp = ctx->qp;
+ /* Sends start unsignaled; messages that fit are sent inline. */
+ if (size > user_param->inline_size || size == 0) {/* complaince to perf_main don't signal*/
+ ctx->wr.send_flags = 0;
+ } else {
+ ctx->wr.send_flags = IBV_SEND_INLINE;
+ }
+
+ while (scnt < iters || rcnt < iters) {
+ /* Receive step. Skipped only on the client (servername set)
+ * before it has posted its first send. */
+ if (rcnt < iters && !(scnt < 1 && user_param->servername)) {
+ int ne;
+ struct ibv_wc wc;
+ /*Server is polling on recieve first */
+ ++rcnt;
+ if (ibv_post_recv(qp, &rwr, &bad_wr_recv)) {
+ fprintf(stderr, "Couldn't post recv: rcnt=%d\n",
+ rcnt);
+ return 15;
+ }
+ if (user_param->use_event) {
+ struct ibv_cq *ev_cq;
+ void *ev_ctx;
+
+ if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
+ fprintf(stderr, "Failed to get receive cq_event\n");
+ return 1;
+ }
+
+ if (ev_cq != ctx->rcq) {
+ fprintf(stderr, "CQ event for unknown RCQ %p\n", ev_cq);
+ return 1;
+ }
+
+ if (ibv_req_notify_cq(ctx->rcq, 0)) {
+ fprintf(stderr, "Couldn't request RCQ notification\n");
+ return 1;
+ }
+ }
+ /* Busy-poll until a completion arrives; in event mode the CQ
+ * is polled exactly once. */
+ do {
+ ne = ibv_poll_cq(ctx->rcq, 1, &wc);
+ } while (!user_param->use_event && ne < 1);
+
+ if (ne < 0) {
+ fprintf(stderr, "Poll Recieve CQ failed %d\n", ne);
+ return 12;
+ }
+ /* NOTE(review): in event mode ne may be 0 here, in which case wc
+ * was never written before its status is read - confirm. */
+ if (wc.status != IBV_WC_SUCCESS) {
+ fprintf(stderr, "Recieve Completion wth error at %s:\n",
+ user_param->servername ? "client" : "server");
+ fprintf(stderr, "Failed status %d: wr_id %d\n",
+ wc.status, (int) wc.wr_id);
+ fprintf(stderr, "scnt=%d, rcnt=%d, ccnt=%d\n",
+ scnt, rcnt, ccnt);
+ return 13;
+ }
+ }
+ /* Send step. */
+ if (scnt < iters ) {
+ /* Request a signaled completion when ccnt reaches tx_depth - 2,
+ * on every send in SIGNAL mode, and on the final send. */
+ if (ccnt == (tx_depth - 2) || (user_param->signal_comp == SIGNAL)
+ || (scnt == (iters - 1)) ) {
+ ccnt = 0;
+ poll=1;
+ if (size > user_param->inline_size || size == 0) {/* complaince to perf_main */
+ ctx->wr.send_flags = IBV_SEND_SIGNALED;
+ } else {
+ ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+ }
+
+ }
+ struct ibv_send_wr *bad_wr;
+ /* client post first */
+ /* Timestamp taken immediately before the send is posted. */
+ tstamp[scnt] = get_cycles();
+ *post_buf = (char)++scnt;
+ if (ibv_post_send(qp, wr, &bad_wr)) {
+ fprintf(stderr, "Couldn't post send: scnt=%d\n",
+ scnt);
+ return 11;
+ }
+ }
+ /* Wait for the completion of the signaled send posted above. */
+ if (poll == 1) {
+ struct ibv_wc wc;
+ int ne;
+ if (user_param->use_event) {
+ struct ibv_cq *ev_cq;
+ void *ev_ctx;
+
+ if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
+ fprintf(stderr, "Failed to get send cq_event\n");
+ return 1;
+ }
+
+ if (ev_cq != ctx->scq) {
+ fprintf(stderr, "CQ event for unknown SCQ %p\n", ev_cq);
+ return 1;
+ }
+
+ if (ibv_req_notify_cq(ctx->scq, 0)) {
+ fprintf(stderr, "Couldn't request SCQ notification\n");
+ return 1;
+ }
+ }
+ /* poll on scq */
+ do {
+ ne = ibv_poll_cq(ctx->scq, 1, &wc);
+ } while (!user_param->use_event && ne < 1);
+
+ if (ne < 0) {
+ fprintf(stderr, "poll SCQ failed %d\n", ne);
+ return 12;
+ }
+ /* NOTE(review): same possibly-unwritten wc as on the receive side
+ * when ne is 0 in event mode - confirm. */
+ if (wc.status != IBV_WC_SUCCESS) {
+ fprintf(stderr, "Completion wth error at %s:\n",
+ user_param->servername ? "client" : "server");
+ fprintf(stderr, "Failed status %d: wr_id %d\n",
+ wc.status, (int) wc.wr_id);
+ fprintf(stderr, "scnt=%d, rcnt=%d, ccnt=%d\n",
+ scnt, rcnt, ccnt);
+ return 13;
+ }
+ poll = 0;
+ /* Back to unsignaled sends until the next threshold. */
+ if (size > user_param->inline_size || size == 0) {/* complaince to perf_main don't signal*/
+ ctx->wr.send_flags = 0;
+ } else {
+ ctx->wr.send_flags = IBV_SEND_INLINE;
+ }
+
+ }
+ ++ccnt;
+ }
+
+ return(0);
+}
+/*
+ * Entry point of the send latency test.
+ *
+ * Parses the command line, sets up the verbs context, connects to the
+ * peer (client when a host name is given, server otherwise), runs the
+ * ping-pong for one message size or for every power of two with --all,
+ * and prints one report line per size.
+ *
+ * Returns 0 on success; each failure point has its own nonzero code.
+ */
+int main(int argc, char *argv[])
+{
+	const char              *ib_devname = NULL;
+	int                      port = 18515;
+	int                      ib_port = 1;
+	int                      size = 2;
+	int                      i = 0;
+	int                      size_max_pow = 24;
+	struct report_options    report = {};
+
+	struct pingpong_context *ctx;
+	struct pingpong_dest     rem_dest;
+	struct ibv_device       *ib_dev;
+	struct user_parameters   user_param;
+	int                      no_cpu_freq_fail = 0;
+
+	/* init default values to user's parameters */
+	memset(&user_param, 0, sizeof(struct user_parameters));
+	user_param.mtu = 0;
+	user_param.iters = 1000;
+	user_param.tx_depth = 50;
+	user_param.servername = NULL;
+	user_param.use_event = 0;
+	user_param.use_mcg = 0;
+	user_param.inline_size = MAX_INLINE;
+	user_param.signal_comp = 0;
+	user_param.qp_timeout = 14;
+	user_param.gid_index = -1; /*gid will not be used*/
+
+	/* Parameter parsing. */
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "connection", .has_arg = 1, .val = 'c' },
+			{ .name = "mtu", .has_arg = 1, .val = 'm' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "tx-depth", .has_arg = 1, .val = 't' },
+			{ .name = "inline_size", .has_arg = 1, .val = 'I' },
+			{ .name = "qp-timeout", .has_arg = 1, .val = 'u' },
+			{ .name = "sl", .has_arg = 1, .val = 'S' },
+			{ .name = "gid-index", .has_arg = 1, .val = 'x' },
+			{ .name = "signal", .has_arg = 0, .val = 'l' },
+			{ .name = "all", .has_arg = 0, .val = 'a' },
+			{ .name = "report-cycles", .has_arg = 0, .val = 'C' },
+			{ .name = "report-histogram",.has_arg = 0, .val = 'H' },
+			{ .name = "report-unsorted",.has_arg = 0, .val = 'U' },
+			{ .name = "version", .has_arg = 0, .val = 'V' },
+			{ .name = "events", .has_arg = 0, .val = 'e' },
+			{ .name = "mcg", .has_arg = 0, .val = 'g' },
+			{ .name = "CPU-freq", .has_arg = 0, .val = 'F' },
+			{ 0 }
+		};
+		c = getopt_long(argc, argv, "p:c:m:d:i:s:n:t:I:u:S:x:laeCHUVgF", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			port = strtol(optarg, NULL, 0);
+			if (port < 0 || port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+		case 'c':
+			if (strcmp("UC",optarg)==0)
+				user_param.connection_type=UC;
+			if (strcmp("UD",optarg)==0)
+				user_param.connection_type=UD;
+			/* default is 0 for any other option RC*/
+			break;
+		case 'e':
+			++user_param.use_event;
+			break;
+		case 'g':
+			++user_param.use_mcg;
+			break;
+		case 'm':
+			user_param.mtu = strtol(optarg, NULL, 0);
+			break;
+		case 'l':
+			user_param.signal_comp = SIGNAL;
+			break;
+		case 'a':
+			user_param.all = SIGNAL;
+			break;
+		case 'V':
+			printf("perftest version : %.2f\n",VERSION);
+			return 0;
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+
+		case 'i':
+			ib_port = strtol(optarg, NULL, 0);
+			if (ib_port < 0) {
+				usage(argv[0]);
+				return 2;
+			}
+			break;
+
+		case 's':
+			size = strtol(optarg, NULL, 0);
+			if (size < 1) {
+				usage(argv[0]); return 3;
+			}
+			break;
+
+		case 'x':
+			user_param.gid_index = strtol(optarg, NULL, 0);
+			/* -1 means "no GID"; anything lower would be treated
+			 * inconsistently by the rest of the test. */
+			if (user_param.gid_index < -1 || user_param.gid_index > 63) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 't':
+			user_param.tx_depth = strtol(optarg, NULL, 0);
+			if (user_param.tx_depth < 1) {
+				usage(argv[0]); return 4;
+			}
+			break;
+
+		case 'I':
+			user_param.inline_size = strtol(optarg, NULL, 0);
+			if (user_param.inline_size < 0 ||
+			    user_param.inline_size > MAX_INLINE) {
+				usage(argv[0]);
+				return 19;
+			}
+			break;
+
+		case 'n':
+			user_param.iters = strtol(optarg, NULL, 0);
+			if (user_param.iters < 2) {
+				usage(argv[0]);
+				return 5;
+			}
+			break;
+
+		case 'C':
+			report.cycles = 1;
+			break;
+
+		case 'H':
+			report.histogram = 1;
+			break;
+
+		case 'U':
+			report.unsorted = 1;
+			break;
+
+		case 'F':
+			no_cpu_freq_fail = 1;
+			break;
+
+		case 'u':
+			user_param.qp_timeout = strtol(optarg, NULL, 0);
+			break;
+
+		case 'S': {
+			/* Range-check before storing so a negative service
+			 * level is rejected as well. */
+			long sl_arg = strtol(optarg, NULL, 0);
+
+			if (sl_arg < 0 || sl_arg > 15) { usage(argv[0]); return 6; }
+			sl = sl_arg;
+			break;
+		}
+
+		default:
+			usage(argv[0]);
+			return 7;
+		}
+	}
+
+	/* A single positional argument selects client mode. */
+	if (optind == argc - 1)
+		user_param.servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 6;
+	}
+
+	/*
+	 *  Done with parameter parsing. Perform setup.
+	 */
+	tstamp = malloc(user_param.iters * sizeof *tstamp);
+	if (!tstamp) {
+		perror("malloc");
+		return 10;
+	}
+	/* Print header data */
+	printf("------------------------------------------------------------------\n");
+	if (user_param.use_mcg && (user_param.connection_type == UD))
+		printf(" Send Latency Multicast Test\n");
+	else
+		printf(" Send Latency Test\n");
+	printf("Inline data is used up to %d bytes message\n", user_param.inline_size);
+	if (user_param.connection_type==RC) {
+		printf("Connection type : RC\n");
+	} else if (user_param.connection_type==UC) {
+		printf("Connection type : UC\n");
+	} else {
+		printf("Connection type : UD\n");
+	}
+	if (user_param.gid_index > -1) {
+		printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n");
+	}
+	if (user_param.all == 1) {
+		/*since we run all sizes lets allocate big enough buffer */
+		size = 8388608; /*2^23 */
+	}
+	if (user_param.connection_type == UD && size > 2048) {
+		printf("Max msg size in UD is 2048 changing to 2048\n");
+		size = 2048;
+	}
+	if (user_param.connection_type == UD && user_param.gid_index > -1 && size > 1024) {
+		printf("Max msg size in UD RDMAoE is 1024. changing to 1024\n");
+		size = 1024;
+	}
+
+	srand48(getpid() * time(NULL));
+	page_size = sysconf(_SC_PAGESIZE);
+
+	ib_dev = pp_find_dev(ib_devname);
+	if (!ib_dev)
+		return 7;
+
+	ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port,&user_param);
+	if (!ctx)
+		return 8;
+
+	if (pp_open_port(ctx, user_param.servername, ib_port, port, &rem_dest,&user_param))
+		return 9;
+
+	if (user_param.use_event) {
+		printf("Test with events.\n");
+		if (ibv_req_notify_cq(ctx->rcq, 0)) {
+			fprintf(stderr, "Couldn't request RCQ notification\n");
+			return 1;
+		}
+		if (ibv_req_notify_cq(ctx->scq, 0)) {
+			fprintf(stderr, "Couldn't request SCQ notification\n");
+			return 1;
+		}
+	}
+	printf("------------------------------------------------------------------\n");
+	printf(" #bytes #iterations t_min[usec] t_max[usec] t_typical[usec]\n");
+
+	if (user_param.all == 1) {
+		/* UD messages are capped, so stop at a lower power of two. */
+		if (user_param.connection_type==UD) {
+			if (user_param.gid_index < 0) {
+				size_max_pow = 12;
+			} else {
+				size_max_pow = 11;
+			}
+		}
+		for (i = 1; i < size_max_pow ; ++i) {
+			size = 1 << i;
+			if(run_iter(ctx, &user_param, &rem_dest, size))
+				return 17;
+
+			print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail);
+		}
+	} else {
+		if(run_iter(ctx, &user_param, &rem_dest, size))
+			return 18;
+		print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail);
+	}
+	printf("------------------------------------------------------------------\n");
+	free(tstamp);
+	return 0;
+}
diff --git a/write_bw.c b/write_bw.c
new file mode 100755
index 0000000..8988e18
--- /dev/null
+++ b/write_bw.c
@@ -0,0 +1,1182 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+
+#include <infiniband/verbs.h>
+
+#include "get_clock.h"
+
+/* NOTE(review): PINGPONG_RDMA_WRID is not referenced anywhere in write_bw.c;
+ * run_iter() uses the QP index as the work request id instead. */
+#define PINGPONG_RDMA_WRID 3
+/* Version number printed by the -V option. */
+#define VERSION 2.0
+/* Value stored in user_parameters.all when -a (run all message sizes) is given. */
+#define ALL 1
+/* Default inline threshold and the largest value accepted by -I, in bytes. */
+#define MAX_INLINE 400
+/* Values of user_parameters.connection_type. */
+#define RC 0
+#define UC 1
+
+/* Test configuration; main() fills it with defaults and command-line values. */
+struct user_parameters {
+ const char *servername; /* host to connect to; NULL when running as the server */
+ int connection_type; /* RC or UC */
+ int mtu; /* path MTU in bytes; 0 = not given, pp_init_ctx() picks a default */
+ int all; /* run all msg size */
+ int iters; /* number of messages each QP posts per message size */
+ int tx_depth; /* send queue depth requested for every QP */
+ int numofqps; /* number of QPs used by the test */
+ int maxpostsofqpiniteration; /* max posted-but-not-completed messages per QP */
+ int inline_size; /* messages up to this many bytes are posted inline */
+ int qp_timeout; /* timeout attribute applied to RC QPs when moving to RTS */
+ int gid_index; /* if value not negative, we use gid AND gid_index=value */
+};
+/* NOTE(review): this struct is not used anywhere in write_bw.c; the per-QP
+ * send/completion counters actually used live in struct pingpong_context. */
+struct extended_qp {
+ struct ibv_qp *qp;
+ int scnt, ccnt ;
+};
+/* Service level set by -S; used for the address vector when no GID is in use. */
+static int sl = 0;
+/* System page size (set in main()); alignment of the work buffer. */
+static int page_size;
+
+/* Per-message timestamps, iters * numofqps entries each, allocated in main().
+ * run_iter() records the cycle counter just before each post (tposted) and
+ * right after each polled completion (tcompleted); print_report() reads them. */
+cycles_t *tposted;
+cycles_t *tcompleted;
+/* All verbs resources of one test run; built by pp_init_ctx(). */
+struct pingpong_context {
+ struct ibv_context *context; /* opened device */
+ struct ibv_pd *pd; /* protection domain owning mr and the QPs */
+ struct ibv_mr *mr; /* registration covering all of buf */
+ struct ibv_cq *cq; /* single CQ used as send and recv CQ of every QP */
+ struct ibv_qp **qp; /* array of numofqps QPs */
+ void *buf; /* page-aligned work buffer of size * 2 * numofqps bytes */
+ unsigned size; /* message size the buffer was dimensioned for */
+ int tx_depth; /* send queue depth requested per QP */
+ struct ibv_sge list; /* scatter/gather entry reused for every post */
+ struct ibv_send_wr wr; /* work request reused for every post */
+ int *scnt; /* per-QP count of posted messages */
+ int *ccnt; /* per-QP count of completed messages */
+ union ibv_gid dgid; /* local GID, filled by main() when a GID index is given */
+};
+
+/* Description of one QP endpoint as exchanged over the TCP side channel. */
+struct pingpong_dest {
+ int lid; /* port LID */
+ int qpn; /* QP number */
+ int psn; /* initial packet sequence number */
+ unsigned rkey; /* remote key of the registered buffer */
+ unsigned long long vaddr; /* address inside the buffer that the peer writes to */
+ union ibv_gid dgid; /* port GID; only meaningful when a GID index is in use */
+};
+
+
+/*
+ * Return the LID of local port `port` on the device opened in `ctx`,
+ * or 0 when the port cannot be queried.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+	struct ibv_port_attr port_attr;
+	int query_failed = ibv_query_port(ctx->context, port, &port_attr);
+
+	return query_failed ? 0 : port_attr.lid;
+}
+
+/*
+ * Open the TCP side-channel connection to the server.
+ *
+ * servername  host name or address to connect to.
+ * port        TCP port number.
+ *
+ * Every address returned by getaddrinfo() is tried in turn until one
+ * connect() succeeds.
+ *
+ * Returns the connected socket descriptor, or a negative value on failure
+ * (an error message has already been printed for resolver and connect
+ * failures).
+ */
+static int pp_client_connect(const char *servername, int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int n;
+	int sockfd = -1;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(servername, service, &hints, &res);
+	/* The service string is only needed for the lookup; release it on every path. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value, not only negative ones. */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+		/* Callers only test for "< 0", so never hand back a positive error code. */
+		return n < 0 ? n : -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd < 0)
+			continue;
+		if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+			break;
+		close(sockfd);
+		sockfd = -1;
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0)
+		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+
+	return sockfd;
+}
+
+/*
+ * Client side of the side-channel address exchange: send the local QP
+ * description to the server over sockfd, then read and parse the reply.
+ *
+ * Wire format: one fixed-size, NUL-terminated text record of colon-separated
+ * hex fields, LID:QPN:PSN:RKEY:VADDR followed by the 16 GID bytes.
+ *
+ * my_dest    local description to send.
+ * user_parm  when gid_index is not negative the peer's GID bytes are parsed
+ *            too; otherwise the returned dgid is all zero.
+ *
+ * Returns a heap-allocated pingpong_dest that the caller must free(), or
+ * NULL on I/O, allocation or parse failure.
+ */
+struct pingpong_dest * pp_client_exch_dest(int sockfd,
+		const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	struct pingpong_dest *rem_dest = NULL;
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"];
+	unsigned lid, qpn, psn, rkey;
+	unsigned long long vaddr;
+	int parsed;
+	int off = 0;
+	int i;
+
+	/* Bounded formatting: an out-of-range field is truncated instead of
+	 * overrunning msg.  %llx is the standard spelling of the former %Lx. */
+	snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		 my_dest->lid, my_dest->qpn, my_dest->psn,my_dest->rkey,my_dest->vaddr,
+		 my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2],
+		 my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5],
+		 my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8],
+		 my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+		 my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14],
+		 my_dest->dgid.raw[15]);
+	if (write(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client write");
+		fprintf(stderr, "Couldn't send local address\n");
+		goto out;
+	}
+
+	if (read(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client read");
+		fprintf(stderr, "Couldn't read remote address\n");
+		goto out;
+	}
+	/* The record comes from the network: never trust it to be terminated. */
+	msg[sizeof msg - 1] = '\0';
+
+	rem_dest = calloc(1, sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	/* %n records where the five scalar fields end so that the GID bytes
+	 * can be parsed from that position. */
+	parsed = sscanf(msg, "%x:%x:%x:%x:%llx%n", &lid, &qpn, &psn, &rkey, &vaddr, &off);
+	if (parsed != 5)
+		goto bad_line;
+
+	rem_dest->lid   = (int)lid;
+	rem_dest->qpn   = (int)qpn;
+	rem_dest->psn   = (int)psn;
+	rem_dest->rkey  = rkey;
+	rem_dest->vaddr = vaddr;
+
+	if (user_parm->gid_index >= 0) {
+		for (i = 0; i < 16; ++i) {
+			unsigned byte;
+			int used = 0;
+
+			/* Each GID byte is ":" followed by at most two hex digits;
+			 * anything else is a malformed record. */
+			if (sscanf(msg + off, ":%2x%n", &byte, &used) != 1)
+				goto bad_line;
+			rem_dest->dgid.raw[i] = (unsigned char)byte;
+			off += used;
+		}
+	}
+	goto out;
+
+bad_line:
+	fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+	free(rem_dest);
+	rem_dest = NULL;
+out:
+	return rem_dest;
+}
+
+/*
+ * Wait for the client on the TCP side channel.
+ *
+ * Binds a listening socket to `port` on the first usable wildcard address,
+ * accepts exactly one connection and closes the listening socket again.
+ *
+ * Returns the connected socket descriptor, or a negative value on failure
+ * (an error message has already been printed for resolver, bind, listen
+ * and accept failures).
+ */
+int pp_server_connect(int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags    = AI_PASSIVE,
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int sockfd = -1, connfd;
+	int n;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+	/* The service string is only needed for the lookup; release it on every path. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero value, not only negative ones. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		/* Callers only test for "< 0", so never hand back a positive error code. */
+		return n < 0 ? n : -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd < 0)
+			continue;
+
+		/* Allow quick restarts of the server on the same port. */
+		n = 1;
+		setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+		if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+			break;
+		close(sockfd);
+		sockfd = -1;
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return sockfd;
+	}
+
+	if (listen(sockfd, 1) < 0) {
+		perror("server listen");
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		close(sockfd);
+		return -1;
+	}
+
+	connfd = accept(sockfd, NULL, NULL);
+	if (connfd < 0) {
+		perror("server accept");
+		fprintf(stderr, "accept() failed\n");
+	}
+
+	/* Only one peer is ever served; the listening socket is no longer needed. */
+	close(sockfd);
+	return connfd;
+}
+
+/*
+ * Server side of the side-channel address exchange: read and parse the
+ * client's QP description from connfd, then answer with the local one.
+ *
+ * Wire format: one fixed-size, NUL-terminated text record of colon-separated
+ * hex fields, LID:QPN:PSN:RKEY:VADDR followed by the 16 GID bytes.
+ *
+ * my_dest    local description to send back.
+ * user_parm  when gid_index is not negative the peer's GID bytes are parsed
+ *            too; otherwise the returned dgid is all zero.
+ *
+ * Returns a heap-allocated pingpong_dest that the caller must free(), or
+ * NULL on I/O, allocation or parse failure.
+ */
+static struct pingpong_dest *pp_server_exch_dest(int connfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"];
+	struct pingpong_dest *rem_dest = NULL;
+	unsigned lid, qpn, psn, rkey;
+	unsigned long long vaddr;
+	int parsed;
+	int off = 0;
+	int i;
+	int n;
+
+	n = read(connfd, msg, sizeof msg);
+	if (n != sizeof msg) {
+		perror("server read");
+		fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg);
+		goto out;
+	}
+	/* The record comes from the network: never trust it to be terminated. */
+	msg[sizeof msg - 1] = '\0';
+
+	rem_dest = calloc(1, sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	/* %n records where the five scalar fields end so that the GID bytes
+	 * can be parsed from that position. */
+	parsed = sscanf(msg, "%x:%x:%x:%x:%llx%n", &lid, &qpn, &psn, &rkey, &vaddr, &off);
+	if (parsed != 5)
+		goto bad_line;
+
+	rem_dest->lid   = (int)lid;
+	rem_dest->qpn   = (int)qpn;
+	rem_dest->psn   = (int)psn;
+	rem_dest->rkey  = rkey;
+	rem_dest->vaddr = vaddr;
+
+	if (user_parm->gid_index >= 0) {
+		for (i = 0; i < 16; ++i) {
+			unsigned byte;
+			int used = 0;
+
+			/* Each GID byte is ":" followed by at most two hex digits;
+			 * anything else is a malformed record. */
+			if (sscanf(msg + off, ":%2x%n", &byte, &used) != 1)
+				goto bad_line;
+			rem_dest->dgid.raw[i] = (unsigned char)byte;
+			off += used;
+		}
+	}
+
+	/* Bounded formatting: an out-of-range field is truncated instead of
+	 * overrunning msg.  %llx is the standard spelling of the former %Lx. */
+	snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		 my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr,
+		 my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2],
+		 my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5],
+		 my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8],
+		 my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+		 my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14],
+		 my_dest->dgid.raw[15]);
+	if (write(connfd, msg, sizeof msg) != sizeof msg) {
+		perror("server write");
+		fprintf(stderr, "Couldn't send local address\n");
+		free(rem_dest);
+		rem_dest = NULL;
+	}
+	goto out;
+
+bad_line:
+	fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+	free(rem_dest);
+	rem_dest = NULL;
+out:
+	return rem_dest;
+}
+
+/*
+ * Allocate and initialise every resource the test needs on device ib_dev:
+ * the work buffer, device context, PD, MR, one shared CQ and
+ * user_parm->numofqps QPs, each moved to the INIT state on port `port`.
+ *
+ * size       message size in bytes; the buffer holds size * 2 bytes per QP.
+ * tx_depth   send queue depth of each QP; the CQ holds tx_depth entries per QP.
+ * user_parm  test parameters.  When user_parm->mtu is 0 it is set here to
+ *            1024 (device part id 23108, or a GID index is in use) or 2048.
+ *
+ * Returns the new context, or NULL on failure; on failure everything that
+ * had already been acquired is released again.
+ */
+static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev,
+					    unsigned size,
+					    int tx_depth, int port, struct user_parameters *user_parm)
+{
+	struct pingpong_context *ctx;
+	struct ibv_device_attr device_attr;
+	size_t nqps;
+	size_t buf_len;
+	int counter;
+
+	if (user_parm->numofqps < 1) {
+		fprintf(stderr, "Number of QPs must be at least 1\n");
+		return NULL;
+	}
+	nqps = (size_t)user_parm->numofqps;
+
+	/* Compute the buffer length in size_t and refuse lengths that would wrap. */
+	if (size > (size_t)-1 / 2 / nqps) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		return NULL;
+	}
+	buf_len = (size_t)size * 2 * nqps;
+
+	/* Zeroed so that the failure path can tell what was acquired. */
+	ctx = calloc(1, sizeof *ctx);
+	if (!ctx)
+		return NULL;
+	ctx->size     = size;
+	ctx->tx_depth = tx_depth;
+
+	ctx->qp   = calloc(nqps, sizeof *ctx->qp);
+	ctx->scnt = calloc(nqps, sizeof *ctx->scnt);
+	ctx->ccnt = calloc(nqps, sizeof *ctx->ccnt);
+	if (!ctx->qp || !ctx->scnt || !ctx->ccnt) {
+		perror("malloc");
+		goto fail;
+	}
+
+	ctx->buf = memalign(page_size, buf_len);
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		goto fail;
+	}
+	memset(ctx->buf, 0, buf_len);
+
+	ctx->context = ibv_open_device(ib_dev);
+	if (!ctx->context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		goto fail;
+	}
+	if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
+		if (ibv_query_device(ctx->context, &device_attr)) {
+			fprintf(stderr, "Failed to query device props");
+			goto fail;
+		}
+		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) {
+			user_parm->mtu = 1024;
+		} else {
+			user_parm->mtu = 2048;
+		}
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		goto fail;
+	}
+
+	/* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
+	 * The Consumer is not allowed to assign Remote Write or Remote Atomic to
+	 * a Memory Region that has not been assigned Local Write. */
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, buf_len,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't allocate MR\n");
+		goto fail;
+	}
+
+	ctx->cq = ibv_create_cq(ctx->context, tx_depth * user_parm->numofqps , NULL, NULL, 0);
+	if (!ctx->cq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		goto fail;
+	}
+
+	for (counter = 0; counter < user_parm->numofqps; counter++) {
+		struct ibv_qp_init_attr initattr;
+		struct ibv_qp_attr attr;
+
+		memset(&initattr, 0, sizeof initattr);
+		initattr.send_cq = ctx->cq;
+		initattr.recv_cq = ctx->cq;
+		initattr.cap.max_send_wr  = tx_depth;
+		/* Work around: driver doesnt support
+		 * recv_wr = 0 */
+		initattr.cap.max_recv_wr  = 1;
+		initattr.cap.max_send_sge = 1;
+		initattr.cap.max_recv_sge = 1;
+		initattr.cap.max_inline_data = user_parm->inline_size;
+		initattr.qp_type = (user_parm->connection_type == UC) ? IBV_QPT_UC : IBV_QPT_RC;
+
+		ctx->qp[counter] = ibv_create_qp(ctx->pd, &initattr);
+		if (!ctx->qp[counter]) {
+			fprintf(stderr, "Couldn't create QP\n");
+			goto fail;
+		}
+
+		memset(&attr, 0, sizeof attr);
+		attr.qp_state        = IBV_QPS_INIT;
+		attr.pkey_index      = 0;
+		attr.port_num        = port;
+		attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
+
+		if (ibv_modify_qp(ctx->qp[counter], &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_PKEY_INDEX         |
+				  IBV_QP_PORT               |
+				  IBV_QP_ACCESS_FLAGS)) {
+			fprintf(stderr, "Failed to modify QP to INIT\n");
+			goto fail;
+		}
+	}
+
+	return ctx;
+
+fail:
+	/* Release in reverse order of acquisition; untouched members are NULL. */
+	if (ctx->qp) {
+		for (counter = 0; counter < user_parm->numofqps; counter++)
+			if (ctx->qp[counter])
+				ibv_destroy_qp(ctx->qp[counter]);
+	}
+	if (ctx->cq)
+		ibv_destroy_cq(ctx->cq);
+	if (ctx->mr)
+		ibv_dereg_mr(ctx->mr);
+	if (ctx->pd)
+		ibv_dealloc_pd(ctx->pd);
+	if (ctx->context)
+		ibv_close_device(ctx->context);
+	free(ctx->buf);
+	free(ctx->ccnt);
+	free(ctx->scnt);
+	free(ctx->qp);
+	free(ctx);
+	return NULL;
+}
+
+/*
+ * Connect QP number qpindex of ctx to the remote QP described by dest by
+ * moving it from INIT to RTR and then to RTS.
+ *
+ * port       local port number used in the address vector.
+ * my_psn     initial send PSN of the local QP.
+ * dest       remote QPN, PSN and LID (or GID when a GID index is in use).
+ * user_parm  supplies the path MTU, connection type, QP timeout and GID index.
+ *
+ * Returns 0 on success, 1 on failure (a message is printed to stderr).
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
+			  struct pingpong_dest *dest, struct user_parameters *user_parm, int qpindex)
+{
+	struct ibv_qp_attr attr;
+	const int is_rc = (user_parm->connection_type == RC);
+	const char *type_name = is_rc ? "RC" : "UC";
+	int rtr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
+		       IBV_QP_DEST_QPN | IBV_QP_RQ_PSN;
+	int rts_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;
+
+	memset(&attr, 0, sizeof attr);
+
+	attr.qp_state = IBV_QPS_RTR;
+	switch (user_parm->mtu) {
+	case 256 :
+		attr.path_mtu = IBV_MTU_256;
+		break;
+	case 512 :
+		attr.path_mtu = IBV_MTU_512;
+		break;
+	case 1024 :
+		attr.path_mtu = IBV_MTU_1024;
+		break;
+	case 2048 :
+		attr.path_mtu = IBV_MTU_2048;
+		break;
+	case 4096 :
+		attr.path_mtu = IBV_MTU_4096;
+		break;
+	default:
+		/* Without this the attribute stays 0 and the failure only shows
+		 * up later as an unexplained modify-QP error. */
+		fprintf(stderr, "Invalid MTU %d (must be 256, 512, 1024, 2048 or 4096)\n",
+			user_parm->mtu);
+		return 1;
+	}
+	printf("Mtu : %d\n", user_parm->mtu);
+	attr.dest_qp_num = dest->qpn;
+	attr.rq_psn      = dest->psn;
+
+	if (user_parm->gid_index < 0) {
+		attr.ah_attr.is_global = 0;
+		attr.ah_attr.dlid      = dest->lid;
+		attr.ah_attr.sl        = sl;
+	} else {
+		attr.ah_attr.is_global      = 1;
+		attr.ah_attr.grh.dgid       = dest->dgid;
+		/* Send from the GID the user selected: it is the one that was
+		 * advertised to the peer, not necessarily table entry 0. */
+		attr.ah_attr.grh.sgid_index = user_parm->gid_index;
+		attr.ah_attr.grh.hop_limit  = 1;
+		attr.ah_attr.sl             = 0;
+	}
+	attr.ah_attr.src_path_bits = 0;
+	attr.ah_attr.port_num      = port;
+
+	if (is_rc) {
+		/* Responder resources and RNR timer exist only for RC QPs. */
+		attr.max_dest_rd_atomic = 1;
+		attr.min_rnr_timer      = 12;
+		rtr_mask |= IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC;
+	}
+	if (ibv_modify_qp(ctx->qp[qpindex], &attr, rtr_mask)) {
+		fprintf(stderr, "Failed to modify %s QP to RTR\n", type_name);
+		return 1;
+	}
+
+	attr.qp_state = IBV_QPS_RTS;
+	attr.sq_psn   = my_psn;
+	if (is_rc) {
+		/* Retransmission and requester settings exist only for RC QPs. */
+		attr.timeout       = user_parm->qp_timeout;
+		attr.retry_cnt     = 7;
+		attr.rnr_retry     = 7;
+		attr.max_rd_atomic = 1;
+		rts_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
+			    IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC;
+	}
+	if (ibv_modify_qp(ctx->qp[qpindex], &attr, rts_mask)) {
+		fprintf(stderr, "Failed to modify %s QP to RTS\n", type_name);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Print the command-line help to stdout.
+ *
+ * argv0  program name shown in the two invocation examples.
+ */
+static void usage(const char *argv0)
+{
+	/* Help text that does not depend on the program name, one entry per line. */
+	static const char *const help_lines[] = {
+		"\n",
+		"Options:\n",
+		" -p, --port=<port> listen on/connect to port <port> (default 18515)\n",
+		" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n",
+		" -i, --ib-port=<port> use port <port> of IB device (default 1)\n",
+		" -c, --connection=<RC/UC> connection type RC/UC (default RC)\n",
+		" -m, --mtu=<mtu> mtu size (256 - 4096. default for hermon is 2048)\n",
+		" -g, --post=<num of posts> number of posts for each qp in the chain (default tx_depth)\n",
+		" -q, --qp=<num of qp's> Num of qp's(default 1)\n",
+		" -s, --size=<size> size of message to exchange (default 65536)\n",
+		" -a, --all Run sizes from 2 till 2^23\n",
+		" -t, --tx-depth=<dep> size of tx queue (default 100)\n",
+		" -n, --iters=<iters> number of exchanges (at least 2, default 5000)\n",
+		" -I, --inline_size=<size> max size of message to be sent in inline mode (default 400)\n",
+		" -u, --qp-timeout=<timeout> QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n",
+		" -S, --sl=<sl> SL (default 0)\n",
+		" -x, --gid-index=<index> test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n",
+		" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n",
+		" -V, --version display version number\n",
+		" -N, --no peak-bw cancel peak-bw calculation (default with peak-bw)\n",
+		" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n",
+	};
+	size_t line;
+
+	fputs("Usage:\n", stdout);
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf(" %s <host> connect to server at <host>\n", argv0);
+
+	for (line = 0; line < sizeof help_lines / sizeof help_lines[0]; ++line)
+		fputs(help_lines[line], stdout);
+}
+
+/*
+ * Print one result line: message size, iteration count, peak bandwidth and
+ * average bandwidth, both in MB/sec.
+ *
+ * iters             messages posted per QP.
+ * size              message size in bytes.
+ * duplex            non-zero doubles the byte count per message.
+ * tposted           cycle counter taken before each post.
+ * tcompleted        cycle counter taken after each completion.
+ * user_param        supplies the number of QPs.
+ * noPeak            non-zero skips the peak search; the peak column is then 0.
+ * no_cpu_freq_fail  passed through to get_cpu_mhz().
+ *
+ * The peak is taken from the window [i..j] of messages with the smallest
+ * (tcompleted[j] - tposted[i]) / (j - i + 1).  The search visits every
+ * window, so it is quadratic in iters * numofqps.
+ */
+static void print_report(unsigned int iters, unsigned size, int duplex,
+		cycles_t *tposted, cycles_t *tcompleted, struct user_parameters *user_param,
+		int noPeak, int no_cpu_freq_fail)
+{
+	const unsigned int samples = iters * user_param->numofqps;
+	cycles_t best_cycles = tcompleted[0] - tposted[0];
+	unsigned long bytes_per_msg;
+	double cycles_per_sec;
+	double peak_mb;
+	double average_mb;
+	int first, last;
+
+	if (!noPeak) {
+		/* Find the peak bandwidth unless asked not to in command line */
+		for (first = 0; first < samples; ++first) {
+			for (last = first; last < samples; ++last) {
+				cycles_t per_msg = (tcompleted[last] - tposted[first]) / (last - first + 1);
+
+				if (per_msg < best_cycles)
+					best_cycles = per_msg;
+			}
+		}
+	}
+
+	cycles_per_sec = get_cpu_mhz(no_cpu_freq_fail) * 1000000;
+
+	bytes_per_msg = duplex ? 2 : 1;
+	bytes_per_msg = bytes_per_msg * size;
+
+	peak_mb    = !(noPeak) * bytes_per_msg * cycles_per_sec / best_cycles / 0x100000;
+	average_mb = bytes_per_msg * iters * user_param->numofqps * cycles_per_sec /
+		     (tcompleted[samples - 1] - tposted[0]) / 0x100000;
+
+	printf("%7d %d %7.2f %7.2f\n",
+	       size, iters, peak_mb, average_mb);
+}
+/*
+ * Run one bandwidth measurement with messages of `size` bytes.
+ *
+ * Every QP posts user_param->iters signaled RDMA writes, with at most
+ * user_param->maxpostsofqpiniteration of them outstanding per QP, and the
+ * shared CQ is polled until all of them have completed.  The cycle counter
+ * is stored in the global tposted[] before each post and in tcompleted[]
+ * after each polled completion; both arrays need iters * numofqps entries.
+ *
+ * ctx         resources created by pp_init_ctx(); ctx->list and ctx->wr are
+ *             overwritten here.
+ * user_param  iteration count, number of QPs, post limit and inline size.
+ * rem_dest    per-QP remote address and rkey to write to.
+ * size        message size in bytes.
+ *
+ * Returns 0 on success, 1 on a post, poll or completion error.
+ */
+int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param,
+ struct pingpong_dest **rem_dest, int size)
+{
+
+ struct ibv_qp *qp;
+ int totscnt, totccnt ;
+ int index ,warmindex;
+ int inline_size;
+ struct ibv_send_wr *bad_wr;
+ struct ibv_wc wc;
+ /* Every QP sends from the start of the local buffer. */
+ ctx->list.addr = (uintptr_t) ctx->buf;
+ ctx->list.length = size;
+ ctx->list.lkey = ctx->mr->lkey;
+
+
+ ctx->wr.sg_list = &ctx->list;
+ ctx->wr.num_sge = 1;
+ ctx->wr.opcode = IBV_WR_RDMA_WRITE;
+ inline_size = user_param->inline_size;
+ /* Messages no larger than inline_size are additionally posted inline. */
+ if (size > inline_size) {/* compliance with perf_main */
+ ctx->wr.send_flags = IBV_SEND_SIGNALED;
+ } else {
+ ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+ }
+ ctx->wr.next = NULL;
+
+ totscnt = 0;
+ totccnt = 0;
+ /*clear the scnt ccnt counters for each iteration*/
+ for (index =0 ; index < user_param->numofqps ; index++) {
+ ctx->scnt[index] = 0;
+ ctx->ccnt[index] = 0;
+ }
+ index = 0;
+
+ /* Done with setup. Start the test.
+ Warm up: post maxpostsofqpiniteration messages per QP, one QP at a
+ time in round-robin order, without polling for completions. */
+ for (warmindex =0 ;warmindex < user_param->maxpostsofqpiniteration ;warmindex ++ ) {
+ for (index =0 ; index < user_param->numofqps ; index++) {
+ ctx->wr.wr.rdma.remote_addr = rem_dest[index]->vaddr;
+ ctx->wr.wr.rdma.rkey = rem_dest[index]->rkey;
+ qp = ctx->qp[index];
+ /* The work request id carries the QP index so that a completion
+ can be attributed to its QP. */
+ ctx->wr.wr_id = index ;
+ tposted[totscnt] = get_cycles();
+ if (ibv_post_send(qp, &ctx->wr, &bad_wr)) {
+ fprintf(stderr, "Couldn't post warmup send: qp index = %d qp scnt=%d total scnt %d\n",
+ index,ctx->scnt[index],totscnt);
+ return 1;
+ }
+ ctx->scnt[index]= ctx->scnt[index]+1;
+ ++totscnt;
+ }
+ }
+ /* main loop for posting */
+ while (totscnt < (user_param->iters * user_param->numofqps) || totccnt < (user_param->iters * user_param->numofqps) ) {
+ /* main loop to run over all the qps and post each time n messages */
+ for (index =0 ; index < user_param->numofqps ; index++) {
+ ctx->wr.wr.rdma.remote_addr = rem_dest[index]->vaddr;
+ ctx->wr.wr.rdma.rkey = rem_dest[index]->rkey;
+ qp = ctx->qp[index];
+ ctx->wr.wr_id = index ;
+ /* Refill this QP up to the outstanding-post limit. */
+ while (ctx->scnt[index] < user_param->iters && (ctx->scnt[index] - ctx->ccnt[index]) < user_param->maxpostsofqpiniteration) {
+ tposted[totscnt] = get_cycles();
+ if (ibv_post_send(qp, &ctx->wr, &bad_wr)) {
+ fprintf(stderr, "Couldn't post send: qp index = %d qp scnt=%d total scnt %d\n",
+ index,ctx->scnt[index],totscnt);
+ return 1;
+ }
+ ctx->scnt[index]= ctx->scnt[index]+1;
+ ++totscnt;
+ }
+ }
+ /* finished posting now polling */
+ if (totccnt < (user_param->iters * user_param->numofqps) ) {
+
+ int ne;
+ /* Busy-poll until exactly one completion (or an error) is returned. */
+ do {
+ ne = ibv_poll_cq(ctx->cq, 1, &wc);
+ } while (ne == 0);
+ /* The timestamp is taken before the error checks; on error the
+ function returns without the slot being used. */
+ tcompleted[totccnt] = get_cycles();
+ if (ne < 0) {
+ fprintf(stderr, "poll CQ failed %d\n", ne);
+ return 1;
+ }
+ if (wc.status != IBV_WC_SUCCESS) {
+ fprintf(stderr, "Completion wth error at %s:\n",
+ user_param->servername ? "client" : "server");
+ fprintf(stderr, "Failed status %d: wr_id %d\n",
+ wc.status, (int) wc.wr_id);
+ fprintf(stderr, "qp index %d ,qp scnt=%d, qp ccnt=%d total scnt %d total ccnt %d\n",
+ (int)wc.wr_id, ctx->scnt[(int)wc.wr_id], ctx->ccnt[(int)wc.wr_id], totscnt, totccnt);
+ return 1;
+ }
+ /*here the id is the index to the qp num */
+ ctx->ccnt[(int)wc.wr_id] = ctx->ccnt[(int)wc.wr_id]+1;
+ totccnt += 1;
+ }
+ }
+ return(0);
+}
+
+/*
+ * RDMA write bandwidth test.
+ *
+ * Without a host argument the program is the server, otherwise the client
+ * of that host.  Steps: parse options, pick the IB device, create the verbs
+ * resources, exchange QP addresses with the peer over TCP, connect the QPs,
+ * run the measurement (client always, server only with -b) and print one
+ * report line per message size.
+ *
+ * Exit status: 0 on success; 1 for most option and setup errors; 7 when the
+ * -I value exceeds MAX_INLINE; 17 or 18 when a measurement fails; the value
+ * -1 is returned when the GID cannot be queried.
+ */
+int main(int argc, char *argv[])
+{
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev;
+	struct pingpong_context *ctx;
+	struct pingpong_dest *my_dest;
+	struct pingpong_dest **rem_dest;
+	struct user_parameters user_param;
+	struct ibv_device_attr device_attribute;
+	char *ib_devname = NULL;
+	int port = 18515;
+	int ib_port = 1;
+	long long size = 65536;
+	int sockfd;
+	int duplex = 0;
+	int i = 0;
+	int noPeak = 0;/*noPeak == 0: regular peak-bw calculation done*/
+	int inline_given_in_cmd = 0;
+	struct ibv_context *context;
+	int no_cpu_freq_fail = 0;
+	union ibv_gid gid;
+
+	/* The GID is copied into every local address record and sent to the
+	 * peer even when no GID index was given, so it must never be
+	 * uninitialised stack memory. */
+	memset(&gid, 0, sizeof gid);
+
+	/* init default values to user's parameters */
+	memset(&user_param, 0, sizeof(struct user_parameters));
+	user_param.mtu = 0;
+	user_param.iters = 5000;
+	user_param.tx_depth = 100;
+	user_param.servername = NULL;
+	user_param.numofqps = 1;
+	user_param.maxpostsofqpiniteration = 100;
+	user_param.inline_size = MAX_INLINE;
+	user_param.qp_timeout = 14;
+	user_param.gid_index = -1; /*gid will not be used*/
+	/* Parameter parsing. */
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "mtu", .has_arg = 1, .val = 'm' },
+			{ .name = "qp", .has_arg = 1, .val = 'q' },
+			{ .name = "post", .has_arg = 1, .val = 'g' },
+			{ .name = "connection", .has_arg = 1, .val = 'c' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "tx-depth", .has_arg = 1, .val = 't' },
+			{ .name = "inline_size", .has_arg = 1, .val = 'I' },
+			{ .name = "qp-timeout", .has_arg = 1, .val = 'u' },
+			{ .name = "sl", .has_arg = 1, .val = 'S' },
+			{ .name = "gid-index", .has_arg = 1, .val = 'x' },
+			{ .name = "all", .has_arg = 0, .val = 'a' },
+			{ .name = "bidirectional", .has_arg = 0, .val = 'b' },
+			{ .name = "version", .has_arg = 0, .val = 'V' },
+			{ .name = "noPeak", .has_arg = 0, .val = 'N' },
+			{ .name = "CPU-freq", .has_arg = 0, .val = 'F' },
+			{ 0 }
+		};
+
+		c = getopt_long(argc, argv, "p:d:i:m:q:g:c:s:n:t:I:u:S:x:baVNF", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			port = strtol(optarg, NULL, 0);
+			if (port < 0 || port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+		case 'c':
+			/* Anything other than "UC" keeps the default, RC. */
+			if (strcmp("UC",optarg)==0)
+				user_param.connection_type=UC;
+			break;
+
+		case 'm':
+			user_param.mtu = strtol(optarg, NULL, 0);
+			break;
+		case 'q':
+			user_param.numofqps = strtol(optarg, NULL, 0);
+			/* Zero or negative QP counts would size every per-QP
+			 * array wrongly. */
+			if (user_param.numofqps < 1) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+		case 'g':
+			user_param.maxpostsofqpiniteration = strtol(optarg, NULL, 0);
+			/* With a limit below 1 nothing is ever posted and the
+			 * completion poll in run_iter() would spin forever. */
+			if (user_param.maxpostsofqpiniteration < 1) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+		case 'a':
+			user_param.all = ALL;
+			break;
+		case 'V':
+			printf("rdma_bw version : %.2f\n",VERSION);
+			return 0;
+			break;
+		case 'i':
+			ib_port = strtol(optarg, NULL, 0);
+			if (ib_port < 0) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 's':
+			size = strtoll(optarg, NULL, 0);
+			if (size < 1 || size > UINT_MAX / 2) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 't':
+			user_param.tx_depth = strtol(optarg, NULL, 0);
+			if (user_param.tx_depth < 1) { usage(argv[0]); return 1; }
+			break;
+
+		case 'I':
+			user_param.inline_size = strtol(optarg, NULL, 0);
+			inline_given_in_cmd =1;
+			if (user_param.inline_size > MAX_INLINE) {
+				usage(argv[0]);
+				return 7;
+			}
+			break;
+
+		case 'n':
+			user_param.iters = strtol(optarg, NULL, 0);
+			if (user_param.iters < 2) {
+				usage(argv[0]);
+				return 1;
+			}
+
+			break;
+
+		case 'b':
+			duplex = 1;
+			break;
+
+		case 'N':
+			noPeak = 1;
+			break;
+
+		case 'F':
+			no_cpu_freq_fail = 1;
+			break;
+
+		case 'u':
+			user_param.qp_timeout = strtol(optarg, NULL, 0);
+			break;
+
+		case 'S':
+			sl = strtol(optarg, NULL, 0);
+			if (sl < 0 || sl > 15) { usage(argv[0]); return 1; }
+			break;
+
+		case 'x':
+			user_param.gid_index = strtol(optarg, NULL, 0);
+			if (user_param.gid_index > 63) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	/* A single trailing argument is the server to connect to. */
+	if (optind == argc - 1)
+		user_param.servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	printf("------------------------------------------------------------------\n");
+	if (duplex == 1) {
+		printf(" RDMA_Write Bidirectional BW Test\n");
+	} else {
+		printf(" RDMA_Write BW Test\n");
+	}
+
+	printf("Number of qp's running %d\n",user_param.numofqps);
+	if (user_param.connection_type==RC) {
+		printf("Connection type : RC\n");
+	} else {
+		printf("Connection type : UC\n");
+	}
+	if (user_param.maxpostsofqpiniteration > user_param.tx_depth ) {
+		printf("Can not post more than tx_depth , adjusting number of post to tx_depth\n");
+		user_param.maxpostsofqpiniteration = user_param.tx_depth;
+	}
+	if (user_param.maxpostsofqpiniteration > user_param.iters ) {
+		printf("Can not post more than iterations per qp , adjusting max number of post to num of iteration\n");
+		user_param.maxpostsofqpiniteration = user_param.iters;
+	}
+	if (user_param.gid_index > -1) {
+		printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n");
+	}
+	printf("Each Qp will post up to %d messages each time\n",user_param.maxpostsofqpiniteration);
+	/* Done with parameter parsing. Perform setup. */
+	if (user_param.all == ALL) {
+		/*since we run all sizes */
+		size = 8388608; /*2^23 */
+	}
+	srand48(getpid() * time(NULL));
+
+	page_size = sysconf(_SC_PAGESIZE);
+
+	dev_list = ibv_get_device_list(NULL);
+	if (!dev_list) {
+		fprintf(stderr, "No IB devices found\n");
+		return 1;
+	}
+
+	if (!ib_devname) {
+		ib_dev = dev_list[0];
+		if (!ib_dev) {
+			fprintf(stderr, "No IB devices found\n");
+			return 1;
+		}
+	} else {
+		for (; (ib_dev = *dev_list); ++dev_list)
+			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+				break;
+		if (!ib_dev) {
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+			return 1;
+		}
+	}
+
+	/* Open the device once only to read its attributes; pp_init_ctx()
+	 * opens its own context, so this one is closed right away. */
+	context = ibv_open_device(ib_dev);
+	if (!context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		return 1;
+	}
+	if (ibv_query_device(context, &device_attribute)) {
+		fprintf(stderr, "Failed to query device props");
+		ibv_close_device(context);
+		return 1;
+	}
+	ibv_close_device(context);
+
+	/* For these device part ids the inline threshold defaults to 1 byte
+	 * unless -I was given. */
+	if ((device_attribute.vendor_part_id == 25408 ||
+	     device_attribute.vendor_part_id == 25418 ||
+	     device_attribute.vendor_part_id == 26408 ||
+	     device_attribute.vendor_part_id == 26418 ||
+	     device_attribute.vendor_part_id == 26428) && (!inline_given_in_cmd)) {
+		user_param.inline_size = 1;
+	}
+	printf("Inline data is used up to %d bytes message\n", user_param.inline_size);
+
+	ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port, &user_param);
+	if (!ctx)
+		return 1;
+
+	/* Same "GID in use" test as everywhere else in the file, so that a
+	 * negative index other than -1 is never passed to the query. */
+	if (user_param.gid_index > -1) {
+		int err=0;
+		err = ibv_query_gid (ctx->context, ib_port, user_param.gid_index, &gid);
+		if (err) {
+			return -1;
+		}
+		ctx->dgid=gid;
+	}
+
+	if (user_param.servername) {
+		sockfd = pp_client_connect(user_param.servername, port);
+		if (sockfd < 0)
+			return 1;
+	} else {
+		sockfd = pp_server_connect(port);
+		if (sockfd < 0)
+			return 1;
+	}
+
+	my_dest = malloc(user_param.numofqps * sizeof *my_dest);
+	if (!my_dest) {
+		perror("malloc my_dest");
+		return 1;
+	}
+	rem_dest = malloc(sizeof (struct pingpong_dest*) * user_param.numofqps );
+	if (!rem_dest ) {
+		perror("malloc rem_dest");
+		return 1;
+	}
+
+	for (i =0 ;i<user_param.numofqps;i ++) {
+		struct pingpong_dest *sync_dest;
+
+		/* Create connection between client and server.
+		 * We do it by exchanging data over a TCP socket connection. */
+		my_dest[i].lid = pp_get_local_lid(ctx, ib_port);
+		my_dest[i].psn = lrand48() & 0xffffff;
+		if (user_param.gid_index < 0) {/*We do not fail test upon lid in RDMA0E/Eth conf*/
+			if (!my_dest[i].lid) {
+				fprintf(stderr, "Local lid 0x0 detected. Is an SM running? If you are running on an RMDAoE interface you must use GIDs\n");
+				return 1;
+			}
+		}
+		my_dest[i].dgid = gid;
+		my_dest[i].qpn = ctx->qp[i]->qp_num;
+		/* TBD this should be changed into VA and different key to each qp */
+		my_dest[i].rkey = ctx->mr->rkey;
+		/* The peer writes into the second half of the first 2*size slot. */
+		my_dest[i].vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+		printf(" local address: LID %#04x, QPN %#06x, PSN %#06x "
+		       "RKey %#08x VAddr %#016Lx\n",
+		       my_dest[i].lid, my_dest[i].qpn, my_dest[i].psn,
+		       my_dest[i].rkey, my_dest[i].vaddr);
+		if (user_param.gid_index > -1) {
+			printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+			       my_dest[i].dgid.raw[0],my_dest[i].dgid.raw[1],
+			       my_dest[i].dgid.raw[2], my_dest[i].dgid.raw[3], my_dest[i].dgid.raw[4],
+			       my_dest[i].dgid.raw[5], my_dest[i].dgid.raw[6], my_dest[i].dgid.raw[7],
+			       my_dest[i].dgid.raw[8], my_dest[i].dgid.raw[9], my_dest[i].dgid.raw[10],
+			       my_dest[i].dgid.raw[11], my_dest[i].dgid.raw[12], my_dest[i].dgid.raw[13],
+			       my_dest[i].dgid.raw[14], my_dest[i].dgid.raw[15]);
+		}
+		if (user_param.servername) {
+			rem_dest[i] = pp_client_exch_dest(sockfd, &my_dest[i], &user_param);
+		} else {
+			rem_dest[i] = pp_server_exch_dest(sockfd, &my_dest[i], &user_param);
+		}
+		if (!rem_dest[i])
+			return 1;
+		printf(" remote address: LID %#04x, QPN %#06x, PSN %#06x, "
+		       "RKey %#08x VAddr %#016Lx\n",
+		       rem_dest[i]->lid, rem_dest[i]->qpn, rem_dest[i]->psn,
+		       rem_dest[i]->rkey, rem_dest[i]->vaddr);
+		if (user_param.gid_index > -1) {
+			printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+			       rem_dest[i]->dgid.raw[0],rem_dest[i]->dgid.raw[1],
+			       rem_dest[i]->dgid.raw[2], rem_dest[i]->dgid.raw[3], rem_dest[i]->dgid.raw[4],
+			       rem_dest[i]->dgid.raw[5], rem_dest[i]->dgid.raw[6], rem_dest[i]->dgid.raw[7],
+			       rem_dest[i]->dgid.raw[8], rem_dest[i]->dgid.raw[9], rem_dest[i]->dgid.raw[10],
+			       rem_dest[i]->dgid.raw[11], rem_dest[i]->dgid.raw[12], rem_dest[i]->dgid.raw[13],
+			       rem_dest[i]->dgid.raw[14], rem_dest[i]->dgid.raw[15]);
+		}
+		if (pp_connect_ctx(ctx, ib_port, my_dest[i].psn, rem_dest[i], &user_param, i))
+			return 1;
+
+		/* An additional handshake is required *after* moving qp to RTR.
+		   Arbitrarily reuse exch_dest for this purpose.  A failed
+		   handshake must stop the test here: run_iter() dereferences
+		   rem_dest[i].  The record from the first exchange is released
+		   once the new one has arrived. */
+		if (user_param.servername) {
+			sync_dest = pp_client_exch_dest(sockfd, &my_dest[i], &user_param);
+		} else {
+			sync_dest = pp_server_exch_dest(sockfd, &my_dest[i], &user_param);
+		}
+		if (!sync_dest)
+			return 1;
+		free(rem_dest[i]);
+		rem_dest[i] = sync_dest;
+	}
+
+	printf("------------------------------------------------------------------\n");
+	printf(" #bytes #iterations BW peak[MB/sec] BW average[MB/sec] \n");
+	/* For half duplex tests, server just waits for client to exit */
+	/* the 0th place is arbitrary to signal finish ... */
+	if (!user_param.servername && !duplex) {
+		rem_dest[0] = pp_server_exch_dest(sockfd, &my_dest[0], &user_param);
+		if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+			perror("server write");
+			fprintf(stderr, "Couldn't write to socket\n");
+			return 1;
+		}
+		close(sockfd);
+		return 0;
+	}
+
+	tposted = malloc(user_param.iters * user_param.numofqps * sizeof *tposted);
+
+	if (!tposted) {
+		perror("malloc");
+		return 1;
+	}
+
+	tcompleted = malloc(user_param.iters * user_param.numofqps * sizeof *tcompleted);
+
+	if (!tcompleted) {
+		perror("malloc");
+		return 1;
+	}
+
+	if (user_param.all == ALL) {
+		/* Message sizes 2, 4, ..., 2^23 bytes. */
+		for (i = 1; i < 24 ; ++i) {
+			size = 1 << i;
+			if(run_iter(ctx, &user_param, rem_dest, size))
+				return 17;
+			print_report(user_param.iters, size, duplex, tposted, tcompleted, &user_param, noPeak, no_cpu_freq_fail);
+		}
+	} else {
+		if(run_iter(ctx, &user_param, rem_dest, size))
+			return 18;
+		print_report(user_param.iters, size, duplex, tposted, tcompleted, &user_param, noPeak, no_cpu_freq_fail);
+	}
+	/* the 0th place is arbitrary to signal finish ... */
+	if (user_param.servername) {
+		rem_dest[0] = pp_client_exch_dest(sockfd, &my_dest[0], &user_param);
+	} else {
+		rem_dest[0] = pp_server_exch_dest(sockfd, &my_dest[0], &user_param);
+	}
+
+	if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+		perror("write");
+		fprintf(stderr, "Couldn't write to socket\n");
+		return 1;
+	}
+	close(sockfd);
+
+	free(tposted);
+	free(tcompleted);
+
+	printf("------------------------------------------------------------------\n");
+	return 0;
+}
diff --git a/write_bw_postlist.c b/write_bw_postlist.c
new file mode 100755
index 0000000..622bfb9
--- /dev/null
+++ b/write_bw_postlist.c
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+
+#include <infiniband/verbs.h>
+
+#include "get_clock.h"
+
+/* Presumably a work-request id tag; not referenced in the visible part of this file. */
+#define PINGPONG_RDMA_WRID 3
+/* Tool version, printed by the -V option with a "%.2f" format. */
+#define VERSION 1.0
+/* Value stored in user_parameters.all when -a (run every message size) is given. */
+#define ALL 1
+/* Default inline_size and the largest value accepted by -I. */
+#define MAX_INLINE 400
+/* Values of user_parameters.connection_type (RC is the zero-initialised default). */
+#define RC 0
+#define UC 1
+
+/* Command-line configuration of one test run; filled in by main(). */
+struct user_parameters {
+ /* Host to connect to; NULL means this process acts as the server. */
+ const char *servername;
+ /* RC or UC; selects the QP type and the ibv_modify_qp attribute masks. */
+ int connection_type;
+ /* Path MTU in bytes; 0 means "not given", pp_init_ctx then picks 1024 or 2048. */
+ int mtu;
+ int all; /* run all msg size */
+ /* Number of work requests each QP posts per message size. */
+ int iters;
+ /* Send queue depth requested for each QP (max_send_wr). */
+ int tx_depth;
+ /* Number of QPs created and driven in parallel. */
+ int numofqps;
+ /* Length of the work-request chain built per QP; main() caps it at tx_depth. */
+ int maxpostsofqpiniteration;
+ /* Messages up to this many bytes are posted with IBV_SEND_INLINE. */
+ int inline_size;
+ /* Value copied into ibv_qp_attr.timeout for RC QPs. */
+ int qp_timeout;
+ int gid_index; /* if value not negative, we use gid AND gid_index=value */
+};
+/*
+ * A QP bundled with its own send/completion counters.
+ * NOTE(review): not referenced anywhere in the visible part of this file;
+ * pingpong_context keeps separate qp/scnt/ccnt arrays instead.
+ */
+struct extended_qp {
+ struct ibv_qp *qp;
+ int scnt, ccnt ;
+};
+/* Service level set by -S; used by pp_connect_ctx for LID-addressed paths. */
+static int sl = 0;
+/* System page size (sysconf(_SC_PAGESIZE)), used as the work-buffer alignment. */
+static int page_size;
+
+/*
+ * Cycle-counter samples, iters * numofqps entries each, allocated in main():
+ * run_iter() writes them and print_report() reads them.
+ */
+cycles_t *tposted;
+cycles_t *tcompleted;
+/* All verbs resources of one test endpoint; built by pp_init_ctx(). */
+struct pingpong_context {
+ struct ibv_context *context;
+ struct ibv_pd *pd;
+ /* Memory region covering buf, registered for local and remote write. */
+ struct ibv_mr *mr;
+ /* Single completion queue shared by every QP (send and receive side). */
+ struct ibv_cq *cq;
+ /* Array of numofqps queue pairs. */
+ struct ibv_qp **qp;
+ /* Work buffer of size * 2 * numofqps bytes; main() advertises buf + size to the peer. */
+ void *buf;
+ unsigned size;
+ int tx_depth;
+ /* Scatter/gather entry and template work request, (re)filled by run_iter(). */
+ struct ibv_sge list;
+ struct ibv_send_wr wr;
+ /* Per-QP counters of posted (scnt) and completed (ccnt) work requests. */
+ int *scnt;
+ int *ccnt ;
+ /* Local GID, stored by main() when a GID index was requested. */
+ union ibv_gid dgid;
+};
+
+/*
+ * Addressing information of one QP endpoint, exchanged as a colon-separated
+ * hexadecimal text line over the TCP control socket.
+ */
+struct pingpong_dest {
+ int lid;
+ int qpn;
+ int psn;
+ /* Remote key and virtual address of the buffer targeted by RDMA writes. */
+ unsigned rkey;
+ unsigned long long vaddr;
+ /* Port GID; only parsed from the peer's message when gid_index is non-negative. */
+ union ibv_gid dgid;
+};
+
+
+/*
+ * Look up the LID assigned to local port `port` of the device opened in ctx.
+ * Returns 0 when the port attributes cannot be queried.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+	struct ibv_port_attr port_attr;
+	int rc = ibv_query_port(ctx->context, port, &port_attr);
+
+	return rc ? 0 : port_attr.lid;
+}
+
+/*
+ * Open a TCP connection to servername:port, trying every address that
+ * getaddrinfo() returns until one connects.
+ *
+ * Returns the connected socket descriptor, or a negative value on failure
+ * (a diagnostic is printed to stderr for resolver and connect failures).
+ */
+static int pp_client_connect(const char *servername, int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int n;
+	int sockfd = -1;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(servername, service, &hints, &res);
+	/* The service string is only needed for the lookup itself. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero code, not only negative ones. */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+		return -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0)
+		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+
+	return sockfd;
+}
+
+/*
+ * Client side of the address exchange: send my_dest as a fixed-size text
+ * line over sockfd, then read and parse the peer's line.
+ *
+ * The line carries LID, QPN, PSN, RKey and VAddr in hexadecimal followed by
+ * the 16 GID bytes. The GID bytes are only parsed when
+ * user_parm->gid_index is non-negative; otherwise dgid is left zeroed.
+ *
+ * Returns a malloc'ed pingpong_dest the caller must free, or NULL on any
+ * I/O, allocation or parse failure (a diagnostic is printed to stderr).
+ */
+struct pingpong_dest * pp_client_exch_dest(int sockfd,
+		const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	struct pingpong_dest *rem_dest = NULL;
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"];
+	unsigned int lid, qpn, psn;
+	size_t got = 0;
+	int consumed = 0;
+	int len;
+	int i;
+
+	/* Bounded formatting: an out-of-range field must not overrun msg. */
+	len = snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		       (unsigned) my_dest->lid, (unsigned) my_dest->qpn,
+		       (unsigned) my_dest->psn, my_dest->rkey, my_dest->vaddr,
+		       my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2],
+		       my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5],
+		       my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8],
+		       my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+		       my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14],
+		       my_dest->dgid.raw[15]);
+	if (len < 0 || (size_t) len >= sizeof msg) {
+		fprintf(stderr, "Couldn't send local address\n");
+		goto out;
+	}
+	if (write(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client write");
+		fprintf(stderr, "Couldn't send local address\n");
+		goto out;
+	}
+
+	/* A stream socket may deliver the line in pieces; read until complete. */
+	while (got < sizeof msg) {
+		ssize_t n = read(sockfd, msg + got, sizeof msg - got);
+
+		if (n <= 0) {
+			perror("client read");
+			fprintf(stderr, "Couldn't read remote address\n");
+			goto out;
+		}
+		got += (size_t) n;
+	}
+	/* Never trust the peer to have sent the terminating NUL. */
+	msg[sizeof msg - 1] = '\0';
+
+	rem_dest = calloc(1, sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	if (sscanf(msg, "%x:%x:%x:%x:%llx%n", &lid, &qpn, &psn,
+		   &rem_dest->rkey, &rem_dest->vaddr, &consumed) != 5)
+		goto bad_msg;
+	rem_dest->lid = (int) lid;
+	rem_dest->qpn = (int) qpn;
+	rem_dest->psn = (int) psn;
+
+	if (user_parm->gid_index >= 0) {
+		const char *cur = msg + consumed;
+
+		/* Each GID byte is ":<hex>"; reject anything else instead of crashing. */
+		for (i = 0; i < 16; ++i) {
+			char *end;
+			unsigned long byte;
+
+			if (*cur != ':')
+				goto bad_msg;
+			byte = strtoul(cur + 1, &end, 16);
+			if (end == cur + 1 || byte > 0xff)
+				goto bad_msg;
+			rem_dest->dgid.raw[i] = (unsigned char) byte;
+			cur = end;
+		}
+	}
+	goto out;
+
+bad_msg:
+	fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+	free(rem_dest);
+	rem_dest = NULL;
+out:
+	return rem_dest;
+}
+
+/*
+ * Bind a listening TCP socket on `port` (any local address), wait for one
+ * client and return the accepted connection. The listening socket is
+ * closed before returning.
+ *
+ * Returns the connected descriptor, or a negative value on failure
+ * (a diagnostic is printed to stderr).
+ */
+int pp_server_connect(int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags    = AI_PASSIVE,
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	int sockfd = -1, connfd;
+	int n;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return -1;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+	/* The service string is only needed for the lookup itself. */
+	free(service);
+
+	/* getaddrinfo() reports failure with any non-zero code, not only negative ones. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		return -1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			n = 1;
+
+			/* Best effort: allow quick restarts of the server on the same port. */
+			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return sockfd;
+	}
+
+	if (listen(sockfd, 1) < 0) {
+		perror("server listen");
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		close(sockfd);
+		return -1;
+	}
+
+	connfd = accept(sockfd, NULL, NULL);
+	if (connfd < 0) {
+		perror("server accept");
+		fprintf(stderr, "accept() failed\n");
+	}
+
+	close(sockfd);
+	return connfd;
+}
+
+/*
+ * Server side of the address exchange: read and parse the peer's
+ * fixed-size text line from connfd, then answer with my_dest in the same
+ * format.
+ *
+ * The line carries LID, QPN, PSN, RKey and VAddr in hexadecimal followed by
+ * the 16 GID bytes. The GID bytes are only parsed when
+ * user_parm->gid_index is non-negative; otherwise dgid is left zeroed.
+ *
+ * Returns a malloc'ed pingpong_dest the caller must free, or NULL on any
+ * I/O, allocation or parse failure (a diagnostic is printed to stderr).
+ */
+static struct pingpong_dest *pp_server_exch_dest(int connfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+	char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"];
+	struct pingpong_dest *rem_dest = NULL;
+	unsigned int lid, qpn, psn;
+	size_t got = 0;
+	int consumed = 0;
+	int len;
+	int i;
+
+	/* A stream socket may deliver the line in pieces; read until complete. */
+	while (got < sizeof msg) {
+		ssize_t n = read(connfd, msg + got, sizeof msg - got);
+
+		if (n <= 0) {
+			perror("server read");
+			fprintf(stderr, "%d/%d: Couldn't read remote address\n", (int) got, (int) sizeof msg);
+			goto out;
+		}
+		got += (size_t) n;
+	}
+	/* Never trust the peer to have sent the terminating NUL. */
+	msg[sizeof msg - 1] = '\0';
+
+	rem_dest = calloc(1, sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	if (sscanf(msg, "%x:%x:%x:%x:%llx%n", &lid, &qpn, &psn,
+		   &rem_dest->rkey, &rem_dest->vaddr, &consumed) != 5)
+		goto bad_msg;
+	rem_dest->lid = (int) lid;
+	rem_dest->qpn = (int) qpn;
+	rem_dest->psn = (int) psn;
+
+	if (user_parm->gid_index >= 0) {
+		const char *cur = msg + consumed;
+
+		/* Each GID byte is ":<hex>"; reject anything else instead of crashing. */
+		for (i = 0; i < 16; ++i) {
+			char *end;
+			unsigned long byte;
+
+			if (*cur != ':')
+				goto bad_msg;
+			byte = strtoul(cur + 1, &end, 16);
+			if (end == cur + 1 || byte > 0xff)
+				goto bad_msg;
+			rem_dest->dgid.raw[i] = (unsigned char) byte;
+			cur = end;
+		}
+	}
+
+	/* Bounded formatting: an out-of-range field must not overrun msg. */
+	len = snprintf(msg, sizeof msg, "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		       (unsigned) my_dest->lid, (unsigned) my_dest->qpn,
+		       (unsigned) my_dest->psn, my_dest->rkey, my_dest->vaddr,
+		       my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2],
+		       my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5],
+		       my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8],
+		       my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+		       my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14],
+		       my_dest->dgid.raw[15]);
+	if (len < 0 || (size_t) len >= sizeof msg) {
+		fprintf(stderr, "Couldn't send local address\n");
+		goto fail;
+	}
+	if (write(connfd, msg, sizeof msg) != sizeof msg) {
+		perror("server write");
+		fprintf(stderr, "Couldn't send local address\n");
+		goto fail;
+	}
+	goto out;
+
+bad_msg:
+	fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
+fail:
+	free(rem_dest);
+	rem_dest = NULL;
+out:
+	return rem_dest;
+}
+
+/*
+ * Allocate and initialise every verbs resource needed by the test on
+ * device ib_dev: the work buffer (size * 2 * numofqps bytes, page aligned),
+ * device context, PD, MR, one shared CQ and numofqps QPs moved to INIT.
+ *
+ * When user_parm->mtu is 0 it is set here: 1024 for vendor part 23108 or
+ * when a GID index is in use, 2048 otherwise.
+ *
+ * Returns the new context, or NULL on failure; on failure everything
+ * acquired so far is released again.
+ */
+static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev,
+					    unsigned size,
+					    int tx_depth, int port, struct user_parameters *user_parm)
+{
+	struct pingpong_context *ctx;
+	struct ibv_device_attr device_attr;
+	size_t buf_len;
+	int counter;
+
+	if (user_parm->numofqps < 1) {
+		fprintf(stderr, "Number of QPs must be at least 1\n");
+		return NULL;
+	}
+
+	/* Zeroed so the error path can tell which resources exist. */
+	ctx = calloc(1, sizeof *ctx);
+	if (!ctx)
+		return NULL;
+	ctx->size     = size;
+	ctx->tx_depth = tx_depth;
+
+	/* Computed in size_t: the product can exceed 32 bits for many QPs. */
+	buf_len = (size_t) size * 2 * user_parm->numofqps;
+
+	ctx->qp   = calloc(user_parm->numofqps, sizeof *ctx->qp);
+	ctx->scnt = calloc(user_parm->numofqps, sizeof *ctx->scnt);
+	ctx->ccnt = calloc(user_parm->numofqps, sizeof *ctx->ccnt);
+	if (!ctx->qp || !ctx->scnt || !ctx->ccnt) {
+		perror("malloc");
+		goto err;
+	}
+
+	ctx->buf = memalign(page_size, buf_len);
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		goto err;
+	}
+
+	memset(ctx->buf, 0, buf_len);
+
+	ctx->context = ibv_open_device(ib_dev);
+	if (!ctx->context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		goto err;
+	}
+	if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
+		if (ibv_query_device(ctx->context, &device_attr)) {
+			fprintf(stderr, "Failed to query device props");
+			goto err;
+		}
+		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) {
+			user_parm->mtu = 1024;
+		} else {
+			user_parm->mtu = 2048;
+		}
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		goto err;
+	}
+
+	/* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
+	 * The Consumer is not allowed to assign Remote Write or Remote Atomic to
+	 * a Memory Region that has not been assigned Local Write. */
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, buf_len,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't allocate MR\n");
+		goto err;
+	}
+
+	ctx->cq = ibv_create_cq(ctx->context, tx_depth * user_parm->numofqps , NULL, NULL, 0);
+	if (!ctx->cq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		goto err;
+	}
+	for (counter = 0; counter < user_parm->numofqps; counter++) {
+		struct ibv_qp_init_attr initattr;
+		struct ibv_qp_attr attr;
+
+		memset(&initattr, 0, sizeof initattr);
+		memset(&attr, 0, sizeof attr);
+		initattr.send_cq = ctx->cq;
+		initattr.recv_cq = ctx->cq;
+		initattr.cap.max_send_wr  = tx_depth;
+		/* Work around:  driver doesnt support
+		 * recv_wr = 0 */
+		initattr.cap.max_recv_wr  = 1;
+		initattr.cap.max_send_sge = 1;
+		initattr.cap.max_recv_sge = 1;
+		initattr.cap.max_inline_data = user_parm->inline_size;
+
+		if (user_parm->connection_type == UC) {
+			initattr.qp_type = IBV_QPT_UC;
+		} else {
+			initattr.qp_type = IBV_QPT_RC;
+		}
+		ctx->qp[counter] = ibv_create_qp(ctx->pd, &initattr);
+		if (!ctx->qp[counter]) {
+			fprintf(stderr, "Couldn't create QP\n");
+			goto err;
+		}
+
+		attr.qp_state        = IBV_QPS_INIT;
+		attr.pkey_index      = 0;
+		attr.port_num        = port;
+		attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
+
+		if (ibv_modify_qp(ctx->qp[counter], &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_PKEY_INDEX         |
+				  IBV_QP_PORT               |
+				  IBV_QP_ACCESS_FLAGS)) {
+			fprintf(stderr, "Failed to modify QP to INIT\n");
+			goto err;
+		}
+	}
+
+	return ctx;
+
+err:
+	/* Tear down in reverse order of creation; untouched members are NULL. */
+	if (ctx->qp) {
+		for (counter = 0; counter < user_parm->numofqps; counter++)
+			if (ctx->qp[counter])
+				ibv_destroy_qp(ctx->qp[counter]);
+	}
+	if (ctx->cq)
+		ibv_destroy_cq(ctx->cq);
+	if (ctx->mr)
+		ibv_dereg_mr(ctx->mr);
+	if (ctx->pd)
+		ibv_dealloc_pd(ctx->pd);
+	if (ctx->context)
+		ibv_close_device(ctx->context);
+	free(ctx->buf);
+	free(ctx->ccnt);
+	free(ctx->scnt);
+	free(ctx->qp);
+	free(ctx);
+	return NULL;
+}
+
+/*
+ * Connect QP number qpindex of ctx to the remote endpoint described by
+ * dest: move it INIT -> RTR -> RTS with the attribute set required by the
+ * connection type (RC or UC).
+ *
+ * Addressing uses the remote LID and the global `sl` when
+ * user_parm->gid_index is negative; otherwise a global route (GRH) to
+ * dest->dgid is used, sourced from the local GID at gid_index.
+ *
+ * Returns 0 on success, 1 on failure (a diagnostic is printed to stderr).
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
+			  struct pingpong_dest *dest, struct user_parameters *user_parm, int qpindex)
+{
+	struct ibv_qp_attr attr;
+	memset(&attr, 0, sizeof attr);
+
+	attr.qp_state = IBV_QPS_RTR;
+	switch (user_parm->mtu) {
+	case 256 :
+		attr.path_mtu = IBV_MTU_256;
+		break;
+	case 512 :
+		attr.path_mtu = IBV_MTU_512;
+		break;
+	case 1024 :
+		attr.path_mtu = IBV_MTU_1024;
+		break;
+	case 2048 :
+		attr.path_mtu = IBV_MTU_2048;
+		break;
+	case 4096 :
+		attr.path_mtu = IBV_MTU_4096;
+		break;
+	default:
+		/* Without this, path_mtu stays 0 and the RTR transition fails obscurely. */
+		fprintf(stderr, "Invalid MTU %d (expected 256, 512, 1024, 2048 or 4096)\n",
+			user_parm->mtu);
+		return 1;
+	}
+	printf("Mtu : %d\n", user_parm->mtu);
+	attr.dest_qp_num = dest->qpn;
+	attr.rq_psn      = dest->psn;
+	if (user_parm->connection_type == RC) {
+		attr.max_dest_rd_atomic = 1;
+		attr.min_rnr_timer      = 12;
+	}
+	if (user_parm->gid_index < 0) {
+		attr.ah_attr.is_global = 0;
+		attr.ah_attr.dlid      = dest->lid;
+		attr.ah_attr.sl        = sl;
+	} else {
+		attr.ah_attr.is_global      = 1;
+		attr.ah_attr.grh.dgid       = dest->dgid;
+		/* Source GID must be the one advertised to the peer, not always index 0. */
+		attr.ah_attr.grh.sgid_index = user_parm->gid_index;
+		attr.ah_attr.grh.hop_limit  = 1;
+		attr.ah_attr.sl             = 0;
+	}
+	attr.ah_attr.src_path_bits = 0;
+	attr.ah_attr.port_num      = port;
+	if (user_parm->connection_type == RC) {
+		if (ibv_modify_qp(ctx->qp[qpindex], &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_AV                 |
+				  IBV_QP_PATH_MTU           |
+				  IBV_QP_DEST_QPN           |
+				  IBV_QP_RQ_PSN             |
+				  IBV_QP_MIN_RNR_TIMER      |
+				  IBV_QP_MAX_DEST_RD_ATOMIC)) {
+			fprintf(stderr, "Failed to modify RC QP to RTR\n");
+			return 1;
+		}
+		/* Retry behaviour only exists on reliable connections. */
+		attr.timeout   = user_parm->qp_timeout;
+		attr.retry_cnt = 7;
+		attr.rnr_retry = 7;
+	} else {
+		if (ibv_modify_qp(ctx->qp[qpindex], &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_AV                 |
+				  IBV_QP_PATH_MTU           |
+				  IBV_QP_DEST_QPN           |
+				  IBV_QP_RQ_PSN)) {
+			fprintf(stderr, "Failed to modify UC QP to RTR\n");
+			return 1;
+		}
+	}
+
+	attr.qp_state      = IBV_QPS_RTS;
+	attr.sq_psn        = my_psn;
+	attr.max_rd_atomic = 1;
+	if (user_parm->connection_type == RC) {
+		if (ibv_modify_qp(ctx->qp[qpindex], &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_SQ_PSN             |
+				  IBV_QP_TIMEOUT            |
+				  IBV_QP_RETRY_CNT          |
+				  IBV_QP_RNR_RETRY          |
+				  IBV_QP_MAX_QP_RD_ATOMIC)) {
+			fprintf(stderr, "Failed to modify RC QP to RTS\n");
+			return 1;
+		}
+	} else {
+		if (ibv_modify_qp(ctx->qp[qpindex], &attr,
+				  IBV_QP_STATE              |
+				  IBV_QP_SQ_PSN)) {
+			fprintf(stderr, "Failed to modify UC QP to RTS\n");
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Print the command-line synopsis and the option summary to stdout.
+ * argv0 is the program name shown in the synopsis lines.
+ */
+static void usage(const char *argv0)
+{
+	/* Option help lines; none of them contains a conversion specifier. */
+	static const char *const option_help[] = {
+		" -p, --port=<port> listen on/connect to port <port> (default 18515)\n",
+		" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n",
+		" -i, --ib-port=<port> use port <port> of IB device (default 1)\n",
+		" -c, --connection=<RC/UC> connection type RC/UC (default RC)\n",
+		" -m, --mtu=<mtu> mtu size (256 - 4096. default for hermon is 2048)\n",
+		" -g, --post=<num of posts> number of posts for each qp in the chain (default tx_depth)\n",
+		" -q, --qp=<num of qp's> Num of qp's(default 1)\n",
+		" -s, --size=<size> size of message to exchange (default 65536)\n",
+		" -a, --all Run sizes from 2 till 2^23\n",
+		" -t, --tx-depth=<dep> size of tx queue (default 100)\n",
+		" -n, --iters=<iters> number of exchanges (at least 2, default 5000)\n",
+		" -I, --inline_size=<size> max size of message to be sent in inline mode (default 400)\n",
+		" -u, --qp-timeout=<timeout> QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n",
+		" -S, --sl=<sl> SL (default 0)\n",
+		" -x, --gid-index=<index> test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n",
+		" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n",
+		" -V, --version display version number\n",
+		" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n",
+	};
+	size_t line;
+
+	fputs("Usage:\n", stdout);
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf(" %s <host> connect to server at <host>\n", argv0);
+	fputs("\n", stdout);
+	fputs("Options:\n", stdout);
+
+	for (line = 0; line < sizeof option_help / sizeof option_help[0]; ++line)
+		fputs(option_help[line], stdout);
+}
+
+/*
+ * Print one result line: message size, iteration count, peak bandwidth and
+ * average bandwidth (both in MB/sec, 1 MB = 0x100000 bytes).
+ *
+ * tposted / tcompleted hold iters * numofqps cycle-counter samples. The
+ * peak is taken from the (post i, completion j >= i) pair with the smallest
+ * cycles-per-message; the average spans the first post to the last
+ * completion. Cycles are converted with get_cpu_mhz(no_cpu_freq_fail).
+ * Byte counts are doubled when duplex is set.
+ */
+static void print_report(unsigned int iters, unsigned size, int duplex,
+			 cycles_t *tposted, cycles_t *tcompleted, struct user_parameters *user_param, int no_cpu_freq_fail)
+{
+	double cycles_to_units;
+	double tsize;	/* Bytes moved per work request */
+	unsigned int total = iters * user_param->numofqps;
+	unsigned int i, j;
+	cycles_t opt_delta;
+	cycles_t t;
+
+	if (total == 0)
+		return;
+
+	opt_delta = tcompleted[0] - tposted[0];
+
+	/* Find the peak bandwidth */
+	for (i = 0; i < total; ++i)
+		for (j = i; j < total; ++j) {
+			t = (tcompleted[j] - tposted[i]) / (j - i + 1);
+			if (t < opt_delta)
+				opt_delta = t;
+		}
+
+	cycles_to_units = get_cpu_mhz(no_cpu_freq_fail) * 1000000;
+
+	/* Kept in double: the total byte count overflows a 32-bit integer. */
+	tsize = duplex ? 2 : 1;
+	tsize = tsize * size;
+	printf("%7u %u %7.2f %7.2f\n",
+	       size, iters, tsize * cycles_to_units / opt_delta / 0x100000,
+	       tsize * iters * user_param->numofqps * cycles_to_units / (tcompleted[total - 1] - tposted[0]) / 0x100000);
+}
+/*
+ * Run one bandwidth measurement for messages of `size` bytes.
+ *
+ * Every QP posts user_param->iters RDMA writes as chains (post lists) of up
+ * to maxpostsofqpiniteration work requests, all targeting rem_dest[0]'s
+ * vaddr/rkey. A QP re-posts once more than a batch threshold of its chain
+ * slots is free, or whatever is free when 40 or fewer requests remain.
+ * One completion is reaped from the shared CQ per pass.
+ *
+ * The global tposted[] receives the post timestamp of every work request
+ * (identical for all requests of one chain) and tcompleted[] the timestamp
+ * of every completion; both must hold iters * numofqps entries.
+ *
+ * Returns 0 on success, -1 if the work-request array cannot be allocated,
+ * 1 on any other failure (a diagnostic is printed to stderr).
+ */
+int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param,
+	     struct pingpong_dest **rem_dest, int size)
+{
+	const int chain_len = user_param->maxpostsofqpiniteration;
+	const int total = user_param->iters * user_param->numofqps;
+	struct ibv_qp *qp;
+	int totscnt, totccnt;
+	int index, qpindex;
+	int numpostperqp;
+	int batch_min;
+	int rc = 1;
+	struct ibv_send_wr *wrlist;
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+
+	/* The chains below are laid out inside a numofqps * tx_depth array. */
+	if (chain_len < 1 || chain_len > user_param->tx_depth) {
+		fprintf(stderr, "Invalid number of posts per qp: %d (tx depth %d)\n",
+			chain_len, user_param->tx_depth);
+		return 1;
+	}
+	/* Re-post when more than 40 slots are free; for chains of 40 or fewer
+	 * that can never happen, so wait for the whole chain to drain instead. */
+	batch_min = chain_len > 40 ? 40 : chain_len - 1;
+
+	wrlist = malloc(user_param->numofqps * sizeof (struct ibv_send_wr) * user_param->tx_depth);
+	if (!wrlist) {
+		perror("malloc");
+		return -1;
+	}
+	ctx->list.addr   = (uintptr_t) ctx->buf;
+	ctx->list.length = size;
+	ctx->list.lkey   = ctx->mr->lkey;
+
+	/* prepare the wqe */
+	ctx->wr.sg_list = &ctx->list;
+	ctx->wr.num_sge = 1;
+	ctx->wr.opcode  = IBV_WR_RDMA_WRITE;
+	if (size > user_param->inline_size) {/* compliance to perf_main */
+		ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	} else {
+		ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+	}
+	ctx->wr.next = NULL;
+	/*These should be the i'th place ... */
+	ctx->wr.wr.rdma.remote_addr = rem_dest[0]->vaddr;
+	ctx->wr.wr.rdma.rkey        = rem_dest[0]->rkey;
+	/* Build one linked chain per QP; wr_id carries the QP index. */
+	for (qpindex = 0; qpindex < user_param->numofqps; qpindex++) {
+		struct ibv_send_wr *chain = &wrlist[qpindex * chain_len];
+
+		for (index = 0; index < chain_len; index++) {
+			chain[index]       = ctx->wr;
+			chain[index].wr_id = qpindex;
+			chain[index].next  = (index < chain_len - 1) ? &chain[index + 1] : NULL;
+		}
+	}
+	totscnt = 0;
+	totccnt = 0;
+	/*clear the scnt ccnt counters for each iteration*/
+	for (index = 0; index < user_param->numofqps; index++) {
+		ctx->scnt[index] = 0;
+		ctx->ccnt[index] = 0;
+	}
+
+	/* Done with setup. Start the test. */
+
+	while (totscnt < total || totccnt < total) {
+		/* main loop to run over all the qps and post for each accumulated batch of wq's */
+		for (qpindex = 0; qpindex < user_param->numofqps; qpindex++) {
+			struct ibv_send_wr *chain, *tail;
+			cycles_t posted_at;
+			int remaining = user_param->iters - ctx->scnt[qpindex];
+
+			if (remaining <= 0)
+				continue;
+
+			qp = ctx->qp[qpindex];
+			numpostperqp = chain_len - (ctx->scnt[qpindex] - ctx->ccnt[qpindex]);
+			if (!(numpostperqp > batch_min || (remaining <= 40 && numpostperqp > 0)))
+				continue;
+
+			/* Never post past the requested iteration count: the surplus
+			 * would overrun tposted[] and leave stray completions behind. */
+			if (numpostperqp > remaining)
+				numpostperqp = remaining;
+
+			chain = &wrlist[qpindex * chain_len];
+			tail  = &chain[numpostperqp - 1];
+			/* Temporarily cut the chain after the last request of this batch. */
+			tail->next = NULL;
+			posted_at = get_cycles();
+			if (ibv_post_send(qp, chain, &bad_wr)) {
+				fprintf(stderr, "Couldn't post %d send: qp index = %d qp scnt=%d total scnt %d qp scnt=%d total ccnt=%d\n",
+					numpostperqp,qpindex,ctx->scnt[qpindex],totscnt,ctx->ccnt[qpindex],totccnt);
+				goto out;
+			}
+			/* Every request of the batch shares the same post timestamp. */
+			for (index = 0; index < numpostperqp; index++)
+				tposted[totscnt + index] = posted_at;
+			ctx->scnt[qpindex] += numpostperqp;
+			totscnt += numpostperqp;
+			/* Re-link the chain unless the cut was at its real end. */
+			if (numpostperqp < chain_len)
+				tail->next = tail + 1;
+		}
+		/*FINISHED POSTING */
+		if (totccnt < total) {
+			int ne;
+			do {
+				ne = ibv_poll_cq(ctx->cq, 1, &wc);
+			} while (ne == 0);
+			tcompleted[totccnt] = get_cycles();
+			if (ne < 0) {
+				fprintf(stderr, "poll CQ failed %d\n", ne);
+				goto out;
+			}
+			if (wc.status != IBV_WC_SUCCESS) {
+				fprintf(stderr, "Completion wth error at %s:\n",
+					user_param->servername ? "client" : "server");
+				fprintf(stderr, "Failed status %d: wr_id %d\n",
+					wc.status, (int) wc.wr_id);
+				fprintf(stderr, "qp index %d ,qp scnt=%d, qp ccnt=%d total scnt %d total ccnt %d\n",
+					(int)wc.wr_id, ctx->scnt[(int)wc.wr_id], ctx->ccnt[(int)wc.wr_id], totscnt, totccnt);
+				goto out;
+			}
+			/*here the id is the index to the qp num */
+			ctx->ccnt[(int)wc.wr_id] = ctx->ccnt[(int)wc.wr_id]+1;
+			totccnt += 1;
+		}
+	}
+	rc = 0;
+out:
+	free(wrlist);
+	return rc;
+}
+
+/*
+ * RDMA write post-list bandwidth test.
+ *
+ * Parses the command line, opens the IB device, creates numofqps QPs,
+ * exchanges addressing data with the peer over a TCP socket (client when a
+ * server name is given, server otherwise), connects the QPs and runs the
+ * measurement for one message size or, with -a, for every power of two from
+ * 2 to 2^23. A half-duplex server only waits for the client to finish.
+ *
+ * Returns 0 on success and a non-zero status on any failure.
+ */
+int main(int argc, char *argv[])
+{
+	struct ibv_device      **dev_list;
+	struct ibv_device       *ib_dev;
+	struct pingpong_context *ctx;
+	struct pingpong_dest    *my_dest;
+	struct pingpong_dest   **rem_dest;
+	struct pingpong_dest    *sync_dest;
+	struct user_parameters   user_param;
+	struct ibv_device_attr   device_attribute;
+	char                    *ib_devname = NULL;
+	int                      port = 18515;
+	int                      ib_port = 1;
+	long long                size = 65536;
+	int                      sockfd;
+	int                      duplex = 0;
+	int                      i = 0;
+	int                      inline_given_in_cmd = 0;
+	struct ibv_context      *context;
+	int                      no_cpu_freq_fail = 0;
+	union ibv_gid            gid;
+
+	/* The GID is copied into every my_dest and sent to the peer even when
+	 * no GID index was requested, so it must never be uninitialised. */
+	memset(&gid, 0, sizeof gid);
+
+	/* init default values to user's parameters */
+	memset(&user_param, 0, sizeof(struct user_parameters));
+	user_param.mtu = 0;
+	user_param.iters = 5000;
+	user_param.tx_depth = 100;
+	user_param.servername = NULL;
+	user_param.numofqps = 1;
+	user_param.maxpostsofqpiniteration = 100;
+	user_param.inline_size = MAX_INLINE;
+	user_param.qp_timeout = 14;
+	user_param.gid_index = -1; /*gid will not be used*/
+	/* Parameter parsing. */
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port",           .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev",         .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port",        .has_arg = 1, .val = 'i' },
+			{ .name = "mtu",            .has_arg = 1, .val = 'm' },
+			{ .name = "qp",             .has_arg = 1, .val = 'q' },
+			{ .name = "post",           .has_arg = 1, .val = 'g' },
+			{ .name = "connection",     .has_arg = 1, .val = 'c' },
+			{ .name = "size",           .has_arg = 1, .val = 's' },
+			{ .name = "iters",          .has_arg = 1, .val = 'n' },
+			{ .name = "tx-depth",       .has_arg = 1, .val = 't' },
+			{ .name = "inline_size",    .has_arg = 1, .val = 'I' },
+			{ .name = "qp-timeout",     .has_arg = 1, .val = 'u' },
+			{ .name = "sl",             .has_arg = 1, .val = 'S' },
+			{ .name = "gid-index",      .has_arg = 1, .val = 'x' },
+			{ .name = "all",            .has_arg = 0, .val = 'a' },
+			{ .name = "bidirectional",  .has_arg = 0, .val = 'b' },
+			{ .name = "version",        .has_arg = 0, .val = 'V' },
+			{ .name = "CPU-freq",       .has_arg = 0, .val = 'F' },
+			{ 0 }
+		};
+
+		c = getopt_long(argc, argv, "p:d:i:m:q:g:c:s:n:t:I:u:S:x:baVF", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			port = strtol(optarg, NULL, 0);
+			if (port < 0 || port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+		case 'c':
+			if (strcmp("UC",optarg)==0)
+				user_param.connection_type=UC;
+			break;
+
+		case 'm':
+			user_param.mtu = strtol(optarg, NULL, 0);
+			break;
+		case 'q':
+			user_param.numofqps = strtol(optarg, NULL, 0);
+			/* Zero or negative would size every per-QP array wrongly. */
+			if (user_param.numofqps < 1) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+		case 'g':
+			user_param.maxpostsofqpiniteration = strtol(optarg, NULL, 0);
+			/* A chain needs at least one work request. */
+			if (user_param.maxpostsofqpiniteration < 1) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+		case 'a':
+			user_param.all = ALL;
+			break;
+		case 'V':
+			printf("rdma_bw version : %.2f\n",VERSION);
+			return 0;
+			break;
+		case 'i':
+			ib_port = strtol(optarg, NULL, 0);
+			if (ib_port < 0) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 's':
+			size = strtoll(optarg, NULL, 0);
+			if (size < 1 || size > UINT_MAX / 2) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 't':
+			user_param.tx_depth = strtol(optarg, NULL, 0);
+			if (user_param.tx_depth < 1) { usage(argv[0]); return 1; }
+			break;
+
+		case 'I':
+			user_param.inline_size = strtol(optarg, NULL, 0);
+			inline_given_in_cmd =1;
+			if (user_param.inline_size > MAX_INLINE) {
+				usage(argv[0]);
+				return 7;
+			}
+
+			break;
+
+		case 'n':
+			user_param.iters = strtol(optarg, NULL, 0);
+			if (user_param.iters < 2) {
+				usage(argv[0]);
+				return 1;
+			}
+
+			break;
+
+		case 'b':
+			duplex = 1;
+			break;
+
+		case 'F':
+			no_cpu_freq_fail = 1;
+			break;
+
+		case 'u':
+			user_param.qp_timeout = strtol(optarg, NULL, 0);
+			break;
+
+		case 'S':
+			sl = strtol(optarg, NULL, 0);
+			if (sl > 15) { usage(argv[0]); return 1; }
+			break;
+
+		case 'x':
+			user_param.gid_index = strtol(optarg, NULL, 0);
+			if (user_param.gid_index > 63) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (optind == argc - 1)
+		user_param.servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	printf("------------------------------------------------------------------\n");
+	if (duplex == 1) {
+		printf(" RDMA_Write Bidirectional Post List BW Test\n");
+	} else {
+		printf(" RDMA_Write Post List BW Test\n");
+	}
+
+	printf("Number of qp's running %d\n",user_param.numofqps);
+	if (user_param.connection_type==RC) {
+		printf("Connection type : RC\n");
+	} else {
+		printf("Connection type : UC\n");
+	}
+	if (user_param.maxpostsofqpiniteration > user_param.tx_depth ) {
+		printf("Can not post more than tx_depth , adjusting number of post to tx_depth\n");
+		user_param.maxpostsofqpiniteration = user_param.tx_depth;
+	} else {
+		printf("Each Qp will post %d messages each time\n",user_param.maxpostsofqpiniteration);
+	}
+	if (user_param.gid_index > -1) {
+		printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n");
+	}
+	/* Done with parameter parsing. Perform setup. */
+	if (user_param.all == ALL) {
+		/*since we run all sizes */
+		size = 8388608; /*2^23 */
+	}
+	srand48(getpid() * time(NULL));
+
+	page_size = sysconf(_SC_PAGESIZE);
+
+	dev_list = ibv_get_device_list(NULL);
+	if (!dev_list) {
+		fprintf(stderr, "Failed to get IB devices list\n");
+		return 1;
+	}
+
+	if (!ib_devname) {
+		ib_dev = dev_list[0];
+		if (!ib_dev) {
+			fprintf(stderr, "No IB devices found\n");
+			return 1;
+		}
+	} else {
+		for (; (ib_dev = *dev_list); ++dev_list)
+			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+				break;
+		if (!ib_dev) {
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+			return 1;
+		}
+	}
+
+	/* Temporary context, only used to read the vendor part id. */
+	context = ibv_open_device(ib_dev);
+	if (!context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		return 1;
+	}
+	if (ibv_query_device(context, &device_attribute)) {
+		fprintf(stderr, "Failed to query device props");
+		ibv_close_device(context);
+		return 1;
+	}
+	ibv_close_device(context);
+	if ((device_attribute.vendor_part_id == 25408 ||
+	     device_attribute.vendor_part_id == 25418 ||
+	     device_attribute.vendor_part_id == 26408 ||
+	     device_attribute.vendor_part_id == 26418 ||
+	     device_attribute.vendor_part_id == 26428) && (!inline_given_in_cmd)) {
+		user_param.inline_size = 1;
+	}
+	printf("Inline data is used up to %d bytes message\n", user_param.inline_size);
+
+	ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port, &user_param);
+	if (!ctx)
+		return 1;
+
+	/* Same "GID in use" test as everywhere else: any non-negative index. */
+	if (user_param.gid_index > -1) {
+		int err=0;
+		err = ibv_query_gid (ctx->context, ib_port, user_param.gid_index, &gid);
+		if (err) {
+			fprintf(stderr, "Failed to query GID index %d\n", user_param.gid_index);
+			return -1;
+		}
+		ctx->dgid=gid;
+	}
+
+
+	if (user_param.servername) {
+		sockfd = pp_client_connect(user_param.servername, port);
+		if (sockfd < 0)
+			return 1;
+	} else {
+		sockfd = pp_server_connect(port);
+		if (sockfd < 0)
+			return 1;
+	}
+
+	my_dest  = calloc(user_param.numofqps, sizeof *my_dest);
+	rem_dest = calloc(user_param.numofqps, sizeof *rem_dest);
+	if (!my_dest || !rem_dest) {
+		perror("malloc");
+		return 1;
+	}
+
+	for (i =0 ; i<user_param.numofqps; i++) {
+		/* Create connection between client and server.
+		 * We do it by exchanging data over a TCP socket connection. */
+		my_dest[i].lid = pp_get_local_lid(ctx, ib_port);
+		my_dest[i].psn = lrand48() & 0xffffff;
+		if (user_param.gid_index < 0) {/*We do not fail test upon lid in RDMA0E/Eth conf*/
+			if (!my_dest[i].lid) {
+				fprintf(stderr, "Local lid 0x0 detected. Is an SM running? If you are running on an RMDAoE interface you must use GIDs\n");
+				return 1;
+			}
+		}
+		my_dest[i].dgid = gid;
+		my_dest[i].qpn = ctx->qp[i]->qp_num;
+		/* TBD this should be changed into VA and different key to each qp */
+		my_dest[i].rkey = ctx->mr->rkey;
+		my_dest[i].vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+		printf(" local address: LID %#04x, QPN %#06x, PSN %#06x "
+		       "RKey %#08x VAddr %#016llx\n",
+		       my_dest[i].lid, my_dest[i].qpn, my_dest[i].psn,
+		       my_dest[i].rkey, my_dest[i].vaddr);
+		if (user_param.gid_index > -1) {
+			printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+			       my_dest[i].dgid.raw[0],my_dest[i].dgid.raw[1],
+			       my_dest[i].dgid.raw[2], my_dest[i].dgid.raw[3], my_dest[i].dgid.raw[4],
+			       my_dest[i].dgid.raw[5], my_dest[i].dgid.raw[6], my_dest[i].dgid.raw[7],
+			       my_dest[i].dgid.raw[8], my_dest[i].dgid.raw[9], my_dest[i].dgid.raw[10],
+			       my_dest[i].dgid.raw[11], my_dest[i].dgid.raw[12], my_dest[i].dgid.raw[13],
+			       my_dest[i].dgid.raw[14], my_dest[i].dgid.raw[15]);
+		}
+		if (user_param.servername) {
+			rem_dest[i] = pp_client_exch_dest(sockfd, &my_dest[i], &user_param);
+		} else {
+			rem_dest[i] = pp_server_exch_dest(sockfd, &my_dest[i], &user_param);
+		}
+		if (!rem_dest[i])
+			return 1;
+		printf(" remote address: LID %#04x, QPN %#06x, PSN %#06x, "
+		       "RKey %#08x VAddr %#016llx\n",
+		       rem_dest[i]->lid, rem_dest[i]->qpn, rem_dest[i]->psn,
+		       rem_dest[i]->rkey, rem_dest[i]->vaddr);
+		if (user_param.gid_index > -1) {
+			printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+			       rem_dest[i]->dgid.raw[0],rem_dest[i]->dgid.raw[1],
+			       rem_dest[i]->dgid.raw[2], rem_dest[i]->dgid.raw[3], rem_dest[i]->dgid.raw[4],
+			       rem_dest[i]->dgid.raw[5], rem_dest[i]->dgid.raw[6], rem_dest[i]->dgid.raw[7],
+			       rem_dest[i]->dgid.raw[8], rem_dest[i]->dgid.raw[9], rem_dest[i]->dgid.raw[10],
+			       rem_dest[i]->dgid.raw[11], rem_dest[i]->dgid.raw[12], rem_dest[i]->dgid.raw[13],
+			       rem_dest[i]->dgid.raw[14], rem_dest[i]->dgid.raw[15]);
+		}
+		if (pp_connect_ctx(ctx, ib_port, my_dest[i].psn, rem_dest[i], &user_param, i))
+			return 1;
+
+		/* An additional handshake is required *after* moving qp to RTR.
+		   Arbitrarily reuse exch_dest for this purpose. */
+		if (user_param.servername) {
+			sync_dest = pp_client_exch_dest(sockfd, &my_dest[i], &user_param);
+		} else {
+			sync_dest = pp_server_exch_dest(sockfd, &my_dest[i], &user_param);
+		}
+		/* run_iter() dereferences rem_dest[0]; a failed handshake must stop here. */
+		if (!sync_dest)
+			return 1;
+		free(rem_dest[i]);
+		rem_dest[i] = sync_dest;
+	}
+
+	printf("------------------------------------------------------------------\n");
+	printf(" #bytes #iterations BW peak[MB/sec] BW average[MB/sec] \n");
+	/* For half duplex tests, server just waits for client to exit */
+	/* the 0th place is arbitrary to signal finish ... */
+	if (!user_param.servername && !duplex) {
+		sync_dest = pp_server_exch_dest(sockfd, &my_dest[0], &user_param);
+		free(rem_dest[0]);
+		rem_dest[0] = sync_dest;
+		if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+			perror("server write");
+			fprintf(stderr, "Couldn't write to socket\n");
+			return 1;
+		}
+		close(sockfd);
+		return 0;
+	}
+
+	tposted = malloc(user_param.iters * user_param.numofqps * sizeof *tposted);
+
+	if (!tposted) {
+		perror("malloc");
+		return 1;
+	}
+
+	tcompleted = malloc(user_param.iters * user_param.numofqps * sizeof *tcompleted);
+
+	if (!tcompleted) {
+		perror("malloc");
+		return 1;
+	}
+
+	if (user_param.all == ALL) {
+		for (i = 1; i < 24 ; ++i) {
+			size = 1 << i;
+			if(run_iter(ctx, &user_param, rem_dest, size))
+				return 17;
+			print_report(user_param.iters, size, duplex, tposted, tcompleted, &user_param, no_cpu_freq_fail);
+		}
+	} else {
+		if(run_iter(ctx, &user_param, rem_dest, size))
+			return 18;
+		print_report(user_param.iters, size, duplex, tposted, tcompleted, &user_param, no_cpu_freq_fail);
+	}
+	/* the 0th place is arbitrary to signal finish ... */
+	if (user_param.servername) {
+		sync_dest = pp_client_exch_dest(sockfd, &my_dest[0], &user_param);
+	} else {
+		sync_dest = pp_server_exch_dest(sockfd, &my_dest[0], &user_param);
+	}
+	free(rem_dest[0]);
+	rem_dest[0] = sync_dest;
+
+	if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+		perror("write");
+		fprintf(stderr, "Couldn't write to socket\n");
+		return 1;
+	}
+	close(sockfd);
+
+	free(tposted);
+	free(tcompleted);
+
+	printf("------------------------------------------------------------------\n");
+	return 0;
+}
diff --git a/write_lat.c b/write_lat.c
new file mode 100755
index 0000000..e41981d
--- /dev/null
+++ b/write_lat.c
@@ -0,0 +1,1094 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler)
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <byteswap.h>
+#include <time.h>
+
+#include <infiniband/verbs.h>
+
+#include "get_clock.h"
+
+/* Work-request id stamped on the RDMA write request built in pp_init_ctx(). */
+#define PINGPONG_RDMA_WRID 3
+/* Version number printed by the -V option. */
+#define VERSION 1.0
+/* Value of user_parameters.all that requests a sweep over all message sizes. */
+#define ALL 1
+/* Default inline threshold in bytes, and the largest value -I accepts. */
+#define MAX_INLINE 400
+/* Service level placed in the address handle when GIDs are not used (-S). */
+static int sl = 0;
+/* System page size, set in main(); used as the work buffer alignment. */
+static int page_size;
+/* One cycle-counter sample per posted send, filled in by run_iter(). */
+cycles_t *tstamp;
+/* Test parameters collected from the command line in main(). */
+struct user_parameters {
+ /* Host to connect to; NULL when this process acts as the server. */
+ const char *servername;
+ /* 0 selects an RC queue pair, 1 selects UC. */
+ int connection_type;
+ /* Path MTU in bytes; 0 means "pick a default from the device". */
+ int mtu;
+ int all; /* run all msg size */
+ /* Number of exchanges to time. */
+ int iters;
+ /* Send queue / completion queue depth. */
+ int tx_depth;
+ /* Messages up to this many bytes are posted with IBV_SEND_INLINE. */
+ int inline_size;
+ /* QP timeout value applied to RC queue pairs. */
+ int qp_timeout;
+ int gid_index; /* if value not negative, we use gid AND gid_index=value */
+};
+/* Output switches consumed by print_report(). */
+struct report_options {
+ /* Non-zero: print every sample in measurement order (-U). */
+ int unsorted;
+ /* Non-zero: print every sample after sorting (-H). */
+ int histogram;
+ int cycles; /* report delta's in cycles, not microsec's */
+};
+
+
+/* Verbs resources and buffers for one side of the test (see pp_init_ctx()). */
+struct pingpong_context {
+ struct ibv_context *context;
+ struct ibv_pd *pd;
+ struct ibv_mr *mr;
+ struct ibv_cq *cq;
+ struct ibv_qp *qp;
+ /* Registered work buffer of 2 * size bytes. */
+ void *buf;
+ /* Last byte of the first half of buf: written before each post. */
+ volatile char *post_buf;
+ /* Last byte of the second half of buf: polled for the peer's write. */
+ volatile char *poll_buf;
+ /* Size of one half of buf, in bytes. */
+ int size;
+ int tx_depth;
+ /* Scatter/gather entry and RDMA write request reused for every post. */
+ struct ibv_sge list;
+ struct ibv_send_wr wr;
+ /* Local GID queried in pp_open_port() when a GID index is in use. */
+ union ibv_gid dgid;
+};
+
+/* Connection parameters of one endpoint, as exchanged over the TCP socket. */
+struct pingpong_dest {
+ int lid;
+ /* Queue pair number. */
+ int qpn;
+ /* Initial packet sequence number. */
+ int psn;
+ /* Remote key of the registered buffer. */
+ unsigned rkey;
+ /* Address inside the registered buffer that the peer should write to. */
+ unsigned long long vaddr;
+ union ibv_gid dgid;
+};
+
+
+/*
+ * Look up the LID of the given port on the opened device.
+ * Returns 0 when the port attributes cannot be queried.
+ */
+static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
+{
+    struct ibv_port_attr port_info;
+    int query_rc = ibv_query_port(ctx->context, port, &port_info);
+
+    return query_rc ? 0 : port_info.lid;
+}
+
+/*
+ * Select the IB device to run on.
+ *
+ * ib_devname: device name to look for, or NULL to take the first device.
+ *
+ * Returns the matching device, or NULL (after printing a message) when the
+ * device list cannot be obtained or no suitable device exists.
+ */
+static struct ibv_device *pp_find_dev(const char *ib_devname) {
+    struct ibv_device **dev_list;
+    struct ibv_device *ib_dev = NULL;
+
+    dev_list = ibv_get_device_list(NULL);
+    if (!dev_list) {
+        /* Previously dereferenced unconditionally. */
+        perror("ibv_get_device_list");
+        fprintf(stderr, "No IB devices found\n");
+        return NULL;
+    }
+
+    /*
+     * NOTE(review): the list is not released here because the returned
+     * device pointer is opened later by the caller - confirm against the
+     * ibv_free_device_list() contract before adding a free.
+     */
+    if (!ib_devname) {
+        ib_dev = dev_list[0];
+        if (!ib_dev)
+            fprintf(stderr, "No IB devices found\n");
+    } else {
+        for (; (ib_dev = *dev_list); ++dev_list)
+            if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+                break;
+        if (!ib_dev)
+            fprintf(stderr, "IB device %s not found\n", ib_devname);
+    }
+    return ib_dev;
+}
+
+/*
+ * Wire format of the connection parameters exchanged over TCP: the fields
+ * lid:qpn:psn:rkey:vaddr as zero-padded hex; the _GID variants append the
+ * 16 raw GID bytes. The *_SIZE macros are sizeof of a template string and so
+ * include the terminating NUL.
+ */
+#define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000")
+#define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016Lx"
+#define KEY_MSG_SIZE_GID (sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00")
+#define KEY_PRINT_FMT_GID "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x"
+
+/*
+ * Send this side's connection parameters to the peer over the TCP socket.
+ *
+ * The message is a fixed-size, NUL-terminated hex string: KEY_PRINT_FMT when
+ * GIDs are not in use (gid_index < 0), KEY_PRINT_FMT_GID otherwise.
+ *
+ * Returns 0 on success, -1 if the message could not be formatted or written.
+ */
+static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm)
+{
+    /* Sized for the larger (GID) layout so one buffer serves both formats. */
+    char msg[KEY_MSG_SIZE_GID];
+    size_t msg_len;
+    int formatted;
+
+    if (user_parm->gid_index < 0) {
+        msg_len = KEY_MSG_SIZE;
+        formatted = snprintf(msg, msg_len, KEY_PRINT_FMT, my_dest->lid, my_dest->qpn,
+                             my_dest->psn, my_dest->rkey, my_dest->vaddr);
+    } else {
+        msg_len = KEY_MSG_SIZE_GID;
+        formatted = snprintf(msg, msg_len, KEY_PRINT_FMT_GID, my_dest->lid, my_dest->qpn,
+                             my_dest->psn, my_dest->rkey, my_dest->vaddr,
+                             my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3],
+                             my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7],
+                             my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11],
+                             my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]);
+    }
+
+    /* A field wider than its slot no longer fits the fixed-size message. */
+    if (formatted < 0 || (size_t)formatted >= msg_len) {
+        fprintf(stderr, "Couldn't format local address\n");
+        return -1;
+    }
+
+    if (write(sockfd, msg, msg_len) != (ssize_t)msg_len) {
+        perror("client write");
+        fprintf(stderr, "Couldn't send local address\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Read exactly len bytes of a key message from the socket and force the last
+ * byte to NUL so the buffer can be handed to string functions.
+ * Returns 0 on success, -1 on read error or end of stream.
+ */
+static int pp_read_msg(int sockfd, char *msg, size_t len)
+{
+    size_t got = 0;
+
+    /* A stream socket may deliver the message in several pieces. */
+    while (got < len) {
+        ssize_t n = read(sockfd, msg + got, len - got);
+
+        if (n <= 0) {
+            perror("pp_read_keys");
+            fprintf(stderr, "Couldn't read remote address\n");
+            return -1;
+        }
+        got += (size_t)n;
+    }
+    msg[len - 1] = '\0';
+    return 0;
+}
+
+/*
+ * Parse one hex field that ends at ':' or at the end of the string and
+ * advance *cursor past it. Returns 0 on success, -1 on a malformed field.
+ */
+static int pp_parse_hex_field(const char **cursor, unsigned long long *value)
+{
+    const char *start = *cursor;
+    char *end;
+
+    *value = strtoull(start, &end, 16);
+    if (end == start)
+        return -1;
+    if (*end == ':')
+        *cursor = end + 1;
+    else if (*end == '\0')
+        *cursor = end;
+    else
+        return -1;
+    return 0;
+}
+
+/*
+ * Receive the peer's connection parameters from the TCP socket into rem_dest.
+ *
+ * The expected layout is KEY_PRINT_FMT when GIDs are not in use
+ * (gid_index < 0) and KEY_PRINT_FMT_GID otherwise. my_dest is not used.
+ *
+ * Returns 0 on success, -1 if the message could not be read or parsed.
+ */
+static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest,
+        struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
+{
+    (void)my_dest;
+
+    if (user_parm->gid_index < 0) {
+        int parsed;
+        char msg[KEY_MSG_SIZE];
+
+        if (pp_read_msg(sockfd, msg, sizeof msg))
+            return -1;
+
+        parsed = sscanf(msg, KEY_PRINT_FMT, &rem_dest->lid, &rem_dest->qpn,
+                        &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr);
+
+        if (parsed != 5) {
+            fprintf(stderr, "Couldn't parse line <%.*s>\n",
+                    (int)sizeof msg, msg);
+            return -1;
+        }
+
+        return 0;
+    } else {
+        char msg[KEY_MSG_SIZE_GID];
+        const char *cursor = msg;
+        unsigned long long head[5];   /* lid, qpn, psn, rkey, vaddr */
+        unsigned long long gid_byte[16];
+        int i;
+
+        if (pp_read_msg(sockfd, msg, sizeof msg))
+            return -1;
+
+        /* The data comes from the network: validate every field. */
+        for (i = 0; i < 5; ++i)
+            if (pp_parse_hex_field(&cursor, &head[i]))
+                goto bad_line;
+        for (i = 0; i < 16; ++i)
+            if (pp_parse_hex_field(&cursor, &gid_byte[i]))
+                goto bad_line;
+
+        rem_dest->lid = (int)head[0];
+        rem_dest->qpn = (int)head[1];
+        rem_dest->psn = (int)head[2];
+        rem_dest->rkey = (unsigned)head[3];
+        rem_dest->vaddr = head[4];
+        for (i = 0; i < 16; ++i)
+            rem_dest->dgid.raw[i] = (unsigned char)gid_byte[i];
+        return 0;
+
+bad_line:
+        fprintf(stderr, "Couldn't parse line <%.*s>\n",
+                (int)sizeof msg, msg);
+        return -1;
+    }
+}
+
+/*
+ * Open a TCP connection to servername:port, trying each address that
+ * getaddrinfo() returns until one connects.
+ *
+ * Returns the connected socket descriptor, or a negative value on failure.
+ */
+static int pp_client_connect(const char *servername, int port)
+{
+    struct addrinfo *res, *t;
+    struct addrinfo hints =
+    {
+        .ai_family = AF_UNSPEC,
+        .ai_socktype = SOCK_STREAM
+    };
+    char *service;
+    int n;
+    int sockfd = -1;
+
+    if (asprintf(&service, "%d", port) < 0)
+        return -1;
+
+    n = getaddrinfo(servername, service, &hints, &res);
+    /* The service string is only needed for the lookup. */
+    free(service);
+
+    /* getaddrinfo() reports failure with any non-zero code. */
+    if (n != 0) {
+        fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+        return n < 0 ? n : -1;
+    }
+
+    for (t = res; t; t = t->ai_next) {
+        sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+        if (sockfd >= 0) {
+            if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+                break;
+            close(sockfd);
+            sockfd = -1;
+        }
+    }
+
+    freeaddrinfo(res);
+
+    if (sockfd < 0)
+        fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+
+    return sockfd;
+}
+
+/*
+ * Client half of the parameter exchange: send our keys first, then read the
+ * peer's. Returns 0 on success, non-zero on failure.
+ */
+static int pp_client_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
+        struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
+{
+    int status = -1;
+
+    if (!pp_write_keys(sockfd, my_dest, user_parm))
+        status = pp_read_keys(sockfd, my_dest, rem_dest, user_parm);
+
+    return status;
+}
+
+/*
+ * Listen on the given TCP port and accept a single client connection.
+ * The listening socket is closed once the connection has been accepted.
+ *
+ * Returns the connected socket descriptor, or a negative value on failure.
+ */
+static int pp_server_connect(int port)
+{
+    struct addrinfo *res, *t;
+    struct addrinfo hints = {
+        .ai_flags = AI_PASSIVE,
+        .ai_family = AF_UNSPEC,
+        .ai_socktype = SOCK_STREAM
+    };
+    char *service;
+    int sockfd = -1, connfd;
+    int n;
+
+    if (asprintf(&service, "%d", port) < 0)
+        return -1;
+
+    n = getaddrinfo(NULL, service, &hints, &res);
+    /* The service string is only needed for the lookup. */
+    free(service);
+
+    /* getaddrinfo() reports failure with any non-zero code. */
+    if (n != 0) {
+        fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+        return n < 0 ? n : -1;
+    }
+
+    for (t = res; t; t = t->ai_next) {
+        sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+        if (sockfd >= 0) {
+            int reuse = 1;
+
+            setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof reuse);
+
+            if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+                break;
+            close(sockfd);
+            sockfd = -1;
+        }
+    }
+
+    freeaddrinfo(res);
+
+    if (sockfd < 0) {
+        fprintf(stderr, "Couldn't listen to port %d\n", port);
+        return sockfd;
+    }
+
+    if (listen(sockfd, 1) < 0) {
+        perror("server listen");
+        fprintf(stderr, "Couldn't listen to port %d\n", port);
+        close(sockfd);
+        return -1;
+    }
+
+    connfd = accept(sockfd, NULL, NULL);
+    if (connfd < 0) {
+        perror("server accept");
+        fprintf(stderr, "accept() failed\n");
+    }
+
+    close(sockfd);
+    return connfd;
+}
+
+/*
+ * Server half of the parameter exchange: read the peer's keys first, then
+ * send ours. Returns 0 on success, non-zero on failure.
+ */
+static int pp_server_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
+        struct pingpong_dest* rem_dest, struct user_parameters *user_parm)
+{
+    int status = -1;
+
+    if (!pp_read_keys(sockfd, my_dest, rem_dest, user_parm))
+        status = pp_write_keys(sockfd, my_dest, user_parm);
+
+    return status;
+}
+
+/*
+ * Allocate and initialise everything one side of the test needs: the work
+ * buffer, device context, protection domain, memory region, completion
+ * queue, and a queue pair moved to the INIT state.
+ *
+ * ib_dev:    device to open.
+ * size:      size of one half of the work buffer (2 * size is allocated).
+ * tx_depth:  CQ size and send queue depth.
+ * port:      IB port number the QP is bound to.
+ * user_parm: test parameters; mtu is filled in here when it is 0.
+ *
+ * Returns the new context, or NULL on failure after printing a message and
+ * releasing whatever had been acquired up to that point.
+ */
+static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
+        int tx_depth, int port, struct user_parameters *user_parm) {
+    struct pingpong_context *ctx;
+    struct ibv_device_attr device_attr;
+    /* Computed in size_t so a large -s value cannot overflow int. */
+    const size_t buf_len = (size_t)size * 2;
+
+    ctx = calloc(1, sizeof *ctx);
+    if (!ctx)
+        return NULL;
+
+    ctx->size = size;
+    ctx->tx_depth = tx_depth;
+
+    ctx->buf = memalign(page_size, buf_len);
+    if (!ctx->buf) {
+        fprintf(stderr, "Couldn't allocate work buf.\n");
+        goto err_free_ctx;
+    }
+
+    memset(ctx->buf, 0, buf_len);
+
+    /* The last byte of each half carries the ping-pong sequence counter. */
+    ctx->post_buf = (char*)ctx->buf + (size - 1);
+    ctx->poll_buf = (char*)ctx->buf + (2 * size - 1);
+
+    ctx->context = ibv_open_device(ib_dev);
+    if (!ctx->context) {
+        fprintf(stderr, "Couldn't get context for %s\n",
+                ibv_get_device_name(ib_dev));
+        goto err_free_buf;
+    }
+    if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
+        if (ibv_query_device(ctx->context, &device_attr)) {
+            fprintf(stderr, "Failed to query device props\n");
+            goto err_close_dev;
+        }
+        if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) {
+            user_parm->mtu = 1024;
+        } else {
+            user_parm->mtu = 2048;
+        }
+    }
+    ctx->pd = ibv_alloc_pd(ctx->context);
+    if (!ctx->pd) {
+        fprintf(stderr, "Couldn't allocate PD\n");
+        goto err_close_dev;
+    }
+
+    ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, buf_len,
+                         IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+    if (!ctx->mr) {
+        fprintf(stderr, "Couldn't allocate MR\n");
+        goto err_dealloc_pd;
+    }
+
+    ctx->cq = ibv_create_cq(ctx->context, tx_depth, NULL, NULL, 0);
+    if (!ctx->cq) {
+        fprintf(stderr, "Couldn't create CQ\n");
+        goto err_dereg_mr;
+    }
+
+    {
+        struct ibv_qp_init_attr attr;
+        memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
+        attr.send_cq = ctx->cq;
+        attr.recv_cq = ctx->cq;
+        attr.cap.max_send_wr = tx_depth;
+        /* Work around: driver doesnt support
+         * recv_wr = 0 */
+        attr.cap.max_recv_wr = 1;
+        attr.cap.max_send_sge = 1;
+        attr.cap.max_recv_sge = 1;
+        attr.cap.max_inline_data = user_parm->inline_size;
+
+        if (user_parm->connection_type==1) {
+            attr.qp_type = IBV_QPT_UC;
+        } else {
+            attr.qp_type = IBV_QPT_RC;
+        }
+        ctx->qp = ibv_create_qp(ctx->pd, &attr);
+        if (!ctx->qp) {
+            fprintf(stderr, "Couldn't create QP\n");
+            goto err_destroy_cq;
+        }
+    }
+
+    {
+        struct ibv_qp_attr attr = {
+            .qp_state = IBV_QPS_INIT,
+            .pkey_index = 0,
+            .port_num = port,
+            .qp_access_flags = IBV_ACCESS_REMOTE_WRITE
+        };
+
+        if (ibv_modify_qp(ctx->qp, &attr,
+                          IBV_QP_STATE |
+                          IBV_QP_PKEY_INDEX |
+                          IBV_QP_PORT |
+                          IBV_QP_ACCESS_FLAGS)) {
+            fprintf(stderr, "Failed to modify QP to INIT\n");
+            goto err_destroy_qp;
+        }
+    }
+
+    /* Template RDMA write request; run_iter() fills in the per-run fields. */
+    memset(&ctx->wr, 0, sizeof(ctx->wr));
+
+    ctx->wr.wr_id = PINGPONG_RDMA_WRID;
+    ctx->wr.sg_list = &ctx->list;
+    ctx->wr.num_sge = 1;
+    ctx->wr.opcode = IBV_WR_RDMA_WRITE;
+    ctx->wr.next = NULL;
+
+    return ctx;
+
+    /* Unwind in reverse order of acquisition. */
+err_destroy_qp:
+    ibv_destroy_qp(ctx->qp);
+err_destroy_cq:
+    ibv_destroy_cq(ctx->cq);
+err_dereg_mr:
+    ibv_dereg_mr(ctx->mr);
+err_dealloc_pd:
+    ibv_dealloc_pd(ctx->pd);
+err_close_dev:
+    ibv_close_device(ctx->context);
+err_free_buf:
+    free(ctx->buf);
+err_free_ctx:
+    free(ctx);
+    return NULL;
+}
+
+/*
+ * Connect the local queue pair to the peer described by dest by moving it
+ * from INIT to RTR and then to RTS.
+ *
+ * port:      local IB port number.
+ * my_psn:    initial send PSN of the local QP.
+ * dest:      peer parameters received over the TCP socket.
+ * user_parm: test parameters (mtu, connection type, GID index, QP timeout).
+ *
+ * Returns 0 on success, 1 on failure (after printing a message).
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
+        struct pingpong_dest *dest,struct user_parameters *user_parm)
+{
+    const int is_rc = (user_parm->connection_type == 0);
+    const char *qp_kind = is_rc ? "RC" : "UC";
+    int rtr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
+                   IBV_QP_DEST_QPN | IBV_QP_RQ_PSN;
+    int rts_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;
+    struct ibv_qp_attr attr;
+
+    memset(&attr, 0, sizeof(struct ibv_qp_attr));
+    attr.qp_state = IBV_QPS_RTR;
+    switch (user_parm->mtu) {
+    case 256 :
+        attr.path_mtu = IBV_MTU_256;
+        break;
+    case 512 :
+        attr.path_mtu = IBV_MTU_512;
+        break;
+    case 1024 :
+        attr.path_mtu = IBV_MTU_1024;
+        break;
+    case 2048 :
+        attr.path_mtu = IBV_MTU_2048;
+        break;
+    case 4096 :
+        attr.path_mtu = IBV_MTU_4096;
+        break;
+    default:
+        /* Previously fell through with path_mtu left at 0. */
+        fprintf(stderr, "Unsupported MTU %d (use 256, 512, 1024, 2048 or 4096)\n",
+                user_parm->mtu);
+        return 1;
+    }
+    printf("Mtu : %d\n", user_parm->mtu);
+    attr.dest_qp_num = dest->qpn;
+    attr.rq_psn = dest->psn;
+
+    if (user_parm->gid_index < 0) {
+        attr.ah_attr.is_global = 0;
+        attr.ah_attr.dlid = dest->lid;
+        attr.ah_attr.sl = sl;
+    } else {
+        attr.ah_attr.is_global = 1;
+        attr.ah_attr.grh.dgid = dest->dgid;
+        /* Send from the same GID table entry that was advertised to the peer. */
+        attr.ah_attr.grh.sgid_index = user_parm->gid_index;
+        attr.ah_attr.grh.hop_limit = 1;
+        attr.ah_attr.sl = 0;
+    }
+    attr.ah_attr.src_path_bits = 0;
+    attr.ah_attr.port_num = port;
+
+    /* RC carries extra responder attributes on the RTR transition. */
+    if (is_rc) {
+        attr.max_dest_rd_atomic = 1;
+        attr.min_rnr_timer = 12;
+        rtr_mask |= IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC;
+    }
+
+    if (ibv_modify_qp(ctx->qp, &attr, rtr_mask)) {
+        fprintf(stderr, "Failed to modify %s QP to RTR\n", qp_kind);
+        return 1;
+    }
+
+    attr.qp_state = IBV_QPS_RTS;
+    attr.sq_psn = my_psn;
+
+    /* RC carries retry/timeout attributes on the RTS transition. */
+    if (is_rc) {
+        attr.timeout = user_parm->qp_timeout;
+        attr.retry_cnt = 7;
+        attr.rnr_retry = 7;
+        attr.max_rd_atomic = 1;
+        rts_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
+                    IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC;
+    }
+
+    if (ibv_modify_qp(ctx->qp, &attr, rts_mask)) {
+        fprintf(stderr, "Failed to modify %s QP to RTS\n", qp_kind);
+        return 1;
+    }
+    return 0;
+}
+
+/* Print the 16 raw bytes of a GID as colon-separated hex. */
+static void pp_print_gid(const union ibv_gid *gid)
+{
+    printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+           gid->raw[0], gid->raw[1],
+           gid->raw[2], gid->raw[3], gid->raw[4],
+           gid->raw[5], gid->raw[6], gid->raw[7],
+           gid->raw[8], gid->raw[9], gid->raw[10],
+           gid->raw[11], gid->raw[12], gid->raw[13],
+           gid->raw[14], gid->raw[15]);
+}
+
+/*
+ * Establish the connection with the peer: exchange connection parameters
+ * over a TCP socket, connect the QP, and run a second handshake once the QP
+ * is ready.
+ *
+ * servername: host to connect to, or NULL to act as the server.
+ * ib_port:    local IB port number.
+ * port:       TCP port used for the exchange.
+ * rem_dest:   receives the peer's parameters.
+ *
+ * Returns 0 on success, non-zero on failure. The TCP socket is closed on
+ * every path before returning.
+ */
+static int pp_open_port(struct pingpong_context *ctx, const char * servername,
+        int ib_port, int port, struct pingpong_dest *rem_dest,struct user_parameters *user_parm)
+{
+    char addr_fmt[] = "%8s address: LID %#04x QPN %#06x PSN %#06x RKey %#08x VAddr %#016Lx\n";
+    const int use_gid = (user_parm->gid_index > -1);
+    struct pingpong_dest my_dest;
+    int sockfd;
+    int rc;
+    union ibv_gid gid;
+
+    /* Create connection between client and server.
+     * We do it by exchanging data over a TCP socket connection. */
+
+    /* Keep the GID well defined even when no GID index is in use. */
+    memset(&gid, 0, sizeof gid);
+
+    if (use_gid) {
+        if (ibv_query_gid(ctx->context, ib_port, user_parm->gid_index, &gid))
+            return -1;
+        ctx->dgid = gid;
+    }
+
+    my_dest.lid = pp_get_local_lid(ctx, ib_port);
+    my_dest.dgid = gid;
+    my_dest.qpn = ctx->qp->qp_num;
+    my_dest.psn = lrand48() & 0xffffff;
+    /* We do not fail test upon lid in RDMAoE/Eth conf */
+    if (!use_gid && !my_dest.lid) {
+        fprintf(stderr, "Local lid 0x0 detected. Is an SM running? If you are running on an RMDAoE interface you must use GIDs\n");
+        return 1;
+    }
+    my_dest.rkey = ctx->mr->rkey;
+    /* The peer writes into the second half of our buffer. */
+    my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+
+    printf(addr_fmt, "local", my_dest.lid, my_dest.qpn, my_dest.psn,
+           my_dest.rkey, my_dest.vaddr);
+    if (use_gid)
+        pp_print_gid(&my_dest.dgid);
+
+    sockfd = servername ? pp_client_connect(servername, port) :
+                          pp_server_connect(port);
+
+    if (sockfd < 0) {
+        /* servername is NULL on the server side; never pass NULL to %s. */
+        printf("pp_connect_sock(%s,%d) failed (%d)!\n",
+               servername ? servername : "(server)", port, sockfd);
+        return sockfd;
+    }
+
+    rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) :
+                      pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm);
+    if (rc)
+        goto out_close;
+
+    printf(addr_fmt, "remote", rem_dest->lid, rem_dest->qpn, rem_dest->psn,
+           rem_dest->rkey, rem_dest->vaddr);
+    if (use_gid)
+        pp_print_gid(&rem_dest->dgid);
+
+    if ((rc = pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest,user_parm)))
+        goto out_close;
+
+    /* An additional handshake is required *after* moving qp to RTR.
+     * Arbitrarily reuse exch_dest for this purpose.
+     */
+
+    rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) :
+                      pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm);
+
+    if (rc)
+        goto out_close;
+
+    if (write(sockfd, "done", sizeof "done") != sizeof "done"){
+        perror("write");
+        fprintf(stderr, "Couldn't write to socket\n");
+        rc = 1;
+    }
+
+out_close:
+    close(sockfd);
+    return rc;
+}
+
+/*
+ * Print the command-line help to stdout.
+ * argv0: program name shown in the usage lines.
+ */
+static void usage(const char *argv0)
+{
+    /* Option descriptions; none of them contain format specifiers. */
+    static const char *const option_help[] = {
+        " -p, --port=<port> listen on/connect to port <port> (default 18515)\n",
+        " -c, --connection=<RC/UC> connection type RC/UC (default RC)\n",
+        " -m, --mtu=<mtu> mtu size (256 - 4096. default for hermon is 2048)\n",
+        " -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n",
+        " -i, --ib-port=<port> use port <port> of IB device (default 1)\n",
+        /* main() initialises the size to 2, not 1. */
+        " -s, --size=<size> size of message to exchange (default 2)\n",
+        " -a, --all Run sizes from 2 till 2^23\n",
+        " -t, --tx-depth=<dep> size of tx queue (default 50)\n",
+        " -n, --iters=<iters> number of exchanges (at least 2, default 1000)\n",
+        " -I, --inline_size=<size> max size of message to be sent in inline mode (default 400)\n",
+        " -u, --qp-timeout=<timeout> QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n",
+        " -S, --sl=<sl> SL (default 0)\n",
+        " -x, --gid-index=<index> test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n",
+        " -C, --report-cycles report times in cpu cycle units (default microseconds)\n",
+        " -H, --report-histogram print out all results (default print summary only)\n",
+        " -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n",
+        " -V, --version display version number\n",
+        " -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n",
+    };
+    size_t i;
+
+    printf("Usage:\n");
+    printf(" %s start a server and wait for connection\n", argv0);
+    printf(" %s <host> connect to server at <host>\n", argv0);
+    printf("\n");
+    printf("Options:\n");
+    for (i = 0; i < sizeof option_help / sizeof option_help[0]; ++i)
+        fputs(option_help[i], stdout);
+}
+
+/*
+ * Median of n samples that are already sorted in ascending order.
+ *
+ * With an odd number of samples the median is the middle element; with an
+ * even number it is the mean of the two middle elements.
+ */
+static inline cycles_t get_median(int n, cycles_t delta[])
+{
+    const int mid = n / 2;
+
+    if (n % 2 == 0)
+        return (delta[mid] + delta[mid - 1]) / 2;
+
+    return delta[mid];
+}
+
+/*
+ * qsort() comparator ordering cycles_t values in ascending order.
+ * Returns -1, 0 or 1.
+ */
+static int cycles_compare(const void * aptr, const void * bptr)
+{
+    const cycles_t lhs = *(const cycles_t *)aptr;
+    const cycles_t rhs = *(const cycles_t *)bptr;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Print the latency results for one message size.
+ *
+ * options:          which listings to print and in which unit.
+ * iters:            number of timestamps in tstamp (at least 2 are needed).
+ * tstamp:           cycle counter sampled before each post.
+ * size:             message size in bytes, shown in the summary line.
+ * no_cpu_freq_fail: passed through to get_cpu_mhz().
+ *
+ * Each reported value is the difference between consecutive timestamps,
+ * halved and converted to the selected unit. The summary line shows the
+ * minimum, maximum and median of those values.
+ */
+static void print_report(struct report_options * options,
+        unsigned int iters, cycles_t *tstamp, int size, int no_cpu_freq_fail)
+{
+    double cycles_to_units;
+    cycles_t median;
+    unsigned int i;
+    unsigned int samples;
+    const char* units;
+    cycles_t *delta;
+
+    /* iters - 1 and iters - 2 below would wrap around for fewer samples. */
+    if (iters < 2) {
+        fprintf(stderr, "Need at least 2 iterations to report latency\n");
+        return;
+    }
+    samples = iters - 1;
+
+    delta = malloc(samples * sizeof *delta);
+    if (!delta) {
+        perror("malloc");
+        return;
+    }
+
+    for (i = 0; i < samples; ++i)
+        delta[i] = tstamp[i + 1] - tstamp[i];
+
+    if (options->cycles) {
+        cycles_to_units = 1;
+        units = "cycles";
+    } else {
+        cycles_to_units = get_cpu_mhz(no_cpu_freq_fail);
+        units = "usec";
+    }
+
+    if (options->unsorted) {
+        printf("#, %s\n", units);
+        for (i = 0; i < samples; ++i)
+            printf("%u, %g\n", i + 1, delta[i] / cycles_to_units / 2);
+    }
+
+    qsort(delta, samples, sizeof *delta, cycles_compare);
+
+    if (options->histogram) {
+        printf("#, %s\n", units);
+        for (i = 0; i < samples; ++i)
+            printf("%u, %g\n", i + 1, delta[i] / cycles_to_units / 2);
+    }
+
+    median = get_median(samples, delta);
+    printf("%7d %u %7.2f %7.2f %7.2f\n",
+           size,iters,delta[0] / cycles_to_units / 2,
+           delta[samples - 1] / cycles_to_units / 2,median / cycles_to_units / 2);
+
+    free(delta);
+}
+/*
+ * Run the ping-pong for one message size.
+ *
+ * Each pass of the loop waits for the peer's write to land in the poll
+ * byte, records a timestamp in tstamp[] and posts an RDMA write of `size`
+ * bytes to the peer, then waits for that write's completion. The side that
+ * has a servername (the client) posts first without waiting.
+ *
+ * Returns 0 on success, 11 if posting fails, 12 if polling the CQ fails and
+ * 13 if a completion reports an error status.
+ */
+int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param,
+ struct pingpong_dest *rem_dest, int size)
+{
+ struct ibv_qp *qp;
+ struct ibv_send_wr *wr;
+ volatile char *poll_buf;
+ volatile char *post_buf;
+
+ int scnt, ccnt, rcnt;
+ int iters;
+ int tx_depth;
+ int inline_size;
+
+ iters = user_param->iters;
+ /* NOTE(review): tx_depth is assigned here but not used below. */
+ tx_depth = user_param->tx_depth;
+ inline_size = user_param->inline_size;
+
+ /* Point the reusable work request at our buffer and the peer's region. */
+ wr = &ctx->wr;
+ ctx->list.addr = (uintptr_t) ctx->buf ;
+ ctx->list.length = size;
+ ctx->list.lkey = ctx->mr->lkey;
+ wr->wr.rdma.remote_addr = rem_dest->vaddr;
+ wr->wr.rdma.rkey = rem_dest->rkey;
+
+ if (size > inline_size) {/* compliance with perf_main */
+ ctx->wr.send_flags = IBV_SEND_SIGNALED;
+ } else {
+ ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+ }
+ /* scnt: writes posted, rcnt: peer writes seen, ccnt: completions reaped. */
+ scnt = 0;
+ rcnt = 0;
+ ccnt = 0;
+
+ /*
+ * The sequence byte is the last byte of the message. When sweeping all
+ * sizes the buffer halves are 8388608 bytes each (the size main() sets
+ * for that mode), so the flag positions are recomputed for this size.
+ */
+ if(user_param->all == ALL) {
+ post_buf = (char*)ctx->buf + size - 1;
+ poll_buf = (char*)ctx->buf + 8388608 + size - 1;
+ } else {
+ poll_buf = ctx->poll_buf;
+ post_buf = ctx->post_buf;
+ }
+ qp = ctx->qp;
+
+ /* Done with setup. Start the test. */
+ while (scnt < iters || ccnt < iters || rcnt < iters) {
+
+ /* Wait till buffer changes. */
+ /* The client skips this wait before its very first post. */
+ if (rcnt < user_param->iters && !(scnt < 1 && user_param->servername)) {
+ ++rcnt;
+ /* Only the low byte of the counter is carried in the message. */
+ while (*poll_buf != (char)rcnt)
+ ;
+ /* Here the data is already in the physical memory.
+ If we wanted to actually use it, we may need
+ a read memory barrier here. */
+ }
+
+ if (scnt < user_param->iters) {
+ struct ibv_send_wr *bad_wr;
+ /* Timestamp is taken immediately before the post. */
+ tstamp[scnt] = get_cycles();
+
+ *post_buf = (char)++scnt;
+
+ if (ibv_post_send(qp, wr, &bad_wr)) {
+ fprintf(stderr, "Couldn't post send: scnt=%d\n",
+ scnt);
+ return 11;
+ }
+ }
+
+ if (ccnt < user_param->iters) {
+ struct ibv_wc wc;
+ int ne;
+ ++ccnt;
+ /* Busy-poll until one completion is available or polling fails. */
+ do {
+ ne = ibv_poll_cq(ctx->cq, 1, &wc);
+ } while (ne == 0);
+ if (ne < 0) {
+ fprintf(stderr, "poll CQ failed %d\n", ne);
+ return 12;
+ }
+ if (wc.status != IBV_WC_SUCCESS) {
+ fprintf(stderr, "Completion wth error at %s:\n",
+ user_param->servername ? "client" : "server");
+ fprintf(stderr, "Failed status %d: wr_id %d\n",
+ wc.status, (int) wc.wr_id);
+ fprintf(stderr, "scnt=%d, rcnt=%d, ccnt=%d\n",
+ scnt, rcnt, ccnt);
+ return 13;
+ }
+ }
+ }
+ return(0);
+}
+/*
+ * Entry point of the RDMA write latency test.
+ *
+ * Parses the command line, sets up the verbs resources, connects to the
+ * peer (as client when a host argument is given, as server otherwise), runs
+ * the ping-pong for one size or for every power of two from 2 to 2^23, and
+ * prints a report per size.
+ *
+ * Returns 0 on success; each failure point returns its own non-zero code.
+ */
+int main(int argc, char *argv[])
+{
+ const char *ib_devname = NULL;
+ int port = 18515;
+ int ib_port = 1;
+ int size = 2;
+ int i = 0;
+ struct report_options report = {};
+
+ struct pingpong_context *ctx;
+ struct pingpong_dest rem_dest;
+ struct ibv_device *ib_dev;
+
+ struct user_parameters user_param;
+ int no_cpu_freq_fail = 0;
+
+ /* init default values to user's parameters */
+ memset(&user_param, 0, sizeof(struct user_parameters));
+ user_param.mtu = 0; /* signal choose default by device */
+ user_param.iters = 1000;
+ user_param.tx_depth = 50;
+ user_param.servername = NULL;
+ user_param.inline_size = MAX_INLINE;
+ user_param.qp_timeout = 14;
+ user_param.gid_index = -1; /*gid will not be used*/
+ /* Parameter parsing. */
+ while (1) {
+ int c;
+
+ static struct option long_options[] = {
+ { .name = "port", .has_arg = 1, .val = 'p' },
+ { .name = "connection", .has_arg = 1, .val = 'c' },
+ { .name = "mtu", .has_arg = 1, .val = 'm' },
+ { .name = "ib-dev", .has_arg = 1, .val = 'd' },
+ { .name = "ib-port", .has_arg = 1, .val = 'i' },
+ { .name = "size", .has_arg = 1, .val = 's' },
+ { .name = "iters", .has_arg = 1, .val = 'n' },
+ { .name = "tx-depth", .has_arg = 1, .val = 't' },
+ { .name = "inline_size", .has_arg = 1, .val = 'I' },
+ { .name = "qp-timeout", .has_arg = 1, .val = 'u' },
+ { .name = "sl", .has_arg = 1, .val = 'S' },
+ { .name = "gid-index", .has_arg = 1, .val = 'x' },
+ { .name = "all", .has_arg = 0, .val = 'a' },
+ { .name = "report-cycles", .has_arg = 0, .val = 'C' },
+ { .name = "report-histogram",.has_arg = 0, .val = 'H' },
+ { .name = "report-unsorted",.has_arg = 0, .val = 'U' },
+ { .name = "version", .has_arg = 0, .val = 'V' },
+ { .name = "CPU-freq", .has_arg = 0, .val = 'F' },
+ { 0 }
+ };
+
+ c = getopt_long(argc, argv, "p:c:m:d:i:s:n:t:I:u:S:x:aCHUVF", long_options, NULL);///cpufreq
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'p':
+ port = strtol(optarg, NULL, 0);
+ if (port < 0 || port > 65535) {
+ usage(argv[0]);
+ return 1;
+ }
+ break;
+ case 'c':
+ if (strcmp("UC",optarg)==0)
+ user_param.connection_type=1;
+ /* default is 0 for any other option RC*/
+ break;
+
+ case 'm':
+ user_param.mtu = strtol(optarg, NULL, 0);
+ break;
+ case 'a':
+ user_param.all = ALL;
+ break;
+ case 'V':
+ printf("perftest version : %.2f\n",VERSION);
+ return 0;
+ break;
+ case 'd':
+ ib_devname = strdupa(optarg);
+ break;
+
+ case 'i':
+ ib_port = strtol(optarg, NULL, 0);
+ if (ib_port < 0) {
+ usage(argv[0]);
+ return 2;
+ }
+ break;
+
+ case 's':
+ size = strtol(optarg, NULL, 0);
+ if (size < 1) {
+ usage(argv[0]); return 3;
+ }
+ break;
+
+ case 't':
+ user_param.tx_depth = strtol(optarg, NULL, 0);
+ if (user_param.tx_depth < 1) {
+ usage(argv[0]); return 4;
+ }
+ break;
+
+ case 'I':
+ user_param.inline_size = strtol(optarg, NULL, 0);
+ /* NOTE(review): only the upper bound is checked here. */
+ if (user_param.inline_size > MAX_INLINE) {
+ usage(argv[0]); return 7;
+ }
+ break;
+
+ case 'n':
+ user_param.iters = strtol(optarg, NULL, 0);
+ if (user_param.iters < 2) {
+ usage(argv[0]);
+ return 5;
+ }
+
+ break;
+
+ case 'C':
+ report.cycles = 1;
+ break;
+
+ case 'H':
+ report.histogram = 1;
+ break;
+
+ case 'U':
+ report.unsorted = 1;
+ break;
+
+ case 'F':
+ no_cpu_freq_fail = 1;
+ break;
+
+ case 'u':
+ user_param.qp_timeout = strtol(optarg, NULL, 0);
+ break;
+ case 'S':
+ sl = strtol(optarg, NULL, 0);
+ /* NOTE(review): only the upper bound is checked here. */
+ if (sl > 15) { usage(argv[0]); return 6; }
+ break;
+
+ case 'x':
+ user_param.gid_index = strtol(optarg, NULL, 0);
+ if (user_param.gid_index > 63) {
+ usage(argv[0]);
+ return 1;
+ }
+ break;
+
+ default:
+ usage(argv[0]);
+ return 7;
+ }
+ }
+
+ /* A single remaining argument is the server to connect to. */
+ if (optind == argc - 1)
+ user_param.servername = strdupa(argv[optind]);
+ else if (optind < argc) {
+ usage(argv[0]);
+ return 6;
+ }
+
+ /*
+ * Done with parameter parsing. Perform setup.
+ */
+
+ /* One timestamp slot per iteration; reused for every message size. */
+ tstamp = malloc(user_param.iters * sizeof *tstamp);
+ if (!tstamp) {
+ perror("malloc");
+ return 10;
+ }
+ printf("------------------------------------------------------------------\n");
+ printf(" RDMA_Write Latency Test\n");
+ printf("Inline data is used up to %d bytes message\n", user_param.inline_size);
+ if (user_param.connection_type==0) {
+ printf("Connection type : RC\n");
+ } else {
+ printf("Connection type : UC\n");
+ }
+ if (user_param.gid_index > -1) {
+ printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n");
+ }
+ if (user_param.all == ALL) {
+ /*since we run all sizes */
+ size = 8388608; /*2^23 */
+ }
+ /* Seeds lrand48(), which pp_open_port() uses to pick the initial PSN. */
+ srand48(getpid() * time(NULL));
+ page_size = sysconf(_SC_PAGESIZE);
+
+ ib_dev = pp_find_dev(ib_devname);
+ if (!ib_dev)
+ return 7;
+
+ ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port,&user_param);
+ if (!ctx)
+ return 8;
+
+ if (pp_open_port(ctx, user_param.servername, ib_port, port, &rem_dest,&user_param))
+ return 9;
+ printf("------------------------------------------------------------------\n");
+ printf(" #bytes #iterations t_min[usec] t_max[usec] t_typical[usec]\n");
+
+ if (user_param.all == ALL) {
+ /* Sweep message sizes 2^1 .. 2^23. */
+ for (i = 1; i < 24 ; ++i) {
+ size = 1 << i;
+ if(run_iter(ctx, &user_param, &rem_dest, size))
+ return 17;
+ print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail);
+ }
+ } else {
+ if(run_iter(ctx, &user_param, &rem_dest, size))
+ return 18;
+ print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail);
+ }
+
+ printf("------------------------------------------------------------------\n");
+ free(tstamp);
+ return 0;
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ofed/perftest.git
More information about the Pkg-ofed-commits
mailing list