[Pkg-ofed-commits] [rds-tools] 01/02: Imported Upstream version 1.4.1-OFED-1.4.2
Ana Beatriz Guerrero López
ana at moszumanska.debian.org
Wed Jul 2 14:34:58 UTC 2014
This is an automated email from the git hooks/post-receive script.
ana pushed a commit to branch master
in repository rds-tools.
commit bff58facfa627c9fa256488b770fdb905f008e0c
Author: Ana Guerrero López <ana at ekaia.org>
Date: Wed Jul 2 16:34:33 2014 +0200
Imported Upstream version 1.4.1-OFED-1.4.2
---
Makefile | 106 ++
Makefile.in | 106 ++
README | 9 +
configure | 2126 +++++++++++++++++++++++++++++++++++
configure.in | 10 +
docs/rds-architecture.txt | 356 ++++++
examples/Makefile | 6 +
examples/README | 6 +
examples/rds-sample.c | 347 ++++++
kernel-list.h | 194 ++++
net/ib_rds.h | 265 +++++
net/rds.h | 50 +
options.c | 481 ++++++++
pfhack.c | 124 +++
pfhack.h | 60 +
rds-gen.1 | 89 ++
rds-gen.c | 322 ++++++
rds-info.1 | 162 +++
rds-info.c | 363 ++++++
rds-ping.1 | 69 ++
rds-ping.c | 385 +++++++
rds-rdma.7 | 427 +++++++
rds-sink.1 | 1 +
rds-sink.c | 250 +++++
rds-stress.1 | 174 +++
rds-stress.c | 2715 +++++++++++++++++++++++++++++++++++++++++++++
rds-tools.spec | 38 +
rds-tools.spec.in | 38 +
rds-tools.txt | 39 +
rds.7 | 445 ++++++++
rdstool.h | 112 ++
stap/README | 15 +
stap/rds.stp | 35 +
stats.c | 227 ++++
34 files changed, 10152 insertions(+)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f52710e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,106 @@
+prefix = /usr
+exec_prefix = ${prefix}
+bindir = $(DESTDIR)${exec_prefix}/bin
+mandir = $(DESTDIR)${prefix}/share/man
+incdir = $(DESTDIR)${prefix}/include
+
+all: all-programs
+
+CFLAGS = -O2 -Wall
+CPPFLAGS = -DDEBUG_EXE -MD -MP -MF $(@D)/.$(basename $(@F)).d
+
+HEADERS = kernel-list.h rdstool.h pfhack.h net/rds.h net/ib_rds.h
+COMMON_SOURCES = options.c stats.c pfhack.c
+SOURCES = $(addsuffix .c,$(PROGRAMS)) $(COMMON_SOURCES)
+CLEAN_OBJECTS = $(addsuffix .o,$(PROGRAMS)) $(subst .c,.o,$(COMMON_SOURCES))
+
+# Default: a non-empty DYNAMIC_PF_RDS adds -DDYNAMIC_PF_RDS and links pfhack.o; an empty value drops both.
+DYNAMIC_PF_RDS = true
+
+ifneq ($(DYNAMIC_PF_RDS),)
+CPPFLAGS += -DDYNAMIC_PF_RDS
+COMMON_OBJECTS = $(subst .c,.o,$(COMMON_SOURCES))
+else
+COMMON_OBJECTS = $(subst .c,.o,$(filter-out pfhack.c,$(COMMON_SOURCES)))
+endif
+
+PROGRAMS = rds-gen rds-sink rds-info rds-stress rds-ping
+
+all-programs: $(PROGRAMS)
+
+install: $(PROGRAMS)
+ install -d $(bindir)
+ install -m 555 -s $(PROGRAMS) $(bindir)
+ install -d $(mandir)/man1
+ install -d $(mandir)/man7
+ install -m 644 *.1 $(mandir)/man1
+ install -m 644 *.7 $(mandir)/man7
+ install -d $(incdir)/net
+ install -m 444 net/rds.h $(incdir)/net
+
+clean:
+ rm -f $(PROGRAMS) $(CLEAN_OBJECTS)
+
+distclean: clean
+ rm -f .*.d
+
+
+
+$(PROGRAMS) : % : %.o $(COMMON_OBJECTS)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^
+
+LOCAL_DFILES := $(wildcard .*.d)
+ifneq ($(LOCAL_DFILES),)
+.PHONY: $(LOCAL_DFILES)
+-include $(LOCAL_DFILES)
+endif
+
+VERSION := 1.4
+RELEASE := 1
+
+TAR_PREFIX := rds-tools-$(VERSION)-$(RELEASE)
+TAR_FILE := $(TAR_PREFIX).tar.gz
+
+EXTRA_DIST := rds-info.1 \
+ rds-gen.1 \
+ rds-sink.1 \
+ rds-stress.1 \
+ rds-ping.1 \
+ rds.7 \
+ rds-rdma.7 \
+ Makefile.in \
+ rds-tools.spec.in \
+ configure.in \
+ configure \
+ README \
+ rds-tools.txt \
+ stap/rds.stp \
+ stap/README \
+ docs/rds-architecture.txt \
+ examples/Makefile \
+ examples/rds-sample.c \
+ examples/README
+
+DISTFILES := $(SOURCES) $(HEADERS) $(EXTRA_DIST)
+
+$(TAR_FILE): Makefile rds-tools.spec
+ @rm -rf $@ $(TAR_PREFIX) || :
+ @mkdir $(TAR_PREFIX)
+ for a in $^ $(DISTFILES); do \
+ if [ ! -f $$a ]; then \
+ continue; \
+ fi; \
+ targ=$(TAR_PREFIX)/$$(dirname $$a); \
+ mkdir -p $$targ; \
+ cp $$a $$targ; \
+ done
+
+ tar -zcf $@ $(TAR_PREFIX)
+
+.PHONY: rpm
+rpm: $(TAR_FILE)
+ rpmbuild -ta $^
+
+.PHONY: dist
+dist: $(TAR_FILE)
+
diff --git a/Makefile.in b/Makefile.in
new file mode 100644
index 0000000..088ee69
--- /dev/null
+++ b/Makefile.in
@@ -0,0 +1,106 @@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+bindir = $(DESTDIR)@bindir@
+mandir = $(DESTDIR)@mandir@
+incdir = $(DESTDIR)@includedir@
+
+all: all-programs
+
+CFLAGS = -O2 -Wall
+CPPFLAGS = -DDEBUG_EXE -MD -MP -MF $(@D)/.$(basename $(@F)).d
+
+HEADERS = kernel-list.h rdstool.h pfhack.h net/rds.h net/ib_rds.h
+COMMON_SOURCES = options.c stats.c pfhack.c
+SOURCES = $(addsuffix .c,$(PROGRAMS)) $(COMMON_SOURCES)
+CLEAN_OBJECTS = $(addsuffix .o,$(PROGRAMS)) $(subst .c,.o,$(COMMON_SOURCES))
+
+# Default: a non-empty DYNAMIC_PF_RDS adds -DDYNAMIC_PF_RDS and links pfhack.o; an empty value drops both.
+DYNAMIC_PF_RDS = true
+
+ifneq ($(DYNAMIC_PF_RDS),)
+CPPFLAGS += -DDYNAMIC_PF_RDS
+COMMON_OBJECTS = $(subst .c,.o,$(COMMON_SOURCES))
+else
+COMMON_OBJECTS = $(subst .c,.o,$(filter-out pfhack.c,$(COMMON_SOURCES)))
+endif
+
+PROGRAMS = rds-gen rds-sink rds-info rds-stress rds-ping
+
+all-programs: $(PROGRAMS)
+
+install: $(PROGRAMS)
+ install -d $(bindir)
+ install -m 555 -s $(PROGRAMS) $(bindir)
+ install -d $(mandir)/man1
+ install -d $(mandir)/man7
+ install -m 644 *.1 $(mandir)/man1
+ install -m 644 *.7 $(mandir)/man7
+ install -d $(incdir)/net
+ install -m 444 net/rds.h $(incdir)/net
+
+clean:
+ rm -f $(PROGRAMS) $(CLEAN_OBJECTS)
+
+distclean: clean
+ rm -f .*.d
+
+
+
+$(PROGRAMS) : % : %.o $(COMMON_OBJECTS)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^
+
+LOCAL_DFILES := $(wildcard .*.d)
+ifneq ($(LOCAL_DFILES),)
+.PHONY: $(LOCAL_DFILES)
+-include $(LOCAL_DFILES)
+endif
+
+VERSION := @VERSION@
+RELEASE := @RELEASE@
+
+TAR_PREFIX := rds-tools-$(VERSION)-$(RELEASE)
+TAR_FILE := $(TAR_PREFIX).tar.gz
+
+EXTRA_DIST := rds-info.1 \
+ rds-gen.1 \
+ rds-sink.1 \
+ rds-stress.1 \
+ rds-ping.1 \
+ rds.7 \
+ rds-rdma.7 \
+ Makefile.in \
+ rds-tools.spec.in \
+ configure.in \
+ configure \
+ README \
+ rds-tools.txt \
+ stap/rds.stp \
+ stap/README \
+ docs/rds-architecture.txt \
+ examples/Makefile \
+ examples/rds-sample.c \
+ examples/README
+
+DISTFILES := $(SOURCES) $(HEADERS) $(EXTRA_DIST)
+
+$(TAR_FILE): Makefile rds-tools.spec
+ @rm -rf $@ $(TAR_PREFIX) || :
+ @mkdir $(TAR_PREFIX)
+ for a in $^ $(DISTFILES); do \
+ if [ ! -f $$a ]; then \
+ continue; \
+ fi; \
+ targ=$(TAR_PREFIX)/$$(dirname $$a); \
+ mkdir -p $$targ; \
+ cp $$a $$targ; \
+ done
+
+ tar -zcf $@ $(TAR_PREFIX)
+
+.PHONY: rpm
+rpm: $(TAR_FILE)
+ rpmbuild -ta $^
+
+.PHONY: dist
+dist: $(TAR_FILE)
+
diff --git a/README b/README
new file mode 100644
index 0000000..0c6a8d6
--- /dev/null
+++ b/README
@@ -0,0 +1,9 @@
+
+== Short build instructions ==
+
+ autoconf
+ ./configure
+ make rpm
+
+This should produce an rds-tools RPM that is versioned by the VERSION in
+the Makefile and the Subversion revision that was checked out.
diff --git a/configure b/configure
new file mode 100755
index 0000000..67b6316
--- /dev/null
+++ b/configure
@@ -0,0 +1,2126 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.59.
+#
+# Copyright (C) 2003 Free Software Foundation, Inc.
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+## --------------------- ##
+## M4sh Initialization. ##
+## --------------------- ##
+
+# Be Bourne compatible
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+elif test -n "${BASH_VERSION+set}" && (set -o posix) >/dev/null 2>&1; then
+ set -o posix
+fi
+DUALCASE=1; export DUALCASE # for MKS sh
+
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+ as_unset=unset
+else
+ as_unset=false
+fi
+
+
+# Work around bugs in pre-3.0 UWIN ksh.
+$as_unset ENV MAIL MAILPATH
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+for as_var in \
+ LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \
+ LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \
+ LC_TELEPHONE LC_TIME
+do
+ if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then
+ eval $as_var=C; export $as_var
+ else
+ $as_unset $as_var
+ fi
+done
+
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename /) >/dev/null 2>&1 && test "X`basename / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)$' \| \
+ . : '\(.\)' 2>/dev/null ||
+echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/; q; }
+ /^X\/\(\/\/\)$/{ s//\1/; q; }
+ /^X\/\(\/\).*/{ s//\1/; q; }
+ s/.*/./; q'`
+
+
+# PATH needs CR, and LINENO needs CR and PATH.
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ echo "#! /bin/sh" >conf$$.sh
+ echo "exit 0" >>conf$$.sh
+ chmod +x conf$$.sh
+ if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then
+ PATH_SEPARATOR=';'
+ else
+ PATH_SEPARATOR=:
+ fi
+ rm -f conf$$.sh
+fi
+
+
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null`
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x$as_lineno_3" = "x$as_lineno_2" || {
+ # Find who we are. Look in the path if we contain no path at all
+ # relative or not.
+ case $0 in
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
+
+ ;;
+ esac
+ # We did not find ourselves, most probably we were run as `sh COMMAND'
+ # in which case we are not to be found in the path.
+ if test "x$as_myself" = x; then
+ as_myself=$0
+ fi
+ if test ! -f "$as_myself"; then
+ { echo "$as_me: error: cannot find myself; rerun with an absolute path" >&2
+ { (exit 1); exit 1; }; }
+ fi
+ case $CONFIG_SHELL in
+ '')
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for as_base in sh bash ksh sh5; do
+ case $as_dir in
+ /*)
+ if ("$as_dir/$as_base" -c '
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null`
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x$as_lineno_3" = "x$as_lineno_2" ') 2>/dev/null; then
+ $as_unset BASH_ENV || test "${BASH_ENV+set}" != set || { BASH_ENV=; export BASH_ENV; }
+ $as_unset ENV || test "${ENV+set}" != set || { ENV=; export ENV; }
+ CONFIG_SHELL=$as_dir/$as_base
+ export CONFIG_SHELL
+ exec "$CONFIG_SHELL" "$0" ${1+"$@"}
+ fi;;
+ esac
+ done
+done
+;;
+ esac
+
+ # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+ # uniformly replaced by the line number. The first 'sed' inserts a
+ # line-number line before each line; the second 'sed' does the real
+ # work. The second script uses 'N' to pair each line-number line
+ # with the numbered line, and appends trailing '-' during
+ # substitution so that $LINENO is not a special case at line end.
+ # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+ # second 'sed' script. Blame Lee E. McMahon for sed's syntax. :-)
+ sed '=' <$as_myself |
+ sed '
+ N
+ s,$,-,
+ : loop
+ s,^\(['$as_cr_digits']*\)\(.*\)[$]LINENO\([^'$as_cr_alnum'_]\),\1\2\1\3,
+ t loop
+ s,-$,,
+ s,^['$as_cr_digits']*\n,,
+ ' >$as_me.lineno &&
+ chmod +x $as_me.lineno ||
+ { echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+ { (exit 1); exit 1; }; }
+
+ # Don't try to exec as it changes $0, causing all sorts of problems
+ # (the dirname of $0 is not the place where we might find the
+ # original, and so on). Autoconf is especially sensitive to this.
+ . ./$as_me.lineno
+ # Exit status is that of the last command.
+ exit
+}
+
+
+case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in
+ *c*,-n*) ECHO_N= ECHO_C='
+' ECHO_T=' ' ;;
+ *c*,* ) ECHO_N=-n ECHO_C= ECHO_T= ;;
+ *) ECHO_N= ECHO_C='\c' ECHO_T= ;;
+esac
+
+if expr a : '\(a\)' >/dev/null 2>&1; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+rm -f conf$$ conf$$.exe conf$$.file
+echo >conf$$.file
+if ln -s conf$$.file conf$$ 2>/dev/null; then
+ # We could just check for DJGPP; but this test a) works b) is more generic
+ # and c) will remain valid once DJGPP supports symlinks (DJGPP 2.04).
+ if test -f conf$$.exe; then
+ # Don't use ln at all; we don't have any links
+ as_ln_s='cp -p'
+ else
+ as_ln_s='ln -s'
+ fi
+elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+else
+ as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.file
+
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p=:
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+as_executable_p="test -f"
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+# IFS
+# We need space, tab and new line, in precisely that order.
+as_nl='
+'
+IFS=" $as_nl"
+
+# CDPATH.
+$as_unset CDPATH
+
+
+# Name of the host.
+# hostname on some systems (SVR3.2, Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+exec 6>&1
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_config_libobj_dir=.
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+SHELL=${CONFIG_SHELL-/bin/sh}
+
+# Maximum number of lines to put in a shell here document.
+# This variable seems obsolete. It should probably be removed, and
+# only ac_max_sed_lines should be used.
+: ${ac_max_here_lines=38}
+
+# Identity of this package.
+PACKAGE_NAME=
+PACKAGE_TARNAME=
+PACKAGE_VERSION=
+PACKAGE_STRING=
+PACKAGE_BUGREPORT=
+
+ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS VERSION RELEASE LIBOBJS LTLIBOBJS'
+ac_subst_files=''
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datadir='${prefix}/share'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+libdir='${exec_prefix}/lib'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+infodir='${prefix}/info'
+mandir='${prefix}/man'
+
+ac_prev=
+for ac_option
+do
+ # If the previous option needs an argument, assign it.
+ if test -n "$ac_prev"; then
+ eval "$ac_prev=\$ac_option"
+ ac_prev=
+ continue
+ fi
+
+ ac_optarg=`expr "x$ac_option" : 'x[^=]*=\(.*\)'`
+
+ # Accept the important Cygnus configure options, so we can diagnose typos.
+
+ case $ac_option in
+
+ -bindir | --bindir | --bindi | --bind | --bin | --bi)
+ ac_prev=bindir ;;
+ -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+ bindir=$ac_optarg ;;
+
+ -build | --build | --buil | --bui | --bu)
+ ac_prev=build_alias ;;
+ -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+ build_alias=$ac_optarg ;;
+
+ -cache-file | --cache-file | --cache-fil | --cache-fi \
+ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+ ac_prev=cache_file ;;
+ -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+ cache_file=$ac_optarg ;;
+
+ --config-cache | -C)
+ cache_file=config.cache ;;
+
+ -datadir | --datadir | --datadi | --datad | --data | --dat | --da)
+ ac_prev=datadir ;;
+ -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \
+ | --da=*)
+ datadir=$ac_optarg ;;
+
+ -disable-* | --disable-*)
+ ac_feature=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_feature" : ".*[^-_$as_cr_alnum]" >/dev/null &&
+ { echo "$as_me: error: invalid feature name: $ac_feature" >&2
+ { (exit 1); exit 1; }; }
+ ac_feature=`echo $ac_feature | sed 's/-/_/g'`
+ eval "enable_$ac_feature=no" ;;
+
+ -enable-* | --enable-*)
+ ac_feature=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_feature" : ".*[^-_$as_cr_alnum]" >/dev/null &&
+ { echo "$as_me: error: invalid feature name: $ac_feature" >&2
+ { (exit 1); exit 1; }; }
+ ac_feature=`echo $ac_feature | sed 's/-/_/g'`
+ case $ac_option in
+ *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;;
+ *) ac_optarg=yes ;;
+ esac
+ eval "enable_$ac_feature='$ac_optarg'" ;;
+
+ -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+ | --exec | --exe | --ex)
+ ac_prev=exec_prefix ;;
+ -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+ | --exec=* | --exe=* | --ex=*)
+ exec_prefix=$ac_optarg ;;
+
+ -gas | --gas | --ga | --g)
+ # Obsolete; use --with-gas.
+ with_gas=yes ;;
+
+ -help | --help | --hel | --he | -h)
+ ac_init_help=long ;;
+ -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+ ac_init_help=recursive ;;
+ -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+ ac_init_help=short ;;
+
+ -host | --host | --hos | --ho)
+ ac_prev=host_alias ;;
+ -host=* | --host=* | --hos=* | --ho=*)
+ host_alias=$ac_optarg ;;
+
+ -includedir | --includedir | --includedi | --included | --include \
+ | --includ | --inclu | --incl | --inc)
+ ac_prev=includedir ;;
+ -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+ | --includ=* | --inclu=* | --incl=* | --inc=*)
+ includedir=$ac_optarg ;;
+
+ -infodir | --infodir | --infodi | --infod | --info | --inf)
+ ac_prev=infodir ;;
+ -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+ infodir=$ac_optarg ;;
+
+ -libdir | --libdir | --libdi | --libd)
+ ac_prev=libdir ;;
+ -libdir=* | --libdir=* | --libdi=* | --libd=*)
+ libdir=$ac_optarg ;;
+
+ -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+ | --libexe | --libex | --libe)
+ ac_prev=libexecdir ;;
+ -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+ | --libexe=* | --libex=* | --libe=*)
+ libexecdir=$ac_optarg ;;
+
+ -localstatedir | --localstatedir | --localstatedi | --localstated \
+ | --localstate | --localstat | --localsta | --localst \
+ | --locals | --local | --loca | --loc | --lo)
+ ac_prev=localstatedir ;;
+ -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+ | --localstate=* | --localstat=* | --localsta=* | --localst=* \
+ | --locals=* | --local=* | --loca=* | --loc=* | --lo=*)
+ localstatedir=$ac_optarg ;;
+
+ -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+ ac_prev=mandir ;;
+ -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+ mandir=$ac_optarg ;;
+
+ -nfp | --nfp | --nf)
+ # Obsolete; use --without-fp.
+ with_fp=no ;;
+
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c | -n)
+ no_create=yes ;;
+
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+ no_recursion=yes ;;
+
+ -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+ | --oldin | --oldi | --old | --ol | --o)
+ ac_prev=oldincludedir ;;
+ -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+ oldincludedir=$ac_optarg ;;
+
+ -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+ ac_prev=prefix ;;
+ -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+ prefix=$ac_optarg ;;
+
+ -program-prefix | --program-prefix | --program-prefi | --program-pref \
+ | --program-pre | --program-pr | --program-p)
+ ac_prev=program_prefix ;;
+ -program-prefix=* | --program-prefix=* | --program-prefi=* \
+ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+ program_prefix=$ac_optarg ;;
+
+ -program-suffix | --program-suffix | --program-suffi | --program-suff \
+ | --program-suf | --program-su | --program-s)
+ ac_prev=program_suffix ;;
+ -program-suffix=* | --program-suffix=* | --program-suffi=* \
+ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+ program_suffix=$ac_optarg ;;
+
+ -program-transform-name | --program-transform-name \
+ | --program-transform-nam | --program-transform-na \
+ | --program-transform-n | --program-transform- \
+ | --program-transform | --program-transfor \
+ | --program-transfo | --program-transf \
+ | --program-trans | --program-tran \
+ | --progr-tra | --program-tr | --program-t)
+ ac_prev=program_transform_name ;;
+ -program-transform-name=* | --program-transform-name=* \
+ | --program-transform-nam=* | --program-transform-na=* \
+ | --program-transform-n=* | --program-transform-=* \
+ | --program-transform=* | --program-transfor=* \
+ | --program-transfo=* | --program-transf=* \
+ | --program-trans=* | --program-tran=* \
+ | --progr-tra=* | --program-tr=* | --program-t=*)
+ program_transform_name=$ac_optarg ;;
+
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ silent=yes ;;
+
+ -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+ ac_prev=sbindir ;;
+ -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+ | --sbi=* | --sb=*)
+ sbindir=$ac_optarg ;;
+
+ -sharedstatedir | --sharedstatedir | --sharedstatedi \
+ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+ | --sharedst | --shareds | --shared | --share | --shar \
+ | --sha | --sh)
+ ac_prev=sharedstatedir ;;
+ -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+ | --sha=* | --sh=*)
+ sharedstatedir=$ac_optarg ;;
+
+ -site | --site | --sit)
+ ac_prev=site ;;
+ -site=* | --site=* | --sit=*)
+ site=$ac_optarg ;;
+
+ -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+ ac_prev=srcdir ;;
+ -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+ srcdir=$ac_optarg ;;
+
+ -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+ | --syscon | --sysco | --sysc | --sys | --sy)
+ ac_prev=sysconfdir ;;
+ -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+ sysconfdir=$ac_optarg ;;
+
+ -target | --target | --targe | --targ | --tar | --ta | --t)
+ ac_prev=target_alias ;;
+ -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+ target_alias=$ac_optarg ;;
+
+ -v | -verbose | --verbose | --verbos | --verbo | --verb)
+ verbose=yes ;;
+
+ -version | --version | --versio | --versi | --vers | -V)
+ ac_init_version=: ;;
+
+ -with-* | --with-*)
+ ac_package=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_package" : ".*[^-_$as_cr_alnum]" >/dev/null &&
+ { echo "$as_me: error: invalid package name: $ac_package" >&2
+ { (exit 1); exit 1; }; }
+ ac_package=`echo $ac_package| sed 's/-/_/g'`
+ case $ac_option in
+ *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;;
+ *) ac_optarg=yes ;;
+ esac
+ eval "with_$ac_package='$ac_optarg'" ;;
+
+ -without-* | --without-*)
+ ac_package=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_package" : ".*[^-_$as_cr_alnum]" >/dev/null &&
+ { echo "$as_me: error: invalid package name: $ac_package" >&2
+ { (exit 1); exit 1; }; }
+ ac_package=`echo $ac_package | sed 's/-/_/g'`
+ eval "with_$ac_package=no" ;;
+
+ --x)
+ # Obsolete; use --with-x.
+ with_x=yes ;;
+
+ -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+ | --x-incl | --x-inc | --x-in | --x-i)
+ ac_prev=x_includes ;;
+ -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+ x_includes=$ac_optarg ;;
+
+ -x-libraries | --x-libraries | --x-librarie | --x-librari \
+ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+ ac_prev=x_libraries ;;
+ -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+ x_libraries=$ac_optarg ;;
+
+ -*) { echo "$as_me: error: unrecognized option: $ac_option
+Try \`$0 --help' for more information." >&2
+ { (exit 1); exit 1; }; }
+ ;;
+
+ *=*)
+ ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null &&
+ { echo "$as_me: error: invalid variable name: $ac_envvar" >&2
+ { (exit 1); exit 1; }; }
+ ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`
+ eval "$ac_envvar='$ac_optarg'"
+ export $ac_envvar ;;
+
+ *)
+ # FIXME: should be removed in autoconf 3.0.
+ echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+ expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+ echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+ : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}
+ ;;
+
+ esac
+done
+
+if test -n "$ac_prev"; then
+ ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+ { echo "$as_me: error: missing argument to $ac_option" >&2
+ { (exit 1); exit 1; }; }
+fi
+
+# Be sure to have absolute paths.
+for ac_var in exec_prefix prefix
+do
+ eval ac_val=$`echo $ac_var`
+ case $ac_val in
+ [\\/$]* | ?:[\\/]* | NONE | '' ) ;;
+ *) { echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2
+ { (exit 1); exit 1; }; };;
+ esac
+done
+
+# Be sure to have absolute paths.
+for ac_var in bindir sbindir libexecdir datadir sysconfdir sharedstatedir \
+ localstatedir libdir includedir oldincludedir infodir mandir
+do
+ eval ac_val=$`echo $ac_var`
+ case $ac_val in
+ [\\/$]* | ?:[\\/]* ) ;;
+ *) { echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2
+ { (exit 1); exit 1; }; };;
+ esac
+done
+
+# There might be people who depend on the old broken behavior: `$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+if test "x$host_alias" != x; then
+ if test "x$build_alias" = x; then
+ cross_compiling=maybe
+ echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host.
+ If a cross compiler is detected then cross compile mode will be used." >&2
+ elif test "x$build_alias" != "x$host_alias"; then
+ cross_compiling=yes
+ fi
+fi
+
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+test "$silent" = yes && exec 6>/dev/null
+
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+ ac_srcdir_defaulted=yes
+ # Try the directory containing this script, then its parent.
+ ac_confdir=`(dirname "$0") 2>/dev/null ||
+$as_expr X"$0" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$0" : 'X\(//\)[^/]' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| \
+ . : '\(.\)' 2>/dev/null ||
+echo X"$0" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; }
+ /^X\(\/\/\)[^/].*/{ s//\1/; q; }
+ /^X\(\/\/\)$/{ s//\1/; q; }
+ /^X\(\/\).*/{ s//\1/; q; }
+ s/.*/./; q'`
+ srcdir=$ac_confdir
+ if test ! -r $srcdir/$ac_unique_file; then
+ srcdir=..
+ fi
+else
+ ac_srcdir_defaulted=no
+fi
+if test ! -r $srcdir/$ac_unique_file; then
+ if test "$ac_srcdir_defaulted" = yes; then
+ { echo "$as_me: error: cannot find sources ($ac_unique_file) in $ac_confdir or .." >&2
+ { (exit 1); exit 1; }; }
+ else
+ { echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2
+ { (exit 1); exit 1; }; }
+ fi
+fi
+(cd $srcdir && test -r ./$ac_unique_file) 2>/dev/null ||
+ { echo "$as_me: error: sources are in $srcdir, but \`cd $srcdir' does not work" >&2
+ { (exit 1); exit 1; }; }
+srcdir=`echo "$srcdir" | sed 's%\([^\\/]\)[\\/]*$%\1%'`
+ac_env_build_alias_set=${build_alias+set}
+ac_env_build_alias_value=$build_alias
+ac_cv_env_build_alias_set=${build_alias+set}
+ac_cv_env_build_alias_value=$build_alias
+ac_env_host_alias_set=${host_alias+set}
+ac_env_host_alias_value=$host_alias
+ac_cv_env_host_alias_set=${host_alias+set}
+ac_cv_env_host_alias_value=$host_alias
+ac_env_target_alias_set=${target_alias+set}
+ac_env_target_alias_value=$target_alias
+ac_cv_env_target_alias_set=${target_alias+set}
+ac_cv_env_target_alias_value=$target_alias
+
+#
+# Report the --help message.
+#
+if test "$ac_init_help" = "long"; then
+ # Omit some internal or obsolete options to make the list less imposing.
+ # This message is too long to be a string in the A/UX 3.1 sh.
+ cat <<_ACEOF
+\`configure' configures this package to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE. See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+ -h, --help display this help and exit
+ --help=short display options specific to this package
+ --help=recursive display the short help of all the included packages
+ -V, --version display version information and exit
+ -q, --quiet, --silent do not print \`checking...' messages
+ --cache-file=FILE cache test results in FILE [disabled]
+ -C, --config-cache alias for \`--cache-file=config.cache'
+ -n, --no-create do not create output files
+ --srcdir=DIR find the sources in DIR [configure dir or \`..']
+
+_ACEOF
+
+ cat <<_ACEOF
+Installation directories:
+ --prefix=PREFIX install architecture-independent files in PREFIX
+ [$ac_default_prefix]
+ --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
+ [PREFIX]
+
+By default, \`make install' will install all the files in
+\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify
+an installation prefix other than \`$ac_default_prefix' using \`--prefix',
+for instance \`--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+ --bindir=DIR user executables [EPREFIX/bin]
+ --sbindir=DIR system admin executables [EPREFIX/sbin]
+ --libexecdir=DIR program executables [EPREFIX/libexec]
+ --datadir=DIR read-only architecture-independent data [PREFIX/share]
+ --sysconfdir=DIR read-only single-machine data [PREFIX/etc]
+ --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
+ --localstatedir=DIR modifiable single-machine data [PREFIX/var]
+ --libdir=DIR object code libraries [EPREFIX/lib]
+ --includedir=DIR C header files [PREFIX/include]
+ --oldincludedir=DIR C header files for non-gcc [/usr/include]
+ --infodir=DIR info documentation [PREFIX/info]
+ --mandir=DIR man documentation [PREFIX/man]
+_ACEOF
+
+ cat <<\_ACEOF
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then
+
+ cat <<\_ACEOF
+
+_ACEOF
+fi
+
+if test "$ac_init_help" = "recursive"; then
+ # If there are subdirs, report their specific --help.
+ ac_popdir=`pwd`
+ for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+ test -d $ac_dir || continue
+ ac_builddir=.
+
+if test "$ac_dir" != .; then
+ ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'`
+ # A "../" for each directory in $ac_dir_suffix.
+ ac_top_builddir=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,../,g'`
+else
+ ac_dir_suffix= ac_top_builddir=
+fi
+
+case $srcdir in
+ .) # No --srcdir option. We are building in place.
+ ac_srcdir=.
+ if test -z "$ac_top_builddir"; then
+ ac_top_srcdir=.
+ else
+ ac_top_srcdir=`echo $ac_top_builddir | sed 's,/$,,'`
+ fi ;;
+ [\\/]* | ?:[\\/]* ) # Absolute path.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir ;;
+ *) # Relative path.
+ ac_srcdir=$ac_top_builddir$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_builddir$srcdir ;;
+esac
+
+# Do not use `cd foo && pwd` to compute absolute paths, because
+# the directories may not exist.
+case `pwd` in
+.) ac_abs_builddir="$ac_dir";;
+*)
+ case "$ac_dir" in
+ .) ac_abs_builddir=`pwd`;;
+ [\\/]* | ?:[\\/]* ) ac_abs_builddir="$ac_dir";;
+ *) ac_abs_builddir=`pwd`/"$ac_dir";;
+ esac;;
+esac
+case $ac_abs_builddir in
+.) ac_abs_top_builddir=${ac_top_builddir}.;;
+*)
+ case ${ac_top_builddir}. in
+ .) ac_abs_top_builddir=$ac_abs_builddir;;
+ [\\/]* | ?:[\\/]* ) ac_abs_top_builddir=${ac_top_builddir}.;;
+ *) ac_abs_top_builddir=$ac_abs_builddir/${ac_top_builddir}.;;
+ esac;;
+esac
+case $ac_abs_builddir in
+.) ac_abs_srcdir=$ac_srcdir;;
+*)
+ case $ac_srcdir in
+ .) ac_abs_srcdir=$ac_abs_builddir;;
+ [\\/]* | ?:[\\/]* ) ac_abs_srcdir=$ac_srcdir;;
+ *) ac_abs_srcdir=$ac_abs_builddir/$ac_srcdir;;
+ esac;;
+esac
+case $ac_abs_builddir in
+.) ac_abs_top_srcdir=$ac_top_srcdir;;
+*)
+ case $ac_top_srcdir in
+ .) ac_abs_top_srcdir=$ac_abs_builddir;;
+ [\\/]* | ?:[\\/]* ) ac_abs_top_srcdir=$ac_top_srcdir;;
+ *) ac_abs_top_srcdir=$ac_abs_builddir/$ac_top_srcdir;;
+ esac;;
+esac
+
+ cd $ac_dir
+ # Check for guested configure; otherwise get Cygnus style configure.
+ if test -f $ac_srcdir/configure.gnu; then
+ echo
+ $SHELL $ac_srcdir/configure.gnu --help=recursive
+ elif test -f $ac_srcdir/configure; then
+ echo
+ $SHELL $ac_srcdir/configure --help=recursive
+ elif test -f $ac_srcdir/configure.ac ||
+ test -f $ac_srcdir/configure.in; then
+ echo
+ $ac_configure --help
+ else
+ echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+ fi
+ cd $ac_popdir
+ done
+fi
+
+test -n "$ac_init_help" && exit 0
+if $ac_init_version; then
+ cat <<\_ACEOF
+
+Copyright (C) 2003 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+_ACEOF
+ exit 0
+fi
+exec 5>config.log
+cat >&5 <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by $as_me, which was
+generated by GNU Autoconf 2.59. Invocation command line was
+
+ $ $0 $@
+
+_ACEOF
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown`
+
+/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown`
+/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+hostinfo = `(hostinfo) 2>/dev/null || echo unknown`
+/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown`
+/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown`
+/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ echo "PATH: $as_dir"
+done
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_sep=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+ for ac_arg
+ do
+ case $ac_arg in
+ -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ continue ;;
+ *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?\"\']*)
+ ac_arg=`echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ esac
+ case $ac_pass in
+ 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;;
+ 2)
+ ac_configure_args1="$ac_configure_args1 '$ac_arg'"
+ if test $ac_must_keep_next = true; then
+ ac_must_keep_next=false # Got value, back to normal.
+ else
+ case $ac_arg in
+ *=* | --config-cache | -C | -disable-* | --disable-* \
+ | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+ | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+ | -with-* | --with-* | -without-* | --without-* | --x)
+ case "$ac_configure_args0 " in
+ "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+ esac
+ ;;
+ -* ) ac_must_keep_next=true ;;
+ esac
+ fi
+ ac_configure_args="$ac_configure_args$ac_sep'$ac_arg'"
+ # Get rid of the leading space.
+ ac_sep=" "
+ ;;
+ esac
+ done
+done
+$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; }
+$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; }
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log. We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Be sure not to use single quotes in there, as some shells,
+# such as our DU 5.0 friend, will then `close' the trap.
+trap 'exit_status=$?
+ # Save into config.log some information that might help in debugging.
+ {
+ echo
+
+ cat <<\_ASBOX
+## ---------------- ##
+## Cache variables. ##
+## ---------------- ##
+_ASBOX
+ echo
+ # The following way of writing the cache mishandles newlines in values,
+{
+ (set) 2>&1 |
+ case `(ac_space='"'"' '"'"'; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ sed -n \
+ "s/'"'"'/'"'"'\\\\'"'"''"'"'/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='"'"'\\2'"'"'/p"
+ ;;
+ *)
+ sed -n \
+ "s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1=\\2/p"
+ ;;
+ esac;
+}
+ echo
+
+ cat <<\_ASBOX
+## ----------------- ##
+## Output variables. ##
+## ----------------- ##
+_ASBOX
+ echo
+ for ac_var in $ac_subst_vars
+ do
+ eval ac_val=$`echo $ac_var`
+ echo "$ac_var='"'"'$ac_val'"'"'"
+ done | sort
+ echo
+
+ if test -n "$ac_subst_files"; then
+ cat <<\_ASBOX
+## ------------- ##
+## Output files. ##
+## ------------- ##
+_ASBOX
+ echo
+ for ac_var in $ac_subst_files
+ do
+ eval ac_val=$`echo $ac_var`
+ echo "$ac_var='"'"'$ac_val'"'"'"
+ done | sort
+ echo
+ fi
+
+ if test -s confdefs.h; then
+ cat <<\_ASBOX
+## ----------- ##
+## confdefs.h. ##
+## ----------- ##
+_ASBOX
+ echo
+ sed "/^$/d" confdefs.h | sort
+ echo
+ fi
+ test "$ac_signal" != 0 &&
+ echo "$as_me: caught signal $ac_signal"
+ echo "$as_me: exit $exit_status"
+ } >&5
+ rm -f core *.core &&
+ rm -rf conftest* confdefs* conf$$* $ac_clean_files &&
+ exit $exit_status
+ ' 0
+for ac_signal in 1 2 13 15; do
+ trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -rf conftest* confdefs.h
+# AIX cpp loses on an empty file, so make sure it contains at least a newline.
+echo >confdefs.h
+
+# Predefined preprocessor variables.
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_NAME "$PACKAGE_NAME"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_VERSION "$PACKAGE_VERSION"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_STRING "$PACKAGE_STRING"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
+_ACEOF
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer explicitly selected file to automatically selected ones.
+if test -z "$CONFIG_SITE"; then
+ if test "x$prefix" != xNONE; then
+ CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site"
+ else
+ CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site"
+ fi
+fi
+for ac_site_file in $CONFIG_SITE; do
+ if test -r "$ac_site_file"; then
+ { echo "$as_me:$LINENO: loading site script $ac_site_file" >&5
+echo "$as_me: loading site script $ac_site_file" >&6;}
+ sed 's/^/| /' "$ac_site_file" >&5
+ . "$ac_site_file"
+ fi
+done
+
+if test -r "$cache_file"; then
+ # Some versions of bash will fail to source /dev/null (special
+ # files actually), so we avoid doing that.
+ if test -f "$cache_file"; then
+ { echo "$as_me:$LINENO: loading cache $cache_file" >&5
+echo "$as_me: loading cache $cache_file" >&6;}
+ case $cache_file in
+ [\\/]* | ?:[\\/]* ) . $cache_file;;
+ *) . ./$cache_file;;
+ esac
+ fi
+else
+ { echo "$as_me:$LINENO: creating cache $cache_file" >&5
+echo "$as_me: creating cache $cache_file" >&6;}
+ >$cache_file
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in `(set) 2>&1 |
+ sed -n 's/^ac_env_\([a-zA-Z_0-9]*\)_set=.*/\1/p'`; do
+ eval ac_old_set=\$ac_cv_env_${ac_var}_set
+ eval ac_new_set=\$ac_env_${ac_var}_set
+ eval ac_old_val="\$ac_cv_env_${ac_var}_value"
+ eval ac_new_val="\$ac_env_${ac_var}_value"
+ case $ac_old_set,$ac_new_set in
+ set,)
+ { echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,set)
+ { echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5
+echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,);;
+ *)
+ if test "x$ac_old_val" != "x$ac_new_val"; then
+ { echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5
+echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
+ { echo "$as_me:$LINENO: former value: $ac_old_val" >&5
+echo "$as_me: former value: $ac_old_val" >&2;}
+ { echo "$as_me:$LINENO: current value: $ac_new_val" >&5
+echo "$as_me: current value: $ac_new_val" >&2;}
+ ac_cache_corrupted=:
+ fi;;
+ esac
+ # Pass precious variables to config.status.
+ if test "$ac_new_set" = set; then
+ case $ac_new_val in
+ *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?\"\']*)
+ ac_arg=$ac_var=`echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+ *) ac_arg=$ac_var=$ac_new_val ;;
+ esac
+ case " $ac_configure_args " in
+ *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy.
+ *) ac_configure_args="$ac_configure_args '$ac_arg'" ;;
+ esac
+ fi
+done
+if $ac_cache_corrupted; then
+ { echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5
+echo "$as_me: error: changes in the environment can compromise the build" >&2;}
+ { { echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5
+echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;}
+ { (exit 1); exit 1; }; }
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+VERSION=1.4
+RELEASE=1
+
+
+
+
+ ac_config_files="$ac_config_files Makefile rds-tools.spec"
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems. If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# `ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+{
+ (set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote
+ # substitution turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ "s/'/'\\\\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n \
+ "s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1=\\2/p"
+ ;;
+ esac;
+} |
+ sed '
+ t clear
+ : clear
+ s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+ t end
+ /^ac_cv_env/!s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+ : end' >>confcache
+if diff $cache_file confcache >/dev/null 2>&1; then :; else
+ if test -w $cache_file; then
+ test "x$cache_file" != "x/dev/null" && echo "updating cache $cache_file"
+ cat confcache >$cache_file
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# VPATH may cause trouble with some makes, so we remove $(srcdir),
+# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+ ac_vpsub='/^[ ]*VPATH[ ]*=/{
+s/:*\$(srcdir):*/:/;
+s/:*\${srcdir}:*/:/;
+s/:*@srcdir@:*/:/;
+s/^\([^=]*=[ ]*\):*/\1/;
+s/:*$//;
+s/^[^=]*=[ ]*$//;
+}'
+fi
+
+# Transform confdefs.h into DEFS.
+# Protect against shell expansion while executing Makefile rules.
+# Protect against Makefile macro expansion.
+#
+# If the first sed substitution is executed (which looks for macros that
+# take arguments), then we branch to the quote section. Otherwise,
+# look for a macro that doesn't take arguments.
+cat >confdef2opt.sed <<\_ACEOF
+t clear
+: clear
+s,^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\),-D\1=\2,g
+t quote
+s,^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\),-D\1=\2,g
+t quote
+d
+: quote
+s,[ `~#$^&*(){}\\|;'"<>?],\\&,g
+s,\[,\\&,g
+s,\],\\&,g
+s,\$,$$,g
+p
+_ACEOF
+# We use echo to avoid assuming a particular line-breaking character.
+# The extra dot is to prevent the shell from consuming trailing
+# line-breaks from the sub-command output. A line-break within
+# single-quotes doesn't work because, if this script is created in a
+# platform that uses two characters for line-breaks (e.g., DOS), tr
+# would break.
+ac_LF_and_DOT=`echo; echo .`
+DEFS=`sed -n -f confdef2opt.sed confdefs.h | tr "$ac_LF_and_DOT" ' .'`
+rm -f confdef2opt.sed
+
+
+ac_libobjs=
+ac_ltlibobjs=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+ # 1. Remove the extension, and $U if already installed.
+ ac_i=`echo "$ac_i" |
+ sed 's/\$U\././;s/\.o$//;s/\.obj$//'`
+ # 2. Add them.
+ ac_libobjs="$ac_libobjs $ac_i\$U.$ac_objext"
+ ac_ltlibobjs="$ac_ltlibobjs $ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+
+: ${CONFIG_STATUS=./config.status}
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5
+echo "$as_me: creating $CONFIG_STATUS" >&6;}
+cat >$CONFIG_STATUS <<_ACEOF
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+SHELL=\${CONFIG_SHELL-$SHELL}
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+## --------------------- ##
+## M4sh Initialization. ##
+## --------------------- ##
+
+# Be Bourne compatible
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+elif test -n "${BASH_VERSION+set}" && (set -o posix) >/dev/null 2>&1; then
+ set -o posix
+fi
+DUALCASE=1; export DUALCASE # for MKS sh
+
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+ as_unset=unset
+else
+ as_unset=false
+fi
+
+
+# Work around bugs in pre-3.0 UWIN ksh.
+$as_unset ENV MAIL MAILPATH
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+for as_var in \
+ LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \
+ LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \
+ LC_TELEPHONE LC_TIME
+do
+ if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then
+ eval $as_var=C; export $as_var
+ else
+ $as_unset $as_var
+ fi
+done
+
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename /) >/dev/null 2>&1 && test "X`basename / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)$' \| \
+ . : '\(.\)' 2>/dev/null ||
+echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/; q; }
+ /^X\/\(\/\/\)$/{ s//\1/; q; }
+ /^X\/\(\/\).*/{ s//\1/; q; }
+ s/.*/./; q'`
+
+
+# PATH needs CR, and LINENO needs CR and PATH.
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ echo "#! /bin/sh" >conf$$.sh
+ echo "exit 0" >>conf$$.sh
+ chmod +x conf$$.sh
+ if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then
+ PATH_SEPARATOR=';'
+ else
+ PATH_SEPARATOR=:
+ fi
+ rm -f conf$$.sh
+fi
+
+
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null`
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x$as_lineno_3" = "x$as_lineno_2" || {
+ # Find who we are. Look in the path if we contain no path at all
+ # relative or not.
+ case $0 in
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
+
+ ;;
+ esac
+ # We did not find ourselves, most probably we were run as `sh COMMAND'
+ # in which case we are not to be found in the path.
+ if test "x$as_myself" = x; then
+ as_myself=$0
+ fi
+ if test ! -f "$as_myself"; then
+ { { echo "$as_me:$LINENO: error: cannot find myself; rerun with an absolute path" >&5
+echo "$as_me: error: cannot find myself; rerun with an absolute path" >&2;}
+ { (exit 1); exit 1; }; }
+ fi
+ case $CONFIG_SHELL in
+ '')
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for as_base in sh bash ksh sh5; do
+ case $as_dir in
+ /*)
+ if ("$as_dir/$as_base" -c '
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null`
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x$as_lineno_3" = "x$as_lineno_2" ') 2>/dev/null; then
+ $as_unset BASH_ENV || test "${BASH_ENV+set}" != set || { BASH_ENV=; export BASH_ENV; }
+ $as_unset ENV || test "${ENV+set}" != set || { ENV=; export ENV; }
+ CONFIG_SHELL=$as_dir/$as_base
+ export CONFIG_SHELL
+ exec "$CONFIG_SHELL" "$0" ${1+"$@"}
+ fi;;
+ esac
+ done
+done
+;;
+ esac
+
+ # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+ # uniformly replaced by the line number. The first 'sed' inserts a
+ # line-number line before each line; the second 'sed' does the real
+ # work. The second script uses 'N' to pair each line-number line
+ # with the numbered line, and appends trailing '-' during
+ # substitution so that $LINENO is not a special case at line end.
+ # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+ # second 'sed' script. Blame Lee E. McMahon for sed's syntax. :-)
+ sed '=' <$as_myself |
+ sed '
+ N
+ s,$,-,
+ : loop
+ s,^\(['$as_cr_digits']*\)\(.*\)[$]LINENO\([^'$as_cr_alnum'_]\),\1\2\1\3,
+ t loop
+ s,-$,,
+ s,^['$as_cr_digits']*\n,,
+ ' >$as_me.lineno &&
+ chmod +x $as_me.lineno ||
+ { { echo "$as_me:$LINENO: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&5
+echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2;}
+ { (exit 1); exit 1; }; }
+
+ # Don't try to exec as it changes $0, causing all sorts of problems
+ # (the dirname of $0 is not the place where we might find the
+ # original and so on). Autoconf is especially sensitive to this.
+ . ./$as_me.lineno
+ # Exit status is that of the last command.
+ exit
+}
+
+
+case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in
+ *c*,-n*) ECHO_N= ECHO_C='
+' ECHO_T=' ' ;;
+ *c*,* ) ECHO_N=-n ECHO_C= ECHO_T= ;;
+ *) ECHO_N= ECHO_C='\c' ECHO_T= ;;
+esac
+
+if expr a : '\(a\)' >/dev/null 2>&1; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+rm -f conf$$ conf$$.exe conf$$.file
+echo >conf$$.file
+if ln -s conf$$.file conf$$ 2>/dev/null; then
+ # We could just check for DJGPP; but this test a) works b) is more generic
+ # and c) will remain valid once DJGPP supports symlinks (DJGPP 2.04).
+ if test -f conf$$.exe; then
+ # Don't use ln at all; we don't have any links
+ as_ln_s='cp -p'
+ else
+ as_ln_s='ln -s'
+ fi
+elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+else
+ as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.file
+
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p=:
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+as_executable_p="test -f"
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+# IFS
+# We need space, tab and new line, in precisely that order.
+as_nl='
+'
+IFS=" $as_nl"
+
+# CDPATH.
+$as_unset CDPATH
+
+exec 6>&1
+
+# Open the log real soon, to keep $0 and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling. Logging --version etc. is OK.
+exec 5>>config.log
+{
+ echo
+ sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+} >&5
+cat >&5 <<_CSEOF
+
+This file was extended by $as_me, which was
+generated by GNU Autoconf 2.59. Invocation command line was
+
+ CONFIG_FILES = $CONFIG_FILES
+ CONFIG_HEADERS = $CONFIG_HEADERS
+ CONFIG_LINKS = $CONFIG_LINKS
+ CONFIG_COMMANDS = $CONFIG_COMMANDS
+ $ $0 $@
+
+_CSEOF
+echo "on `(hostname || uname -n) 2>/dev/null | sed 1q`" >&5
+echo >&5
+_ACEOF
+
+# Files that config.status was made for.
+if test -n "$ac_config_files"; then
+ echo "config_files=\"$ac_config_files\"" >>$CONFIG_STATUS
+fi
+
+if test -n "$ac_config_headers"; then
+ echo "config_headers=\"$ac_config_headers\"" >>$CONFIG_STATUS
+fi
+
+if test -n "$ac_config_links"; then
+ echo "config_links=\"$ac_config_links\"" >>$CONFIG_STATUS
+fi
+
+if test -n "$ac_config_commands"; then
+ echo "config_commands=\"$ac_config_commands\"" >>$CONFIG_STATUS
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+
+ac_cs_usage="\
+\`$as_me' instantiates files from templates according to the
+current configuration.
+
+Usage: $0 [OPTIONS] [FILE]...
+
+ -h, --help print this help, then exit
+ -V, --version print version number, then exit
+ -q, --quiet do not print progress messages
+ -d, --debug don't remove temporary files
+ --recheck update $as_me by reconfiguring in the same conditions
+ --file=FILE[:TEMPLATE]
+ instantiate the configuration file FILE
+
+Configuration files:
+$config_files
+
+Report bugs to <bug-autoconf@gnu.org>."
+_ACEOF
+
+cat >>$CONFIG_STATUS <<_ACEOF
+ac_cs_version="\\
+config.status
+configured by $0, generated by GNU Autoconf 2.59,
+ with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\"
+
+Copyright (C) 2003 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+srcdir=$srcdir
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+# If no files are specified by the user, then we need to provide default
+# values. But we need to know if files were specified by the user.
+ac_need_defaults=:
+while test $# != 0
+do
+ case $1 in
+ --*=*)
+ ac_option=`expr "x$1" : 'x\([^=]*\)='`
+ ac_optarg=`expr "x$1" : 'x[^=]*=\(.*\)'`
+ ac_shift=:
+ ;;
+ -*)
+ ac_option=$1
+ ac_optarg=$2
+ ac_shift=shift
+ ;;
+ *) # This is not an option, so the user has probably given explicit
+ # arguments.
+ ac_option=$1
+ ac_need_defaults=false;;
+ esac
+
+ case $ac_option in
+ # Handling of the options.
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+ ac_cs_recheck=: ;;
+ --version | --vers* | -V )
+ echo "$ac_cs_version"; exit 0 ;;
+ --he | --h)
+ # Conflict between --help and --header
+ { { echo "$as_me:$LINENO: error: ambiguous option: $1
+Try \`$0 --help' for more information." >&5
+echo "$as_me: error: ambiguous option: $1
+Try \`$0 --help' for more information." >&2;}
+ { (exit 1); exit 1; }; };;
+ --help | --hel | -h )
+ echo "$ac_cs_usage"; exit 0 ;;
+ --debug | --d* | -d )
+ debug=: ;;
+ --file | --fil | --fi | --f )
+ $ac_shift
+ CONFIG_FILES="$CONFIG_FILES $ac_optarg"
+ ac_need_defaults=false;;
+ --header | --heade | --head | --hea )
+ $ac_shift
+ CONFIG_HEADERS="$CONFIG_HEADERS $ac_optarg"
+ ac_need_defaults=false;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil | --si | --s)
+ ac_cs_silent=: ;;
+
+ # This is an error.
+ -*) { { echo "$as_me:$LINENO: error: unrecognized option: $1
+Try \`$0 --help' for more information." >&5
+echo "$as_me: error: unrecognized option: $1
+Try \`$0 --help' for more information." >&2;}
+ { (exit 1); exit 1; }; } ;;
+
+ *) ac_config_targets="$ac_config_targets $1" ;;
+
+ esac
+ shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+ exec 6>/dev/null
+ ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF
+if \$ac_cs_recheck; then
+ echo "running $SHELL $0 " $ac_configure_args \$ac_configure_extra_args " --no-create --no-recursion" >&6
+ exec $SHELL $0 $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+fi
+
+_ACEOF
+
+
+
+
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+for ac_config_target in $ac_config_targets
+do
+ case "$ac_config_target" in
+ # Handling of arguments.
+ "Makefile" ) CONFIG_FILES="$CONFIG_FILES Makefile" ;;
+ "rds-tools.spec" ) CONFIG_FILES="$CONFIG_FILES rds-tools.spec" ;;
+ *) { { echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5
+echo "$as_me: error: invalid argument: $ac_config_target" >&2;}
+ { (exit 1); exit 1; }; };;
+ esac
+done
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used. Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+ test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+fi
+
+# Have a temporary directory for convenience. Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Create a temporary directory, and hook for its removal unless debugging.
+$debug ||
+{
+ trap 'exit_status=$?; rm -rf $tmp && exit $exit_status' 0
+ trap '{ (exit 1); exit 1; }' 1 2 13 15
+}
+
+# Create a (secure) tmp directory for tmp files: mktemp first, then plain mkdir, else fail.
+
+{
+ tmp=`(umask 077 && mktemp -d -q "./confstatXXXXXX") 2>/dev/null` &&
+ test -n "$tmp" && test -d "$tmp"
+} ||
+{
+ tmp=./confstat$$-$RANDOM
+ (umask 077 && mkdir $tmp)
+} ||
+{
+ echo "$as_me: cannot create a temporary directory in ." >&2
+ { (exit 1); exit 1; }
+}
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<_ACEOF
+
+#
+# CONFIG_FILES section.
+#
+
+# No need to generate the scripts if there are no CONFIG_FILES.
+# This happens for instance when ./config.status config.h
+if test -n "\$CONFIG_FILES"; then
+ # Protect against being on the right side of a sed subst in config.status.
+ sed 's/,@/@@/; s/@,/@@/; s/,;t t\$/@;t t/; /@;t t\$/s/[\\\\&,]/\\\\&/g;
+ s/@@/,@/; s/@@/@,/; s/@;t t\$/,;t t/' >\$tmp/subs.sed <<\\CEOF
+s,@SHELL@,$SHELL,;t t
+s,@PATH_SEPARATOR@,$PATH_SEPARATOR,;t t
+s,@PACKAGE_NAME@,$PACKAGE_NAME,;t t
+s,@PACKAGE_TARNAME@,$PACKAGE_TARNAME,;t t
+s,@PACKAGE_VERSION@,$PACKAGE_VERSION,;t t
+s,@PACKAGE_STRING@,$PACKAGE_STRING,;t t
+s,@PACKAGE_BUGREPORT@,$PACKAGE_BUGREPORT,;t t
+s,@exec_prefix@,$exec_prefix,;t t
+s,@prefix@,$prefix,;t t
+s,@program_transform_name@,$program_transform_name,;t t
+s,@bindir@,$bindir,;t t
+s,@sbindir@,$sbindir,;t t
+s,@libexecdir@,$libexecdir,;t t
+s,@datadir@,$datadir,;t t
+s,@sysconfdir@,$sysconfdir,;t t
+s,@sharedstatedir@,$sharedstatedir,;t t
+s,@localstatedir@,$localstatedir,;t t
+s,@libdir@,$libdir,;t t
+s,@includedir@,$includedir,;t t
+s,@oldincludedir@,$oldincludedir,;t t
+s,@infodir@,$infodir,;t t
+s,@mandir@,$mandir,;t t
+s,@build_alias@,$build_alias,;t t
+s,@host_alias@,$host_alias,;t t
+s,@target_alias@,$target_alias,;t t
+s,@DEFS@,$DEFS,;t t
+s,@ECHO_C@,$ECHO_C,;t t
+s,@ECHO_N@,$ECHO_N,;t t
+s,@ECHO_T@,$ECHO_T,;t t
+s,@LIBS@,$LIBS,;t t
+s,@VERSION@,$VERSION,;t t
+s,@RELEASE@,$RELEASE,;t t
+s,@LIBOBJS@,$LIBOBJS,;t t
+s,@LTLIBOBJS@,$LTLIBOBJS,;t t
+CEOF
+
+_ACEOF
+
+ cat >>$CONFIG_STATUS <<\_ACEOF
+ # Split the substitutions into bite-sized pieces for seds with
+ # small command number limits, like on Digital OSF/1 and HP-UX.
+ ac_max_sed_lines=48
+ ac_sed_frag=1 # Number of current file.
+ ac_beg=1 # First line for current file.
+ ac_end=$ac_max_sed_lines # Line after last line for current file.
+ ac_more_lines=:
+ ac_sed_cmds=
+ while $ac_more_lines; do
+ if test $ac_beg -gt 1; then
+ sed "1,${ac_beg}d; ${ac_end}q" $tmp/subs.sed >$tmp/subs.frag
+ else
+ sed "${ac_end}q" $tmp/subs.sed >$tmp/subs.frag
+ fi
+ if test ! -s $tmp/subs.frag; then
+ ac_more_lines=false
+ else
+ # The purpose of the label and of the branching condition is to
+ # speed up the sed processing (if there are no `@' at all, there
+ # is no need to browse any of the substitutions).
+ # These are the two extra sed commands mentioned above.
+ (echo ':t
+ /@[a-zA-Z_][a-zA-Z_0-9]*@/!b' && cat $tmp/subs.frag) >$tmp/subs-$ac_sed_frag.sed
+ if test -z "$ac_sed_cmds"; then
+ ac_sed_cmds="sed -f $tmp/subs-$ac_sed_frag.sed"
+ else
+ ac_sed_cmds="$ac_sed_cmds | sed -f $tmp/subs-$ac_sed_frag.sed"
+ fi
+ ac_sed_frag=`expr $ac_sed_frag + 1`
+ ac_beg=$ac_end
+ ac_end=`expr $ac_end + $ac_max_sed_lines`
+ fi
+ done
+ if test -z "$ac_sed_cmds"; then
+ ac_sed_cmds=cat
+ fi
+fi # test -n "$CONFIG_FILES"
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF
+for ac_file in : $CONFIG_FILES; do test "x$ac_file" = x: && continue
+ # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
+ case $ac_file in
+ - | *:- | *:-:* ) # input from stdin
+ cat >$tmp/stdin
+ ac_file_in=`echo "$ac_file" | sed 's,[^:]*:,,'`
+ ac_file=`echo "$ac_file" | sed 's,:.*,,'` ;;
+ *:* ) ac_file_in=`echo "$ac_file" | sed 's,[^:]*:,,'`
+ ac_file=`echo "$ac_file" | sed 's,:.*,,'` ;;
+ * ) ac_file_in=$ac_file.in ;;
+ esac
+
+ # Compute @srcdir@, @top_srcdir@, and @INSTALL@ for subdirectories.
+ ac_dir=`(dirname "$ac_file") 2>/dev/null ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$ac_file" : 'X\(//\)[^/]' \| \
+ X"$ac_file" : 'X\(//\)$' \| \
+ X"$ac_file" : 'X\(/\)' \| \
+ . : '\(.\)' 2>/dev/null ||
+echo X"$ac_file" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; }
+ /^X\(\/\/\)[^/].*/{ s//\1/; q; }
+ /^X\(\/\/\)$/{ s//\1/; q; }
+ /^X\(\/\).*/{ s//\1/; q; }
+ s/.*/./; q'`
+ { if $as_mkdir_p; then
+ mkdir -p "$ac_dir"
+ else
+ as_dir="$ac_dir"
+ as_dirs=
+ while test ! -d "$as_dir"; do
+ as_dirs="$as_dir $as_dirs"
+ as_dir=`(dirname "$as_dir") 2>/dev/null ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_dir" : 'X\(//\)[^/]' \| \
+ X"$as_dir" : 'X\(//\)$' \| \
+ X"$as_dir" : 'X\(/\)' \| \
+ . : '\(.\)' 2>/dev/null ||
+echo X"$as_dir" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; }
+ /^X\(\/\/\)[^/].*/{ s//\1/; q; }
+ /^X\(\/\/\)$/{ s//\1/; q; }
+ /^X\(\/\).*/{ s//\1/; q; }
+ s/.*/./; q'`
+ done
+ test ! -n "$as_dirs" || mkdir $as_dirs
+ fi || { { echo "$as_me:$LINENO: error: cannot create directory \"$ac_dir\"" >&5
+echo "$as_me: error: cannot create directory \"$ac_dir\"" >&2;}
+ { (exit 1); exit 1; }; }; }
+
+ ac_builddir=.
+
+if test "$ac_dir" != .; then
+ ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'`
+ # A "../" for each directory in $ac_dir_suffix.
+ ac_top_builddir=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,../,g'`
+else
+ ac_dir_suffix= ac_top_builddir=
+fi
+
+case $srcdir in
+ .) # No --srcdir option. We are building in place.
+ ac_srcdir=.
+ if test -z "$ac_top_builddir"; then
+ ac_top_srcdir=.
+ else
+ ac_top_srcdir=`echo $ac_top_builddir | sed 's,/$,,'`
+ fi ;;
+ [\\/]* | ?:[\\/]* ) # Absolute path.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir ;;
+ *) # Relative path.
+ ac_srcdir=$ac_top_builddir$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_builddir$srcdir ;;
+esac
+
+# Do not use `cd foo && pwd` to compute absolute paths, because
+# the directories may not exist.
+case `pwd` in
+.) ac_abs_builddir="$ac_dir";;
+*)
+ case "$ac_dir" in
+ .) ac_abs_builddir=`pwd`;;
+ [\\/]* | ?:[\\/]* ) ac_abs_builddir="$ac_dir";;
+ *) ac_abs_builddir=`pwd`/"$ac_dir";;
+ esac;;
+esac
+case $ac_abs_builddir in
+.) ac_abs_top_builddir=${ac_top_builddir}.;;
+*)
+ case ${ac_top_builddir}. in
+ .) ac_abs_top_builddir=$ac_abs_builddir;;
+ [\\/]* | ?:[\\/]* ) ac_abs_top_builddir=${ac_top_builddir}.;;
+ *) ac_abs_top_builddir=$ac_abs_builddir/${ac_top_builddir}.;;
+ esac;;
+esac
+case $ac_abs_builddir in
+.) ac_abs_srcdir=$ac_srcdir;;
+*)
+ case $ac_srcdir in
+ .) ac_abs_srcdir=$ac_abs_builddir;;
+ [\\/]* | ?:[\\/]* ) ac_abs_srcdir=$ac_srcdir;;
+ *) ac_abs_srcdir=$ac_abs_builddir/$ac_srcdir;;
+ esac;;
+esac
+case $ac_abs_builddir in
+.) ac_abs_top_srcdir=$ac_top_srcdir;;
+*)
+ case $ac_top_srcdir in
+ .) ac_abs_top_srcdir=$ac_abs_builddir;;
+ [\\/]* | ?:[\\/]* ) ac_abs_top_srcdir=$ac_top_srcdir;;
+ *) ac_abs_top_srcdir=$ac_abs_builddir/$ac_top_srcdir;;
+ esac;;
+esac
+
+
+
+ if test x"$ac_file" != x-; then
+ { echo "$as_me:$LINENO: creating $ac_file" >&5
+echo "$as_me: creating $ac_file" >&6;}
+ rm -f "$ac_file"
+ fi
+ # Let's still pretend it is `configure' which instantiates (i.e., don't
+ # use $as_me), people would be surprised to read:
+ # /* config.h. Generated by config.status. */
+ if test x"$ac_file" = x-; then
+ configure_input=
+ else
+ configure_input="$ac_file. "
+ fi
+ configure_input=$configure_input"Generated from `echo $ac_file_in |
+ sed 's,.*/,,'` by configure."
+
+ # First look for the input files in the build tree, otherwise in the
+ # src tree.
+ ac_file_inputs=`IFS=:
+ for f in $ac_file_in; do
+ case $f in
+ -) echo $tmp/stdin ;;
+ [\\/$]*)
+ # Absolute (can't be DOS-style, as IFS=:)
+ test -f "$f" || { { echo "$as_me:$LINENO: error: cannot find input file: $f" >&5
+echo "$as_me: error: cannot find input file: $f" >&2;}
+ { (exit 1); exit 1; }; }
+ echo "$f";;
+ *) # Relative
+ if test -f "$f"; then
+ # Build tree
+ echo "$f"
+ elif test -f "$srcdir/$f"; then
+ # Source tree
+ echo "$srcdir/$f"
+ else
+ # /dev/null tree
+ { { echo "$as_me:$LINENO: error: cannot find input file: $f" >&5
+echo "$as_me: error: cannot find input file: $f" >&2;}
+ { (exit 1); exit 1; }; }
+ fi;;
+ esac
+ done` || { (exit 1); exit 1; }
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF
+ sed "$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s, at configure_input@,$configure_input,;t t
+s, at srcdir@,$ac_srcdir,;t t
+s, at abs_srcdir@,$ac_abs_srcdir,;t t
+s, at top_srcdir@,$ac_top_srcdir,;t t
+s, at abs_top_srcdir@,$ac_abs_top_srcdir,;t t
+s, at builddir@,$ac_builddir,;t t
+s, at abs_builddir@,$ac_abs_builddir,;t t
+s, at top_builddir@,$ac_top_builddir,;t t
+s, at abs_top_builddir@,$ac_abs_top_builddir,;t t
+" $ac_file_inputs | (eval "$ac_sed_cmds") >$tmp/out
+ rm -f $tmp/stdin
+ if test x"$ac_file" != x-; then
+ mv $tmp/out $ac_file
+ else
+ cat $tmp/out
+ rm -f $tmp/out
+ fi
+
+done
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+
+{ (exit 0); exit 0; }
+_ACEOF
+chmod +x $CONFIG_STATUS
+ac_clean_files=$ac_clean_files_save
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded. So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status. When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+ ac_cs_success=:
+ ac_config_status_args=
+ test "$silent" = yes &&
+ ac_config_status_args="$ac_config_status_args --quiet"
+ exec 5>/dev/null
+ $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+ exec 5>>config.log
+ # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+ # would make configure fail if this is the last instruction.
+ $ac_cs_success || { (exit 1); exit 1; }
+fi
+
diff --git a/configure.in b/configure.in
new file mode 100644
index 0000000..9cccaff
--- /dev/null
+++ b/configure.in
@@ -0,0 +1,10 @@
+dnl Autoconf input for rds-tools; autoconf turns this file into ./configure.
+dnl Autoconf 2.55 or newer is required.
+AC_PREREQ(2.55)
+AC_INIT()
+
+dnl Package version and release numbers, exported below so that configure
+dnl substitutes them into the generated output files.
+VERSION=1.4
+RELEASE=1
+
+AC_SUBST(VERSION)
+AC_SUBST(RELEASE)
+
+dnl Output files, each produced from its .in template.
+AC_OUTPUT(Makefile rds-tools.spec)
diff --git a/docs/rds-architecture.txt b/docs/rds-architecture.txt
new file mode 100644
index 0000000..c67077c
--- /dev/null
+++ b/docs/rds-architecture.txt
@@ -0,0 +1,356 @@
+
+Overview
+========
+
+This readme tries to provide some background on the hows and whys of RDS,
+and will hopefully help you find your way around the code.
+
+In addition, please see this email about RDS origins:
+http://oss.oracle.com/pipermail/rds-devel/2007-November/000228.html
+
+RDS Architecture
+================
+
+RDS provides reliable, ordered datagram delivery by using a single
+reliable connection between any two nodes in the cluster. This allows
+applications to use a single socket to talk to any other process in the
+cluster - so in a cluster with N processes you need N sockets, in contrast
+to N*N if you use a connection-oriented socket transport like TCP.
+
+RDS is not Infiniband-specific; it was designed to support different
+transports. The current implementation used to support RDS over TCP as well
+as IB. Work is in progress to support RDS over iWARP, and using DCE to
+guarantee no dropped packets on Ethernet, it may be possible to use RDS over
+UDP in the future.
+
+The high-level semantics of RDS from the application's point of view are
+
+ * Addressing
+ RDS uses IPv4 addresses and 16bit port numbers to identify
+ the end point of a connection. All socket operations that involve
+ passing addresses between kernel and user space generally
+ use a struct sockaddr_in.
+
+ The fact that IPv4 addresses are used does not mean the underlying
+ transport has to be IP-based. In fact, RDS over IB uses a
+ reliable IB connection; the IP address is used exclusively to
+ locate the remote node's GID (by ARPing for the given IP).
+
+ The port space is entirely independent of UDP, TCP or any other
+ protocol.
+
+ * Socket interface
+ RDS sockets work *mostly* as you would expect from a BSD
+ socket. The next section will cover the details. At any rate,
+ all I/O is performed through the standard BSD socket API.
+ Some additions like zerocopy support are implemented through
+ control messages, while other extensions use the getsockopt/
+ setsockopt calls.
+
+ Sockets must be bound before you can send or receive data.
+ This is needed because binding also selects a transport and
+ attaches it to the socket. Once bound, the transport assignment
+ does not change. RDS will tolerate IPs moving around (e.g. in
+ an active-active HA scenario), but only as long as the address
+ doesn't move to a different transport.
+
+ * sysctls
+ RDS supports a number of sysctls in /proc/sys/net/rds
+
+
+Socket Interface
+================
+
+ AF_RDS, PF_RDS, SOL_RDS
+ These constants haven't been assigned yet, because RDS isn't in
+ mainline yet. Currently, the kernel module assigns some constant
+ and publishes it to user space through two sysctl files
+ /proc/sys/net/rds/pf_rds
+ /proc/sys/net/rds/sol_rds
+
+ fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
+ This creates a new, unbound RDS socket.
+
+ setsockopt(SOL_SOCKET): send and receive buffer size
+ RDS honors the send and receive buffer size socket options.
+ You are not allowed to queue more than SO_SNDBUF bytes to
+ a socket. A message is queued when sendmsg is called, and
+ it leaves the queue when the remote system acknowledges
+ its arrival.
+
+ The SO_RCVBUF option controls the maximum receive queue length.
+ This is a soft limit rather than a hard limit - RDS will
+ continue to accept and queue incoming messages, even if that
+ takes the queue length over the limit. However, it will also
+ mark the port as "congested" and send a congestion update to
+ the source node. The source node is supposed to throttle any
+ processes sending to this congested port.
+
+ bind(fd, &sockaddr_in, ...)
+ This binds the socket to a local IP address and port, and a
+ transport.
+
+ sendmsg(fd, ...)
+ Sends a message to the indicated recipient. The kernel will
+ transparently establish the underlying reliable connection
+ if it isn't up yet.
+
+ An attempt to send a message that exceeds SO_SNDBUF will
+ return with -EMSGSIZE.
+
+ An attempt to send a message that would take the total number
+ of queued bytes over the SO_SNDBUF threshold will return
+ EAGAIN.
+
+ An attempt to send a message to a destination that is marked
+ as "congested" will return ENOBUFS.
+
+ recvmsg(fd, ...)
+ Receives a message that was queued to this socket. The socket's
+ recv queue accounting is adjusted, and if the queue length
+ drops below SO_RCVBUF, the port is marked uncongested, and
+ a congestion update is sent to all peers.
+
+ Applications can ask the RDS kernel module to receive
+ notifications via control messages (for instance, there is a
+ notification when a congestion update arrives, or when an RDMA
+ operation completes). These notifications are received through
+ the msg.msg_control buffer of struct msghdr. The format of the
+ messages is described in manpages.
+
+ poll(fd)
+ RDS supports the poll interface to allow the application
+ to implement async I/O.
+
+ POLLIN handling is pretty straightforward. When there's an
+ incoming message queued to the socket, or a pending notification,
+ we signal POLLIN.
+
+ POLLOUT is a little harder. Since you can essentially send
+ to any destination, RDS will always signal POLLOUT as long as
+ there's room on the send queue (ie the number of bytes queued
+ is less than the sendbuf size).
+
+ However, the kernel will refuse to accept messages to
+ a destination marked congested - in this case you will loop
+ forever if you rely on poll to tell you what to do.
+ This isn't a trivial problem, but applications can deal with
+ this - by using congestion notifications, and by checking for
+ ENOBUFS errors returned by sendmsg.
+
+ setsockopt(SOL_RDS, RDS_CANCEL_SENT_TO, &sockaddr_in)
+ This allows the application to discard all messages queued to a
+ specific destination on this particular socket.
+
+ This allows the application to cancel outstanding messages if
+ it detects a timeout. For instance, if it tried to send a message,
+ and the remote host is unreachable, RDS will keep trying forever.
+ The application may decide it's not worth it, and cancel the
+ operation. In this case, it would use RDS_CANCEL_SENT_TO to
+ nuke any pending messages.
+
+
+RDMA for RDS
+============
+
+ see rds-rdma(7) manpage (available in rds-tools)
+
+
+Congestion Notifications
+========================
+
+ see rds(7) manpage
+
+
+RDS Protocol
+============
+
+ Message header
+
+ The message header is a 'struct rds_header' (see rds.h):
+ Fields:
+ h_sequence:
+ per-packet sequence number
+ h_ack:
+ piggybacked acknowledgment of last packet received
+ h_len:
+ length of data, not including header
+ h_sport:
+ source port
+ h_dport:
+ destination port
+ h_flags:
+ CONG_BITMAP - this is a congestion update bitmap
+ ACK_REQUIRED - receiver must ack this packet
+ RETRANSMITTED - packet has previously been sent
+ h_credit:
+ indicate to other end of connection that
+ it has more credits available (i.e. there is
+ more send room)
+ h_padding[4]:
+ unused, for future use
+ h_csum:
+ header checksum
+ h_exthdr:
+ optional data can be passed here. This is currently used for
+ passing RDMA-related information.
+
+ ACK and retransmit handling
+
+ One might think that with reliable IB connections you wouldn't need
+ to ack messages that have been received. The problem is that IB
+ hardware generates an ack message before it has DMAed the message
+ into memory. This creates a potential message loss if the HCA is
+ disabled for any reason between when it sends the ack and before
+ the message is DMAed and processed. This is only a potential issue
+ if another HCA is available for fail-over.
+
+ Sending an ack immediately would allow the sender to free the sent
+ message from their send queue quickly, but could cause excessive
+ traffic to be used for acks. RDS piggybacks acks on sent data
+ packets. Ack-only packets are reduced by only allowing one to be
+ in flight at a time, and by the sender only asking for acks when
+ its send buffers start to fill up. All retransmissions are also
+ acked.
+
+ Flow Control
+
+ RDS's IB transport uses a credit-based mechanism to verify that
+ there is space in the peer's receive buffers for more data. This
+ eliminates the need for hardware retries on the connection.
+
+ Congestion
+
+ Messages waiting in the receive queue on the receiving socket
+ are accounted against the socket's SO_RCVBUF option value. Only
+ the payload bytes in the message are accounted for. If the
+ number of bytes queued equals or exceeds rcvbuf then the socket
+ is congested. All sends attempted to this socket's address
+ should block or return -EWOULDBLOCK.
+
+ Applications are expected to be reasonably tuned such that this
+ situation very rarely occurs. An application encountering this
+ "back-pressure" is considered a bug.
+
+ This is implemented by having each node maintain bitmaps which
+ indicate which ports on bound addresses are congested. As the
+ bitmap changes it is sent through all the connections which
+ terminate in the local address of the bitmap which changed.
+
+ The bitmaps are allocated as connections are brought up. This
+ avoids allocation in the interrupt handling path which queues
+ messages on sockets. The dense bitmaps let transports send the
+ entire bitmap on any bitmap change reasonably efficiently. This
+ is much easier to implement than some finer-grained
+ communication of per-port congestion. The sender does a very
+ inexpensive bit test to test if the port it's about to send to
+ is congested or not.
+
+
+RDS Transport Layer
+===================
+
+ As mentioned above, RDS is not IB-specific. Its code is divided
+ into a general RDS layer and a transport layer.
+
+ The general layer handles the socket API, congestion handling,
+ loopback, stats, usermem pinning, and the connection state machine.
+
+ The transport layer handles the details of the transport. The IB
+ transport, for example, handles all the queue pairs, work requests,
+ CM event handlers, and other Infiniband details.
+
+
+RDS Kernel Structures
+=====================
+
+ struct rds_message
+ aka possibly "rds_outgoing", the generic RDS layer copies data to
+ be sent and sets header fields as needed, based on the socket API.
+ This is then queued for the individual connection and sent by the
+ connection's transport.
+ struct rds_incoming
+ a generic struct referring to incoming data that can be handed from
+ the transport to the general code and queued by the general code
+ while the socket is awoken. It is then passed back to the transport
+ code to handle the actual copy-to-user.
+ struct rds_socket
+ per-socket information
+ struct rds_connection
+ per-connection information
+ struct rds_transport
+ pointers to transport-specific functions
+ struct rds_statistics
+ non-transport-specific statistics
+ struct rds_cong_map
+ wraps the raw congestion bitmap, contains rbnode, waitq, etc.
+
+Connection management
+=====================
+
+ Connections may be in UP, DOWN, CONNECTING, DISCONNECTING, and
+ ERROR states.
+
+ The first time an attempt is made by an RDS socket to send data to
+ a node, a connection is allocated and connected. That connection is
+ then maintained forever -- if there are transport errors, the
+ connection will be dropped and re-established.
+
+ Dropping a connection while packets are queued will cause queued or
+ partially-sent datagrams to be retransmitted when the connection is
+ re-established.
+
+
+The send path
+=============
+
+ rds_sendmsg()
+ struct rds_message built from incoming data
+ CMSGs parsed (e.g. RDMA ops)
+ transport connection alloced and connected if not already
+ rds_message placed on send queue
+ send worker awoken
+ rds_send_worker()
+ calls rds_send_xmit() until queue is empty
+ rds_send_xmit()
+ transmits congestion map if one is pending
+ may set ACK_REQUIRED
+ calls transport to send either non-RDMA or RDMA message
+ (RDMA ops never retransmitted)
+ rds_ib_xmit()
+ allocs work requests from send ring
+ adds any new send credits available to peer (h_credit)
+ maps the rds_message's sg list
+ piggybacks ack
+ populates work requests
+ post send to connection's queue pair
+
+The recv path
+=============
+
+ rds_ib_recv_cq_comp_handler()
+ looks at write completions
+ unmaps recv buffer from device
+ no errors, call rds_ib_process_recv()
+ refill recv ring
+ rds_ib_process_recv()
+ validate header checksum
+ copy header to rds_ib_incoming struct if start of a new datagram
+ add to ibinc's fraglist
+ if completed datagram:
+ update cong map if datagram was cong update
+ call rds_recv_incoming() otherwise
+ note if ack is required
+ rds_recv_incoming()
+ drop duplicate packets
+ respond to pings
+ find the sock associated with this datagram
+ add to sock queue
+ wake up sock
+ do some congestion calculations
+ rds_recvmsg
+ copy data into user iovec
+ handle CMSGs
+ return to application
+
+
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644
index 0000000..ef35c1f
--- /dev/null
+++ b/examples/Makefile
@@ -0,0 +1,6 @@
+
+# Default target: build the RDS sample program.
+all: rds-sample
+
+# No recipe is given; make's built-in rules compile rds-sample.c and link it.
+rds-sample: rds-sample.o
+
+# rds-sample.c includes "ib_rds.h"; point the compiler at the directory holding it.
+CFLAGS = -I ../net
diff --git a/examples/README b/examples/README
new file mode 100644
index 0000000..3433656
--- /dev/null
+++ b/examples/README
@@ -0,0 +1,6 @@
+The source in this directory is meant to serve as an aid for
+becoming familiar with RDS socket programming.
+
+Questions about this or other rds-tools code are welcomed on the
+rds-devel list: http://oss.oracle.com/mailman/listinfo/rds-devel
+
diff --git a/examples/rds-sample.c b/examples/rds-sample.c
new file mode 100644
index 0000000..b7cd325
--- /dev/null
+++ b/examples/rds-sample.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2008 Chelsio, Inc. All rights reserved.
+ *
+ * Author: Jon Mason <jon at opengridcomputing.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+/* FIXME - this is a hack to get around RDS not exporting any header files.
+ * This is a local copy.
+ */
+#include "ib_rds.h"
+/* These are defined in rds.h....but that file is not happily included */
+#define SOL_RDS 272
+#define PF_RDS 28
+
+
+#define TESTPORT 4000
+#define BUFSIZE 94
+
+/*
+ * Issue the RDMA operation described by an RDS_CMSG_RDMA_ARGS control
+ * message (a read, going by the function name), using buf as the single
+ * local segment, then collect the follow-up message from the socket.
+ *
+ * sock: RDS socket the request is sent on
+ * msg:  message header that is reused for both the sendmsg() and the
+ *       recvmsg() call; its control buffer is overwritten and must have
+ *       room for CMSG_SPACE(sizeof(struct rds_rdma_args)) bytes
+ * buf:  local buffer of at least BUFSIZE bytes
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int do_rdma_read(int sock, struct msghdr *msg, void *buf)
+{
+	struct rds_rdma_args *args;
+	struct rds_iovec iov;
+	struct cmsghdr *cmsg;
+	int rc;
+
+	/*
+	 * CMSG_FIRSTHDR() returns NULL when the control data is too short to
+	 * hold a header, so it must be checked before being written through.
+	 */
+	cmsg = CMSG_FIRSTHDR(msg);
+	if (!cmsg) {
+		printf("%s: no control message header available\n", __func__);
+		return -1;
+	}
+	args = (struct rds_rdma_args *)CMSG_DATA(cmsg);
+
+	/* Do a sendmsg call to perform the RDMA */
+	cmsg->cmsg_level = SOL_RDS;
+	cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct rds_rdma_args));
+
+	/* Go through an integer type first so 32-bit builds convert cleanly. */
+	iov.addr = (uint64_t)(unsigned long) buf;
+	iov.bytes = BUFSIZE * sizeof(char);
+
+	args->remote_vec.addr = 0;
+	args->remote_vec.bytes = BUFSIZE * sizeof(char);
+	args->local_vec_addr = (uint64_t)(unsigned long) &iov;
+	args->nr_local = 1;
+	args->flags = RDS_RDMA_NOTIFY_ME;
+	args->user_token = 0;
+
+	msg->msg_controllen = CMSG_SPACE(sizeof(struct rds_rdma_args));
+
+	rc = sendmsg(sock, msg, 0);
+	if (rc < 0) {
+		printf("%s: Error sending message: %d %d\n", __func__, rc, errno);
+		return -1;
+	}
+
+	/*
+	 * NOTE(review): presumably gives the transfer time to finish before
+	 * the next message is read - confirm whether this is still needed.
+	 */
+	sleep(1);
+
+	rc = recvmsg(sock, msg, 0);
+	if (rc < 0) {
+		printf("%s: Error receiving message: %d %d\n", __func__, rc, errno);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Bind an RDS socket to address (port TESTPORT), wait for one message and
+ * print its payload. If the message arrived with control data, an RDMA
+ * transfer into the receive buffer is attempted via do_rdma_read() before
+ * the payload is printed.
+ *
+ * address: dotted-quad IPv4 address of the local interface to bind to
+ *
+ * Errors are reported on stdout; every resource acquired here is released
+ * before returning.
+ */
+static void server(char *address)
+{
+	struct sockaddr_in sin, din;
+	void *buf, *ctlbuf = NULL;
+	struct iovec *iov = NULL;
+	struct msghdr msg;
+	int rc, sock;
+
+	/*
+	 * One extra byte that is never written to: it stays zero, so the
+	 * payload is always NUL-terminated when printed with %s below.
+	 */
+	buf = calloc(BUFSIZE + 1, sizeof(char));
+	if (!buf) {
+		printf("%s: calloc failed\n", __func__);
+		return;
+	}
+
+	sock = socket(PF_RDS, SOCK_SEQPACKET, 0);
+	if (sock < 0) {
+		printf("%s: Error creating Socket: %d\n", __func__, sock);
+		goto out;
+	}
+
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = inet_addr(address);
+	/* NOTE(review): stored without htons(); client() does the same, so both sides agree. */
+	sin.sin_port = TESTPORT;
+
+	rc = bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	if (rc < 0) {
+		printf("%s: Error binding to address: %d %d\n", __func__, rc, errno);
+		goto out;
+	}
+
+	/* The recv iov could contain a regular RDS packet or an RDMA RDS
+	 * packet, so set it up for the worst case for both.
+	 */
+	iov = calloc(1, sizeof(struct iovec));
+	if (!iov) {
+		printf("%s: calloc failed\n", __func__);
+		goto out;
+	}
+
+	/*
+	 * The buffer must cover the cmsghdr and alignment padding as well as
+	 * the payload, i.e. exactly what msg_controllen advertises below.
+	 */
+	ctlbuf = calloc(1, CMSG_SPACE(sizeof(struct rds_rdma_args)));
+	if (!ctlbuf) {
+		printf("%s: calloc failed\n", __func__);
+		goto out;
+	}
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = BUFSIZE * sizeof(char);
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_name = &din;
+	msg.msg_namelen = sizeof(din);
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = ctlbuf;
+	msg.msg_controllen = CMSG_SPACE(sizeof(struct rds_rdma_args));
+
+	printf("server listening on %s\n", inet_ntoa(sin.sin_addr));
+
+	rc = recvmsg(sock, &msg, 0);
+	if (rc < 0) {
+		printf("%s: Error receiving message: %d %d\n", __func__, rc, errno);
+		goto out;
+	}
+
+	printf("Received a packet len %d, cmsg len %d, on port %d\n",
+	       (uint32_t) iov[0].iov_len,
+	       (uint32_t) msg.msg_controllen,
+	       din.sin_port);
+
+	/* Control data on the received message triggers the RDMA step. */
+	if (msg.msg_controllen) {
+		rc = do_rdma_read(sock, &msg, buf);
+		if (rc < 0)
+			goto out;
+	}
+	printf("payload contains: %s\n", (char *)buf);
+
+out:
+	/* free(NULL) is a no-op, so one exit path serves every failure point. */
+	free(ctlbuf);
+	free(iov);
+	if (sock >= 0)
+		close(sock);
+	free(buf);
+}
+
+/*
+ * Fill the first BUFSIZE bytes of buf with consecutive byte values,
+ * starting at 0x21. No terminating NUL is written.
+ */
+static void create_message(char *buf)
+{
+	char *end = buf + BUFSIZE;
+	int value = 0x21;
+
+	while (buf < end)
+		*buf++ = value++;
+}
+
+/*
+ * Prepare msg as a payload-less message carrying a single
+ * RDS_CMSG_RDMA_MAP control message that describes buf.
+ *
+ * sock:   unused; kept so existing callers need no change
+ * msg:    message header to fill in; msg_control is set to a buffer
+ *         allocated here, which the caller must free()
+ * buf:    BUFSIZE-byte buffer described by the control message
+ * cookie: location whose address is handed over as cookie_addr
+ *
+ * Returns 0 on success, -1 if the control buffer cannot be allocated
+ * (msg is left untouched in that case).
+ */
+static int build_rds_rdma_packet(int sock, struct msghdr *msg, void *buf, uint64_t *cookie)
+{
+	struct rds_get_mr_args mr_args;
+	struct cmsghdr *cmsg;
+	void *ctlbuf;
+
+	(void) sock;
+
+	ctlbuf = calloc(1, CMSG_SPACE(sizeof(mr_args)));
+	if (!ctlbuf) {
+		printf("%s: calloc failed\n", __func__);
+		return -1;
+	}
+
+	/*
+	 * Zero the whole structure first so that no indeterminate bytes
+	 * (padding, or members not assigned below) get copied into the
+	 * control message.
+	 */
+	memset(&mr_args, 0, sizeof(mr_args));
+	/* Go through an integer type first so 32-bit builds convert cleanly. */
+	mr_args.vec.addr = (uint64_t)(unsigned long) buf;
+	mr_args.vec.bytes = BUFSIZE * sizeof(char);
+	mr_args.cookie_addr = (uint64_t)(unsigned long) cookie;
+	mr_args.flags = RDS_RDMA_READWRITE;
+
+	msg->msg_control = ctlbuf;
+	msg->msg_controllen = CMSG_SPACE(sizeof(mr_args));
+
+	cmsg = CMSG_FIRSTHDR(msg);
+	cmsg->cmsg_level = SOL_RDS;
+	cmsg->cmsg_type = RDS_CMSG_RDMA_MAP;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(mr_args));
+	memcpy(CMSG_DATA(cmsg), &mr_args, sizeof(mr_args));
+
+	/* The message carries control data only, no payload. */
+	msg->msg_iov = NULL;
+	msg->msg_iovlen = 0;
+
+	return 0;
+}
+
+/*
+ * Attach buf to msg as a single BUFSIZE-byte I/O vector.
+ *
+ * The vector is allocated here and stored in msg->msg_iov. Returns 0 on
+ * success, or -1 (leaving msg untouched) if the allocation fails.
+ */
+static int build_rds_packet(struct msghdr *msg, char *buf)
+{
+	struct iovec *vec = calloc(1, sizeof(struct iovec));
+
+	if (vec == NULL) {
+		printf("%s: calloc failed\n", __func__);
+		return -1;
+	}
+
+	vec->iov_base = buf;
+	vec->iov_len = BUFSIZE * sizeof(char);
+
+	msg->msg_iov = vec;
+	msg->msg_iovlen = 1;
+
+	return 0;
+}
+
+/*
+ * Send one test message from localaddr to remoteaddr (port TESTPORT).
+ *
+ * localaddr:  dotted-quad IPv4 address to bind the RDS socket to
+ * remoteaddr: dotted-quad IPv4 address of the server
+ * rdma:       non-zero sends a control-only message built by
+ *             build_rds_rdma_packet() and then waits for one incoming
+ *             message; zero sends the buffer as an ordinary payload
+ *
+ * Errors are reported on stdout; every resource acquired here is released
+ * before returning.
+ */
+static void client(char *localaddr, char *remoteaddr, int rdma)
+{
+	struct sockaddr_in sin, din;
+	struct msghdr msg;
+	uint64_t cookie = 0;
+	int rc, sock;
+	void *buf;
+
+	/* Cleared up front so the cleanup path can free its members on any exit. */
+	memset(&msg, 0, sizeof(msg));
+
+	/*
+	 * create_message() fills all BUFSIZE bytes; the extra zeroed byte
+	 * keeps the buffer NUL-terminated for the %s below.
+	 */
+	buf = calloc(BUFSIZE + 1, sizeof(char));
+	if (!buf) {
+		printf("%s: calloc failed\n", __func__);
+		return;
+	}
+
+	create_message((char *)buf);
+
+	sock = socket(PF_RDS, SOCK_SEQPACKET, 0);
+	if (sock < 0) {
+		printf("%s: Error creating Socket: %d\n", __func__, sock);
+		goto out;
+	}
+
+	/* The local port is left at zero. */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = inet_addr(localaddr);
+
+	rc = bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	if (rc < 0) {
+		printf("%s: Error binding to address: %d %d\n", __func__, rc, errno);
+		goto out;
+	}
+
+	msg.msg_name = &din;
+	msg.msg_namelen = sizeof(din);
+
+	memset(&din, 0, sizeof(din));
+	din.sin_family = AF_INET;
+	din.sin_addr.s_addr = inet_addr(remoteaddr);
+	/* NOTE(review): stored without htons(); server() does the same, so both sides agree. */
+	din.sin_port = TESTPORT;
+
+	if (rdma) {
+		rc = build_rds_rdma_packet(sock, &msg, buf, &cookie);
+		if (rc < 0)
+			goto out;
+
+		printf("Client Sending RDMA message from %s to %s\n",
+		       localaddr, remoteaddr);
+	} else {
+		rc = build_rds_packet(&msg, buf);
+		if (rc < 0)
+			goto out;
+
+		/* Report the destination port; the local one was never set. */
+		printf("client sending %d byte message %s from %s to %s on port %d\n",
+		       (uint32_t) msg.msg_iov->iov_len,
+		       (char *)buf,
+		       localaddr,
+		       remoteaddr,
+		       din.sin_port);
+	}
+
+	rc = sendmsg(sock, &msg, 0);
+	if (rc < 0) {
+		printf("%s: Error sending message: %d %d\n", __func__, rc, errno);
+		goto out;
+	}
+
+	if (rdma) {
+		/* reuse the same msg, as it should no longer be necessary and this incoming
+		 * msg should be empty
+		 */
+		rc = recvmsg(sock, &msg, 0);
+		if (rc < 0) {
+			printf("%s: Error receiving message: %d %d\n", __func__, rc, errno);
+		}
+	}
+
+out:
+	/* free(NULL) is a no-op, so unset members need no guard. */
+	free(msg.msg_control);
+	free(msg.msg_iov);
+	if (sock >= 0)
+		close(sock);
+	free(buf);
+}
+
+/*
+ * Entry point.
+ *
+ *   rds-sample -s <server-addr>                           run as server
+ *   rds-sample -s <server-addr> -c <client-addr> [--rdma] run as client
+ *
+ * Returns 0 after the selected role has run, and -1 when the command
+ * line is unusable.
+ */
+int main(int argc, char **argv)
+{
+	char *serveraddr = NULL, *clientaddr = NULL;
+	int i, rdma = 0;
+
+	if (argc < 3) {
+		printf("not enough args\n");
+		return -1;
+	}
+
+	for (i = 1; i < argc; i++) {
+		if (!strcmp("-s", argv[i]) || !strcmp("-c", argv[i])) {
+			/*
+			 * Both options take a value. argv[argc] is NULL, so
+			 * a trailing option shows up as a NULL value here.
+			 */
+			if (!argv[i + 1]) {
+				printf("%s requires an address\n", argv[i]);
+				return -1;
+			}
+			if (argv[i][1] == 's')
+				serveraddr = argv[i + 1];
+			else
+				clientaddr = argv[i + 1];
+			i++;
+		} else if (!strcmp("--rdma", argv[i])) {
+			rdma = 1;
+		} else
+			printf("Invalid param\n");
+	}
+
+	/* Both roles need the server address; say so instead of doing nothing. */
+	if (!serveraddr) {
+		printf("no server address given (-s)\n");
+		return -1;
+	}
+
+	if (clientaddr)
+		client(clientaddr, serveraddr, rdma);
+	else
+		server(serveraddr);
+
+	return 0;
+}
+
diff --git a/kernel-list.h b/kernel-list.h
new file mode 100644
index 0000000..9e25b59
--- /dev/null
+++ b/kernel-list.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+/*
+ * Link node embedded in every object that lives on a list.  A list head
+ * is a node of the same type; an empty list is a head whose next and
+ * prev both point back at the head itself.
+ */
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+/* Static initializer that makes "name" an empty list. */
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+/* Define "name" as an empty list head. */
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+/* Run-time counterpart of LIST_HEAD_INIT: reset "ptr" to an empty list. */
+#define INIT_LIST_HEAD(ptr) do { \
+	(ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+#if (!defined(__GNUC__) && !defined(__WATCOMC__))
+#define __inline__
+#endif
+
+/*
+ * Link "new" in between "prev" and "next", which must already be
+ * adjacent.
+ *
+ * Internal helper for callers that know both neighbours already.
+ */
+static __inline__ void __list_add(struct list_head * new,
+				  struct list_head * prev,
+				  struct list_head * next)
+{
+	/* Point the new node at its neighbours first ... */
+	new->prev = prev;
+	new->next = next;
+
+	/* ... then hook it into the chain from both sides. */
+	prev->next = new;
+	next->prev = new;
+}
+
+/*
+ * Insert a new entry right after the specified head, i.e. at the
+ * front of the list.
+ */
+static __inline__ void list_add(struct list_head *new, struct list_head *head)
+{
+	struct list_head *first = head->next;
+
+	__list_add(new, head, first);
+}
+
+/*
+ * Insert a new entry right before the specified head, i.e. at the
+ * tail of the list.
+ */
+static __inline__ void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	struct list_head *last = head->prev;
+
+	__list_add(new, last, head);
+}
+
+/*
+ * Drop whatever sits between "prev" and "next" by making the two
+ * point at each other.
+ *
+ * Internal helper for callers that know both neighbours already.
+ */
+static __inline__ void __list_del(struct list_head * prev,
+				  struct list_head * next)
+{
+	prev->next = next;
+	next->prev = prev;
+}
+
+/*
+ * Unlink "entry" from the list it is on.  The entry's own next/prev
+ * pointers are left untouched, so they still reference its former
+ * neighbours afterwards.
+ */
+static __inline__ void list_del(struct list_head *entry)
+{
+	struct list_head *before = entry->prev;
+	struct list_head *after = entry->next;
+
+	__list_del(before, after);
+}
+
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ *
+ * Declared with __inline__, like the other functions in this header,
+ * so that the "#define __inline__" fallback for compilers that are
+ * neither GCC nor Watcom covers this function as well.
+ */
+static __inline__ void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ *
+ * Declared with __inline__, like the other functions in this header,
+ * so that the "#define __inline__" fallback for compilers that are
+ * neither GCC nor Watcom covers this function as well.
+ */
+static __inline__ void list_move(struct list_head *list, struct list_head *head)
+{
+	__list_del(list->prev, list->next);
+	list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ *
+ * Declared with __inline__, like the other functions in this header,
+ * so that the "#define __inline__" fallback for compilers that are
+ * neither GCC nor Watcom covers this function as well.
+ */
+static __inline__ void list_move_tail(struct list_head *list,
+				      struct list_head *head)
+{
+	__list_del(list->prev, list->next);
+	list_add_tail(list, head);
+}
+
+/*
+ * Return non-zero when the list is empty, i.e. when the head's next
+ * pointer refers back to the head itself.  The head is only read, so
+ * it is taken as a pointer to const and read-only lists can be tested
+ * too.
+ */
+static __inline__ int list_empty(const struct list_head *head)
+{
+	return head->next == head;
+}
+
+/*
+ * Splice the entries of "list" into "head", right after the head node.
+ * "list" itself is not reinitialized and still points at the moved
+ * entries afterwards.  Nothing happens when "list" is empty.
+ */
+static __inline__ void list_splice(struct list_head *list, struct list_head *head)
+{
+	struct list_head *first = list->next;
+	struct list_head *last, *at;
+
+	if (first == list)
+		return;
+
+	last = list->prev;
+	at = head->next;
+
+	/* Front of the spliced run attaches to the head ... */
+	head->next = first;
+	first->prev = head;
+
+	/* ... and its end to what used to follow the head. */
+	at->prev = last;
+	last->next = at;
+}
+
+/*
+ * Map a pointer to an embedded list_head back to the structure that
+ * contains it.
+ * @ptr:	the struct list_head pointer.
+ * @type:	the type of the containing struct.
+ * @member:	the name of the list_head field within that struct.
+ */
+#define list_entry(ptr, type, member) \
+	((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/* Iterate over the raw list_head nodes of a list. */
+#define list_for_each(pos, head) \
+	for ((pos) = (head)->next; (pos) != (head); (pos) = (pos)->next)
+
+/*
+ * Like list_for_each, but the successor is saved in "n" before the
+ * loop body runs, so the body may unlink "pos".
+ */
+#define list_for_each_safe(pos, n, head) \
+	for ((pos) = (head)->next, (n) = (pos)->next; (pos) != (head); \
+		(pos) = (n), (n) = (pos)->next)
+
+/**
+ * list_for_each_entry - iterate over list of given type
+ * @pos:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * __typeof__ is used rather than typeof so that the macro also builds
+ * when GCC or Clang run in a strict ISO mode (e.g. -std=c11), where
+ * the plain typeof keyword is not available.
+ */
+#define list_for_each_entry(pos, head, member)				\
+	for ((pos) = list_entry((head)->next, __typeof__(*(pos)), member);	\
+	     &(pos)->member != (head);					\
+	     (pos) = list_entry((pos)->member.next, __typeof__(*(pos)), member))
+
+#endif
diff --git a/net/ib_rds.h b/net/ib_rds.h
new file mode 100644
index 0000000..992139c
--- /dev/null
+++ b/net/ib_rds.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2008 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef IB_RDS_H
+#define IB_RDS_H
+
+#include <linux/types.h>
+
+/* These sparse annotated types shouldn't be in any user
+ * visible header file. We should clean this up rather
+ * than kludging around them. */
+#ifndef __KERNEL__
+#define __be16 u_int16_t
+#define __be32 u_int32_t
+#define __be64 u_int64_t
+#endif
+
+#define RDS_IB_ABI_VERSION 0x301
+
+/*
+ * setsockopt/getsockopt for SOL_RDS
+ */
+#define RDS_CANCEL_SENT_TO 1
+#define RDS_GET_MR 2
+#define RDS_FREE_MR 3
+/* deprecated: RDS_BARRIER 4 */
+#define RDS_RECVERR 5
+#define RDS_CONG_MONITOR 6
+
+/*
+ * Control message types for SOL_RDS.
+ *
+ * RDS_CMSG_RDMA_ARGS (sendmsg)
+ *	Request a RDMA transfer to/from the specified
+ *	memory ranges.
+ *	The cmsg_data is a struct rds_rdma_args.
+ * RDS_CMSG_RDMA_DEST (recvmsg, sendmsg)
+ *	Kernel informs application about intended
+ *	source/destination of a RDMA transfer
+ * RDS_CMSG_RDMA_MAP (sendmsg)
+ *	Application asks kernel to map the given
+ *	memory range into a IB MR, and send the
+ *	R_Key along in an RDS extension header.
+ *	The cmsg_data is a struct rds_get_mr_args,
+ *	the same as for the GET_MR setsockopt.
+ * RDS_CMSG_RDMA_STATUS (recvmsg)
+ *	Returns the status of a completed RDMA operation.
+ * RDS_CMSG_CONG_UPDATE
+ *	Carries a congestion update to the application;
+ *	see "Congestion monitoring" further down.
+ */
+#define RDS_CMSG_RDMA_ARGS 1
+#define RDS_CMSG_RDMA_DEST 2
+#define RDS_CMSG_RDMA_MAP 3
+#define RDS_CMSG_RDMA_STATUS 4
+#define RDS_CMSG_CONG_UPDATE 5
+
+#define RDS_INFO_COUNTERS 10000
+#define RDS_INFO_CONNECTIONS 10001
+/* 10002 aka RDS_INFO_FLOWS is deprecated */
+#define RDS_INFO_SEND_MESSAGES 10003
+#define RDS_INFO_RETRANS_MESSAGES 10004
+#define RDS_INFO_RECV_MESSAGES 10005
+#define RDS_INFO_SOCKETS 10006
+#define RDS_INFO_TCP_SOCKETS 10007
+#define RDS_INFO_IB_CONNECTIONS 10008
+
+/*
+ * A counter name together with its 64-bit value.  Packed, so there is
+ * no padding between the fields.
+ * NOTE(review): presumably the record format behind RDS_INFO_COUNTERS -- confirm.
+ */
+struct rds_info_counter {
+ u_int8_t name[32];
+ u_int64_t value;
+} __attribute__((packed));
+
+/* Flag bits; presumably for rds_info_connection.flags -- confirm. */
+#define RDS_INFO_CONNECTION_FLAG_SENDING 0x01
+#define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02
+#define RDS_INFO_CONNECTION_FLAG_CONNECTED 0x04
+
+/*
+ * Per-connection record (packed).  laddr/faddr are presumably the
+ * local and foreign address -- confirm.
+ */
+struct rds_info_connection {
+ u_int64_t next_tx_seq;
+ u_int64_t next_rx_seq;
+ __be32 laddr;
+ __be32 faddr;
+ u_int8_t transport[15]; /* null term ascii */
+ u_int8_t flags;
+} __attribute__((packed));
+
+/*
+ * Per-flow record (packed).  The comment next to the RDS_INFO_*
+ * constants calls RDS_INFO_FLOWS (10002) deprecated.
+ */
+struct rds_info_flow {
+ __be32 laddr;
+ __be32 faddr;
+ u_int32_t bytes;
+ __be16 lport;
+ __be16 fport;
+} __attribute__((packed));
+
+/* Flag bits; presumably for rds_info_message.flags -- confirm. */
+#define RDS_INFO_MESSAGE_FLAG_ACK 0x01
+#define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02
+
+/* Per-message record (packed): sequence number, length, endpoints, flags. */
+struct rds_info_message {
+ u_int64_t seq;
+ u_int32_t len;
+ __be32 laddr;
+ __be32 faddr;
+ __be16 lport;
+ __be16 fport;
+ u_int8_t flags;
+} __attribute__((packed));
+
+/*
+ * Per-socket record (packed).  Identical to struct rds_info_socket_v1
+ * below except for the additional trailing inum field.
+ */
+struct rds_info_socket {
+ u_int32_t sndbuf;
+ __be32 bound_addr;
+ __be32 connected_addr;
+ __be16 bound_port;
+ __be16 connected_port;
+ u_int32_t rcvbuf;
+ uint64_t inum;
+} __attribute__((packed));
+
+/* Same fields as struct rds_info_socket, without the trailing inum. */
+struct rds_info_socket_v1 {
+ u_int32_t sndbuf;
+ __be32 bound_addr;
+ __be32 connected_addr;
+ __be16 bound_port;
+ __be16 connected_port;
+ u_int32_t rcvbuf;
+} __attribute__((packed));
+
+/*
+ * Per-TCP-socket record.  Because the struct is packed, the 32-bit
+ * peer_addr directly follows the 16-bit local_port (offset 6), so it
+ * is not naturally aligned.
+ */
+struct rds_info_tcp_socket {
+ __be32 local_addr;
+ __be16 local_port;
+ __be32 peer_addr;
+ __be16 peer_port;
+ u_int64_t hdr_rem;
+ u_int64_t data_rem;
+ u_int32_t last_sent_nxt;
+ u_int32_t last_expected_una;
+ u_int32_t last_seen_una;
+} __attribute__((packed));
+
+/* Size in bytes of the src_gid/dst_gid arrays below. */
+#define RDS_IB_GID_LEN 16
+/*
+ * Per-IB-connection record.  Unlike the other rds_info_* structs in
+ * this header, this one is not declared packed.
+ */
+struct rds_info_ib_connection {
+ __be32 src_addr;
+ __be32 dst_addr;
+ uint8_t src_gid[RDS_IB_GID_LEN];
+ uint8_t dst_gid[RDS_IB_GID_LEN];
+
+ uint32_t max_send_wr;
+ uint32_t max_recv_wr;
+ uint32_t max_send_sge;
+ uint32_t rdma_fmr_max;
+ uint32_t rdma_fmr_size;
+};
+
+/*
+ * Congestion monitoring.
+ * Congestion control in RDS happens at the host connection
+ * level by exchanging a bitmap marking congested ports.
+ * By default, a process sleeping in poll() is always woken
+ * up when the congestion map is updated.
+ * With explicit monitoring, an application can have more
+ * fine-grained control.
+ * The application installs a 64bit mask value in the socket,
+ * where each bit corresponds to a group of ports.
+ * When a congestion update arrives, RDS checks the set of
+ * ports that are now uncongested against the list bit mask
+ * installed in the socket, and if they overlap, we queue a
+ * cong_notification on the socket.
+ *
+ * To install the congestion monitor bitmask, use RDS_CONG_MONITOR
+ * with the 64bit mask.
+ * Congestion updates are received via RDS_CMSG_CONG_UPDATE
+ * control messages.
+ *
+ * The correspondence between bits and ports is
+ * 1 << (portnum % 64)
+ */
+#define RDS_CONG_MONITOR_SIZE	64
+/*
+ * "port" is parenthesized before the cast so that an expression
+ * argument is converted as a whole instead of only its first operand.
+ */
+#define RDS_CONG_MONITOR_BIT(port)  (((unsigned int)(port)) % RDS_CONG_MONITOR_SIZE)
+#define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port))
+
+/*
+ * RDMA related types
+ */
+
+/*
+ * This encapsulates a remote memory location.
+ * In the current implementation, it contains the R_Key
+ * of the remote memory region, and the offset into it
+ * (so that the application does not have to worry about
+ * alignment).
+ */
+typedef u_int64_t rds_rdma_cookie_t;
+
+/* A memory range; presumably start address and length in bytes -- confirm. */
+struct rds_iovec {
+ u_int64_t addr;
+ u_int64_t bytes;
+};
+
+/*
+ * Argument of the GET_MR setsockopt and cmsg_data of RDS_CMSG_RDMA_MAP
+ * (see the control message description earlier in this header).
+ */
+struct rds_get_mr_args {
+ struct rds_iovec vec;
+ u_int64_t cookie_addr; /* presumably where the resulting cookie is stored -- confirm */
+ uint64_t flags;
+};
+
+/* NOTE(review): presumably the argument of the RDS_FREE_MR setsockopt -- confirm. */
+struct rds_free_mr_args {
+ rds_rdma_cookie_t cookie;
+ u_int64_t flags;
+};
+
+/* cmsg_data of RDS_CMSG_RDMA_ARGS: describes one requested RDMA transfer. */
+struct rds_rdma_args {
+ rds_rdma_cookie_t cookie;
+ struct rds_iovec remote_vec;
+ u_int64_t local_vec_addr;
+ u_int64_t nr_local;
+ u_int64_t flags;
+ u_int64_t user_token;
+};
+
+/*
+ * NOTE(review): presumably the payload of RDS_CMSG_RDMA_STATUS, with
+ * status holding one of the RDS_RDMA_* result codes defined right
+ * below -- confirm.
+ */
+struct rds_rdma_notify {
+ u_int64_t user_token;
+ int32_t status;
+};
+
+#define RDS_RDMA_SUCCESS 0
+#define RDS_RDMA_REMOTE_ERROR 1
+#define RDS_RDMA_CANCELED 2
+#define RDS_RDMA_DROPPED 3
+#define RDS_RDMA_OTHER_ERROR 4
+
+/*
+ * Common set of flags for all RDMA related structs
+ */
+#define RDS_RDMA_READWRITE 0x0001
+#define RDS_RDMA_FENCE 0x0002 /* use FENCE for immediate send */
+#define RDS_RDMA_INVALIDATE 0x0004 /* invalidate R_Key after freeing MR */
+#define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */
+#define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */
+#define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */
+
+#endif /* IB_RDS_H */
diff --git a/net/rds.h b/net/rds.h
new file mode 100644
index 0000000..ec9aa6c
--- /dev/null
+++ b/net/rds.h
@@ -0,0 +1,50 @@
+/*
+ * net/rds.h - user space interface for RDS
+ *
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NET_RDS_H
+#define __NET_RDS_H
+
+#include "ib_rds.h"
+
+/*
+ * Three-way comparison of two RDMA ids.  Returns -1, 0 or 1 depending
+ * on whether id1 - id2, taken as a signed 64-bit value, is negative,
+ * zero or positive.  Since the wrapped difference is what gets
+ * examined, id1 = 0 compares greater than id2 = UINT64_MAX.
+ */
+static inline int
+rds_rdma_id_sign(uint64_t id1, uint64_t id2)
+{
+	int64_t delta = id1 - id2;
+
+	if (delta < 0)
+		return -1;
+	if (delta > 0)
+		return 1;
+	return 0;
+}
+
+/* Compare two ids with a relational operator: rds_rdma_id_cmp(a, <, b). */
+#define rds_rdma_id_cmp(id1, cmp, id2) (rds_rdma_id_sign((id1), (id2)) cmp 0)
+
+#endif /* __NET_RDS_H */
diff --git a/options.c b/options.c
new file mode 100644
index 0000000..0956b25
--- /dev/null
+++ b/options.c
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * options.c - options and stuff
+ */
+
+#define _LARGEFILE64_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <signal.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <inttypes.h>
+#include <errno.h>
+
+#include "kernel-list.h"
+#include "rdstool.h"
+
+
+/* This gets changed in parse_options() */
+char *progname = "rds-generic-tool";
+unsigned int verbose = 1; /* raised by -v and lowered by -q in parse_options() */
+
+/*
+ * Cleared by handler() when a signal arrives; read through runningp().
+ * NOTE(review): the object is written from a signal handler, for which
+ * ISO C expects the type volatile sig_atomic_t -- check the matching
+ * declaration in rdstool.h before changing it here.
+ */
+sig_atomic_t running = 1;
+
+
+/*
+ * Take "address:port" and return a sockaddr(_in) that describes it.
+ * Since RDS is IPv4 only, we don't worry about PF_INET6.
+ *
+ * XXX: Should we try a default IP or default port?  RDS is very
+ * endpoint-oriented; right now we require explicitness.
+ *
+ * Since getaddrinfo(3) returns multiple addresses, we simply find the
+ * first SOCK_DGRAM AF_INET result.  Note that RDS actually uses
+ * SOCK_SEQPACKET, but we're lying to getaddrinfo(3).
+ *
+ * The endpoint string is read from nep->re_name and the resolved
+ * address is stored in nep->re_addr.
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM, -EINVAL (no ':'
+ * separator or unexpected address size), -ENOENT (nothing usable in
+ * the getaddrinfo(3) result) or the getaddrinfo(3) error code; in
+ * that case nep->re_addr is left untouched.
+ */
+static int parse_endpoint(struct rds_endpoint *nep)
+{
+	int rc;
+	char *host, *port;
+	struct addrinfo *list = NULL, *try;
+	struct addrinfo hint = {
+		.ai_family = PF_INET,
+		.ai_socktype = SOCK_DGRAM,
+	};
+
+	/* Work on a private copy, it is split in place below. */
+	host = strdup(nep->re_name);
+	if (!host) {
+		rc = -ENOMEM;
+		verbosef(0, stderr, "%s: Unable to allocate memory\n",
+			 progname);
+		goto out;
+	}
+
+	port = strchr(host, ':');
+	if (!port) {
+		rc = -EINVAL;
+		verbosef(0, stderr, "%s: Invalid endpoint: %s\n",
+			 progname, nep->re_name);
+		goto out;
+	}
+
+	*port = '\0';
+	port++;
+
+	rc = getaddrinfo(host, port, &hint, &list);
+	if (rc) {
+		verbosef(0, stderr, "%s: Unable to resolve \"%s\": %s\n",
+			 progname, nep->re_name, gai_strerror(rc));
+		/* Nothing to release; do not trust the value on failure. */
+		list = NULL;
+		goto out;
+	}
+
+	for (try = list; try; try = try->ai_next) {
+		if ((try->ai_family == PF_INET) &&
+		    (try->ai_socktype == SOCK_DGRAM))
+			break;
+	}
+
+	if (!try) {
+		/* Do not report success with re_addr never filled in. */
+		rc = -ENOENT;
+		verbosef(0, stderr, "%s: No usable address for \"%s\"\n",
+			 progname, nep->re_name);
+	} else if (try->ai_addrlen != sizeof(struct sockaddr_in)) {
+		/* Refuse to copy an address of unexpected size. */
+		rc = -EINVAL;
+		verbosef(0, stderr,
+			 "%s: OMG WTF BBQ! try->ai_addrlen = %u, sizeof(struct sockaddr_in) = %zu\n",
+			 progname, (unsigned int)try->ai_addrlen,
+			 sizeof(struct sockaddr_in));
+	} else {
+		memcpy(&nep->re_addr, try->ai_addr,
+		       sizeof(struct sockaddr_in));
+	}
+
+out:
+	if (list)
+		freeaddrinfo(list);
+	free(host);
+	return rc;
+}
+
+/*
+ * Allocate an rds_endpoint named "endpoint", resolve it with
+ * parse_endpoint() and append it to "list".
+ *
+ * Returns 0 on success, -ENOMEM when an allocation fails, or the error
+ * from parse_endpoint(); nothing is added to the list on failure.
+ */
+static int add_endpoint(const char *endpoint, struct list_head *list)
+{
+	struct rds_endpoint *ep;
+	int err;
+
+	ep = malloc(sizeof(struct rds_endpoint));
+	if (ep == NULL)
+		return -ENOMEM;
+
+	ep->re_name = strdup(endpoint);
+	if (ep->re_name == NULL) {
+		err = -ENOMEM;
+		goto free_ep;
+	}
+
+	err = parse_endpoint(ep);
+	if (err != 0)
+		goto free_name;
+
+	list_add_tail(&ep->re_item, list);
+	return 0;
+
+free_name:
+	free(ep->re_name);
+free_ep:
+	free(ep);
+	return err;
+}
+
+/*
+ * Parse an unsigned number with an optional size suffix.
+ *
+ * The number is read with strtoull(3) in base 0, so decimal, octal
+ * (leading 0) and hexadecimal (leading 0x) are accepted.  It may be
+ * followed by k/K, m/M or g/G to scale by 2^10, 2^20 or 2^30, and by
+ * an optional b/B ("4k", "4kb" and "4096b" are all 4096).
+ *
+ * @arg: the string to parse
+ * @res: receives the value; written only on success
+ *
+ * Returns 0 on success and -EINVAL when the string holds no number,
+ * is negative, has trailing characters after the suffix, or does not
+ * fit in 64 bits (before or after scaling).
+ */
+static int get_number(char *arg, uint64_t *res)
+{
+	char *ptr = NULL;
+	uint64_t num;
+	unsigned int shift = 0;
+
+	/* strtoull() silently negates "-N"; refuse the sign up front. */
+	if (arg[strspn(arg, " \t\n\v\f\r")] == '-')
+		return -EINVAL;
+
+	errno = 0;
+	num = strtoull(arg, &ptr, 0);
+	if ((ptr == arg) || (errno == ERANGE))
+		return -EINVAL;
+
+	switch (*ptr) {
+	case 'g':
+	case 'G':
+		shift = 30;
+		break;
+
+	case 'm':
+	case 'M':
+		shift = 20;
+		break;
+
+	case 'k':
+	case 'K':
+		shift = 10;
+		break;
+
+	case 'b':
+	case 'B':
+	case '\0':
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (shift)
+		ptr++;		/* consume the multiplier letter */
+	if ((*ptr == 'b') || (*ptr == 'B'))
+		ptr++;		/* optional byte marker */
+	if (*ptr != '\0')
+		return -EINVAL;	/* trailing garbage such as "10kfoo" */
+
+	/* Scaling must not wrap around 64 bits. */
+	if (shift && (num > (UINT64_MAX >> shift)))
+		return -EINVAL;
+	num <<= shift;
+
+	*res = num;
+
+	return 0;
+}
+
+/* get_number() plus the diagnostic every numeric option prints on failure. */
+static int get_number_arg(char *arg, uint64_t *res)
+{
+	int rc = get_number(arg, res);
+
+	if (rc)
+		verbosef(0, stderr, "%s: Invalid number: %s\n",
+			 progname, arg);
+	return rc;
+}
+
+extern char *optarg;
+extern int optopt;
+extern int optind;
+extern int opterr;
+
+/*
+ * Parse the command line into "ctxt".
+ *
+ * @argc, @argv: as passed to main(); argv[0] also sets progname
+ * @opts:        getopt(3) option string of the calling tool
+ * @ctxt:        context to fill in
+ *
+ * Handled option letters: -s (source endpoint, exactly one), -d
+ * (destination endpoint, appended to ctxt->rc_daddrs), -m (message
+ * size), -l (total), -f (file name), -i (statistics interval), -v/-q
+ * (verbosity up/down), -V, -h and the long forms --help/--version.
+ *
+ * Parsing stops at the first error.  Returns 0 on success and a
+ * non-zero error code otherwise; a source endpoint is mandatory.
+ */
+int parse_options(int argc, char *argv[], const char *opts,
+		  struct rds_context *ctxt)
+{
+	int c, rc = 0;
+	uint64_t val;
+	struct list_head saddrs;
+
+	if (argc && argv[0])
+		progname = basename(argv[0]);
+
+	INIT_LIST_HEAD(&saddrs);
+	opterr = 0;
+	/* getopt(3) signals the end of the options with -1. */
+	while ((c = getopt(argc, argv, opts)) != -1) {
+		switch (c) {
+		case 's':
+			if (!list_empty(&saddrs)) {
+				verbosef(0, stderr,
+					 "%s: Only one source address allowed\n",
+					 progname);
+				rc = -EINVAL;
+			} else
+				rc = add_endpoint(optarg, &saddrs);
+			break;
+
+		case 'd':
+			rc = add_endpoint(optarg, &ctxt->rc_daddrs);
+			break;
+
+		case 'm':
+			rc = get_number_arg(optarg, &val);
+			if (rc)
+				break;
+
+			if (val > UINT32_MAX) {
+				rc = -EINVAL;
+				verbosef(0, stderr,
+					 "%s: Message size too large: %"PRIu64"\n",
+					 progname, val);
+			} else
+				ctxt->rc_msgsize = (uint32_t)val;
+			break;
+
+		case 'l':
+			rc = get_number_arg(optarg, &ctxt->rc_total);
+			break;
+
+		case 'f':
+			ctxt->rc_filename = optarg;
+			stats_extended(1);
+			break;
+
+		case 'i':
+			rc = get_number_arg(optarg, &val);
+			if (rc)
+				break;
+
+			if (val > LONG_MAX) {
+				rc = -EINVAL;
+				verbosef(0, stderr,
+					 "%s: Sleep interval too large: %"PRIu64"\n",
+					 progname, val);
+			} else {
+				rc = stats_init((long)val);
+			}
+			break;
+
+		case 'v':
+			verbose++;
+			break;
+
+		case 'q':
+			if (verbose)
+				verbose--;
+			break;
+
+		case 'V':
+			print_version();
+			break;
+
+		case 'h':
+			print_usage(0);
+			break;
+
+		case '-':
+			if (!strcmp(optarg, "help"))
+				print_usage(0);
+			else if (!strcmp(optarg, "version"))
+				print_version();
+			else {
+				rc = -EINVAL;
+				verbosef(0, stderr,
+					 "%s: Invalid argument: \'--%s\'\n",
+					 progname, optarg);
+			}
+			break;
+
+		case '?':
+			verbosef(0, stderr,
+				 "%s: Invalid option \'-%c\'\n",
+				 progname, optopt);
+			rc = -EINVAL;
+			break;
+
+		case ':':
+			verbosef(0, stderr,
+				 "%s: Option \'-%c\' requires an argument\n",
+				 progname, optopt);
+			rc = -EINVAL;
+			break;
+
+		default:
+			verbosef(0, stderr,
+				 "%s: Shouldn't get here %c %c\n",
+				 progname, optopt, c);
+			rc = -EINVAL;
+			break;
+		}
+
+		if (rc)
+			goto out;
+	}
+
+	if (list_empty(&saddrs)) {
+		verbosef(0, stderr, "%s: Source endpoint address required\n",
+			 progname);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	ctxt->rc_saddr = list_entry(saddrs.prev, struct rds_endpoint,
+				    re_item);
+
+out:
+	if (rc && !list_empty(&saddrs)) {
+		/*
+		 * The source endpoint was never handed over to ctxt, so
+		 * nobody else can release it.
+		 */
+		struct rds_endpoint *nep = list_entry(saddrs.next,
+						      struct rds_endpoint,
+						      re_item);
+
+		free(nep->re_name);
+		free(nep);
+	}
+	return rc;
+}
+
+/*
+ * Create a PF_RDS/SOCK_SEQPACKET socket and bind it to the source
+ * endpoint of "ctxt" (ctxt->rc_saddr).
+ *
+ * On success the descriptor is stored in the endpoint's re_fd and 0 is
+ * returned.  On failure a negative errno value is returned; if bind(2)
+ * was the call that failed, the socket is closed again and re_fd is
+ * set to -1.
+ */
+int rds_bind(struct rds_context *ctxt)
+{
+	struct rds_endpoint *ep = ctxt->rc_saddr;
+	int fd, err;
+
+	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
+	if (fd < 0) {
+		err = -errno;
+		verbosef(0, stderr, "%s: Unable to create socket: %s\n",
+			 progname, strerror(-err));
+		return err;
+	}
+
+	ep->re_fd = fd;
+	if (bind(ep->re_fd, (struct sockaddr *)&ep->re_addr,
+		 sizeof(struct sockaddr_in)) == 0)
+		return 0;
+
+	err = -errno;
+	verbosef(0, stderr, "%s: Unable to bind socket: %s\n",
+		 progname, strerror(-err));
+
+	close(ep->re_fd);
+	ep->re_fd = -1;
+	return err;
+}
+
+/*
+ * Make descriptor "fd" refer to the file named by ctxt->rc_filename.
+ *
+ * @ctxt:  context holding the file name; "-" means "leave fd alone"
+ * @fd:    descriptor to redirect (typically stdin or stdout)
+ * @flags: open(2) flags used for the file
+ *
+ * Returns 0 on success, a negative errno value when opening or
+ * duplicating fails, and -EBADF if dup2(2) hands back an unexpected
+ * descriptor.
+ */
+int dup_file(struct rds_context *ctxt, int fd, int flags)
+{
+	int tmp_fd, rc = 0;
+	const char *type;
+
+	/* "-" is stdin/stdout */
+	if (!strcmp(ctxt->rc_filename, "-"))
+		goto out;
+
+	/*
+	 * The mode argument is required whenever "flags" contains
+	 * O_CREAT and is ignored otherwise, so always pass one.
+	 */
+	tmp_fd = open64(ctxt->rc_filename, flags, 0666);
+	if (tmp_fd < 0) {
+		rc = -errno;
+		verbosef(0, stderr, "%s: Unable to open file \"%s\": %s\n",
+			 progname, ctxt->rc_filename, strerror(-rc));
+		goto out;
+	}
+
+	/* open() already returned the wanted descriptor. */
+	if (tmp_fd == fd)
+		goto out;
+
+	rc = dup2(tmp_fd, fd);
+	if (rc < 0) {
+		rc = -errno;
+		switch (fd) {
+		case STDIN_FILENO:
+			type = "stdin";
+			break;
+
+		case STDOUT_FILENO:
+			type = "stdout";
+			break;
+
+		case STDERR_FILENO:
+			type = "stderr";
+			break;
+
+		default:
+			type = "random fd";
+			break;
+		}
+
+		verbosef(0, stderr,
+			 "%s: Unable to set file \"%s\" as %s: %s\n",
+			 progname, ctxt->rc_filename, type,
+			 strerror(-rc));
+	} else if (rc != fd) {
+		verbosef(0, stderr,
+			 "%s: dup2(2) failed for some reason!\n",
+			 progname);
+		rc = -EBADF;
+	} else
+		rc = 0;
+
+	/*
+	 * Whether or not dup2() worked, the temporary descriptor is no
+	 * longer needed; rc already holds the result.
+	 */
+	close(tmp_fd);
+
+out:
+	return rc;
+}
+
+/* Return the current value of the "running" flag, which handler() clears. */
+int runningp(void)
+{
+ return running;
+}
+
+/*
+ * Signal handler installed by setup_signals().  It only clears the
+ * "running" flag; the signal number is not looked at.
+ */
+void handler(int signum)
+{
+ running = 0;
+}
+
+/*
+ * Install the signal dispositions used by the tools: SIGTERM and
+ * SIGINT invoke handler(), SIGPIPE is ignored so that writes fail with
+ * EPIPE instead.
+ *
+ * Returns 0 on success and -EINVAL as soon as one sigaction(2) call
+ * fails; dispositions installed before the failure stay in place.
+ */
+int setup_signals(void)
+{
+	static const int stop_signals[] = { SIGTERM, SIGINT };
+	struct sigaction sa;
+	size_t i;
+
+	sigemptyset(&sa.sa_mask);
+	sa.sa_handler = handler;
+	sa.sa_flags = 0;
+
+	for (i = 0; i < sizeof(stop_signals) / sizeof(stop_signals[0]); i++) {
+		if (sigaction(stop_signals[i], &sa, NULL))
+			return -EINVAL;
+	}
+
+	sa.sa_handler = SIG_IGN;	/* Get EPIPE instead */
+	if (sigaction(SIGPIPE, &sa, NULL))
+		return -EINVAL;
+
+	return 0;
+}
diff --git a/pfhack.c b/pfhack.c
new file mode 100644
index 0000000..7e320db
--- /dev/null
+++ b/pfhack.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * pfhack.c - discover the RDS constants
+ *
+ * PF_RDS and SOL_RDS should be assigned constants. However, we don't have
+ * official values yet. There is a hack to overload an existing PF_ value
+ * (21). This dynamic code detects what the running kernel is using.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+
+#include "kernel-list.h"
+#include "pfhack.h"
+#include "rdstool.h"
+
+#define PF_RDS_PATH "/proc/sys/net/rds/pf_rds"
+#define SOL_RDS_PATH "/proc/sys/net/rds/sol_rds"
+
+/*
+ * We don't allow any system that can't read pf_rds.
+ *
+ * Print "reason" to stderr, prefixed with the program name, and
+ * terminate the process with exit status 1.  Does not return.
+ */
+static void explode(const char *reason)
+{
+ fprintf(stderr,
+ "%s: Unable to determine RDS constant: %s\n",
+ progname, reason);
+
+ exit(1);
+}
+
+/*
+ * Determine an RDS constant, reading it from "path" at most once.
+ *
+ * @path:     file holding the value as text
+ * @official: value to use when the file cannot be opened
+ * @found:    cache; a value >= 0 is returned as is, otherwise the
+ *            result of the lookup is stored here
+ *
+ * The file content must be a number (any base strtoul(3) accepts with
+ * base 0) in the range 0..INT_MAX, optionally followed by a newline.
+ * A read error or malformed content terminates the program through
+ * explode().
+ */
+static int discover_constant(const char *path, int official, int *found)
+{
+	int fd;
+	ssize_t ret;
+	size_t total = 0;
+	char buf[PATH_MAX];
+	char *ptr;
+	unsigned long val;
+
+	if (*found >= 0)
+		return *found;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		/* hmm, no more constants in /proc.  we must not need it anymore
+		 * so use official values.
+		 */
+		*found = official;
+		return official;
+	}
+
+	/* Keep one byte free for the NUL terminator strtoul() relies on. */
+	do {
+		ret = read(fd, buf + total, sizeof(buf) - 1 - total);
+		if (ret > 0)
+			total += ret;
+	} while ((ret > 0) && (total < sizeof(buf) - 1));
+
+	close(fd);
+
+	if (ret < 0)
+		explode("Error reading address constant");
+
+	buf[total] = '\0';
+
+	/*
+	 * val is unsigned, so an out-of-range or negative input (which
+	 * strtoul() turns into a huge value) fails the INT_MAX check
+	 * instead of wrapping to a negative int.  ptr == buf rejects an
+	 * empty or non-numeric file.
+	 */
+	val = strtoul(buf, &ptr, 0);
+	if ((ptr == buf) || (val > INT_MAX) || (*ptr && (*ptr != '\n')))
+		explode("Invalid address constant");
+
+	*found = (int)val;
+	return (int)val;
+}
+
+/*
+ * Return the protocol family number to use for RDS.  The value is
+ * looked up through discover_constant() on the first call and cached
+ * in a function-local static for later calls.
+ */
+int discover_pf_rds(void)
+{
+	static int pf_rds = -1;
+
+	return discover_constant(PF_RDS_PATH, OFFICIAL_PF_RDS, &pf_rds);
+}
+
+/*
+ * Return the socket option level to use for RDS.  The value is looked
+ * up through discover_constant() on the first call and cached in a
+ * function-local static for later calls.
+ */
+int discover_sol_rds(void)
+{
+	static int sol_rds = -1;
+
+	return discover_constant(SOL_RDS_PATH, OFFICIAL_SOL_RDS, &sol_rds);
+}
diff --git a/pfhack.h b/pfhack.h
new file mode 100644
index 0000000..2a55b25
--- /dev/null
+++ b/pfhack.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * pfhack.h - discover the RDS constants
+ *
+ * PF_RDS and SOL_RDS should be assigned constants. However, we don't have
+ * official values yet. There is a hack to overload an existing PF_ value
+ * (21). This dynamic code detects what the running kernel is using.
+ */
+
+#ifndef __PF_HACK_H
+#define __PF_HACK_H
+
+/* Values pfhack.c falls back to when the kernel does not export its own. */
+#define OFFICIAL_PF_RDS 21
+#define OFFICIAL_SOL_RDS 276
+
+
+#ifdef DYNAMIC_PF_RDS
+/* Prototyped with (void): an empty () would leave call arguments unchecked. */
+extern int discover_pf_rds(void);
+extern int discover_sol_rds(void);
+
+/* With DYNAMIC_PF_RDS each use of these names is a function call. */
+#define AF_RDS discover_pf_rds()
+#define PF_RDS AF_RDS
+#define SOL_RDS discover_sol_rds()
+#endif /* DYNAMIC_PF_RDS */
+
+#endif /* __PF_HACK_H */
diff --git a/rds-gen.1 b/rds-gen.1
new file mode 100644
index 0000000..f203f59
--- /dev/null
+++ b/rds-gen.1
@@ -0,0 +1,89 @@
+.Dd October 30, 2006
+.Dt RDS-GEN-SINK 1
+.Os
+.Sh NAME
+.Nm rds-gen
+.Nd write data from a file to an RDS socket
+.Pp
+.Nm rds-sink
+.Nd write data from an RDS socket to a file
+.Sh SYNOPSIS
+.Nm rds-gen
+.Bk -words
+.Fl s Ar source_address:source_port
+.Op Fl d Ar destination_address:destination_port
+.Op Fl f Ar input_file
+.Op Fl m Ar message_size
+.Op Fl l Ar total_bytes
+.Op Fl i Ar interval
+
+.Nm rds-sink
+.Bk -words
+.Op Fl s Ar listen_address:listen_port
+.Op Fl f Ar output_file
+.Op Fl i Ar interval
+
+.Sh DESCRIPTION
+The
+.Nm
+and
+.Nm rds-sink
+utilities are used to stream data through RDS sockets. rds-gen
+reads data from a file descriptor and sends it as messages
+down an RDS socket. rds-sink receives messages from an RDS
+socket and writes them to a file descriptor.
+
+The following options are shared between rds-gen and rds-sink:
+.Bl -tag -width Ds
+.It Fl s Ar address:port
+Binds the RDS socket to the given address and port. rds-gen will
+send messages from this address and port. rds-sink will receive messages
+sent to this address and port.
+.It Fl f Ar file
+rds-gen will read data from this file and rds-sink will write
+data to this file. If '-' is given as the filename then rds-gen
+will use standard input and rds-sink will use standard output.
+.It Fl i Ar interval_seconds
+An iterative summary of the number and size of messages that are sent and
+received is written to standard error at this interval.
+.El
+.Pp
+
+In addition, rds-gen supports the following options:
+.Bl -tag -width Ds
+.It Fl d Ar address:port
+Messages are sent to this destination address and port. If this option
+is specified multiple times then the messages are sent to each destination
+address in a round-robin fashion.
+.It Fl m Ar message_size
+Specifies the size of the messages that are sent down the RDS socket. The default
+message size is 4k. The message size must not be greater than the buffer size.
+.It Fl l Ar total_bytes
+Specifies the number of bytes that will be sent out the socket before rds-gen
+exits. If this is not specified and rds-gen was given a source file then it
+will run until it gets EOF from the file. If no file was given and this
+option is not specified then rds-gen will send data indefinitely.
+.El
+.Pp
+
+.Sh EXAMPLES
+rds-gen on host src sends data indefinitely to rds-sink on dest, which
+prints out the amount of data it receives every second.
+.Pp
+
+.Dl $ rds-sink -s dest:22222 -i 1
+.Dl $ rds-gen -s src:11111 -d dest:22222
+.Pp
+
+Read 100M from /dev/zero on src and write it to /dev/null on dest,
+printing stats on both sides every minute.
+
+.Dl $ rds-sink -s dest:22222 -f /dev/null -i 60
+.Dl $ rds-gen -s src:11111 -f /dev/zero -l 100m -d dest:22222 -i 60
+.Pp
+
+Watch rds-gen write data as fast as it can into a local black hole because
+there is no bound receiving socket.
+
+.Dl $ rds-gen -s src:11111 -d localhost:31337 -i 1
+.Pp
diff --git a/rds-gen.c b/rds-gen.c
new file mode 100644
index 0000000..f9420c7
--- /dev/null
+++ b/rds-gen.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * rds-gen.c: Spew some RDS packets
+ */
+
+#define _LARGEFILE64_SOURCE
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "kernel-list.h"
+#include "rdstool.h"
+
+/*
+ * Print the command-line synopsis and exit with status rc.
+ * The text is handed to verbosef() with stderr when rc is non-zero and
+ * with stdout otherwise.
+ */
+void print_usage(int rc)
+{
+	FILE *stream;
+	int pad;
+
+	if (rc)
+		stream = stderr;
+	else
+		stream = stdout;
+
+	/* Continuation lines are padded by the width of the program name. */
+	pad = strlen(progname);
+
+	verbosef(0, stream,
+		 "Usage: %s -s <source_ip>:<source_port> [[-d <dest_ip>:<dest_port>] ...]\n"
+		 " %*s [-f <input_file>] [-m <msg_size>]\n"
+		 " %*s [-l <total_bytes>] [-i <interval>]\n"
+		 " %*s [-v ...] [-q ...]\n"
+		 " %s -h\n"
+		 " %s -V\n",
+		 progname, pad, "", pad, "", pad, "", progname, progname);
+
+	exit(rc);
+}
+
+/* Print the version banner for progname via verbosef() on stdout and exit with status 0. */
+void print_version()
+{
+	FILE *stream = stdout;
+
+	verbosef(0, stream, "%s version VERSION\n", progname);
+	exit(0);
+}
+
+/*
+ * Choose the endpoint that receives the next message.
+ *
+ * Destinations are visited round-robin: the entry after `de` is returned,
+ * and the first entry of ctxt->rc_daddrs is returned when `de` is NULL or
+ * when `de` is the last entry of the list.
+ */
+static struct rds_endpoint *pick_dest(struct rds_context *ctxt,
+				      struct rds_endpoint *de)
+{
+	struct list_head *head = &ctxt->rc_daddrs;
+	struct list_head *pos = de ? de->re_item.next : head;
+
+	/* Landing on the list head means "no previous pick" or "wrapped". */
+	if (pos == head)
+		pos = head->next;
+
+	return list_entry(pos, struct rds_endpoint, re_item);
+}
+
+/*
+ * Fill `bytes` with `len` bytes read from standard input.
+ *
+ * Returns 0 when the buffer was filled, when end of file was hit before
+ * any byte was read, or when the call is skipped by the `first` latch
+ * below.  Returns -EPIPE when end of file arrives after a partial read,
+ * and a negative errno value when read() fails.  -EINTR can also be
+ * returned if the loop stops right after an interrupted read.
+ */
+static ssize_t fill_stdin(struct rds_context *ctxt, char *bytes,
+ ssize_t len)
+{
+ ssize_t ret = 0;
+ char *ptr = bytes;
+
+ static int first = 1;
+
+ /* Once the latch is cleared, later calls return without reading. */
+ if (!first)
+ return ret;
+
+ /*
+  * NOTE(review): main() in this file rewrites a "-" filename to
+  * "<standard input>" before the send loop starts, so this test looks
+  * like it is always true here, i.e. only the first buffer is ever
+  * read from the input.  Confirm whether that is intended.
+  */
+ if (ctxt->rc_filename && strcmp(ctxt->rc_filename,"-"))
+ first = 0;
+
+ while (len && runningp()) {
+ stats_print();
+ ret = read(STDIN_FILENO, ptr, len);
+ if (!ret) {
+ /* EOF: only an error if part of a message was already read. */
+ if (ptr != bytes) {
+ verbosef(0, stderr,
+ "%s: Unexpected end of file reading from %s\n",
+ progname, ctxt->rc_filename);
+ ret = -EPIPE;
+ }
+ break;
+ }
+ if (ret < 0) {
+ /* Capture errno before any other call can overwrite it. */
+ ret = -errno;
+ if (ret == -EINTR)
+ continue;
+
+ verbosef(0, stderr,
+ "%s: Error reading from %s: %s\n",
+ progname, ctxt->rc_filename,
+ strerror(-ret));
+ break;
+ }
+
+ stats_add_read(ret);
+ ptr += ret;
+ len -= ret;
+ ret = 0; /* If this filled the buffer, we return success */
+ }
+ verbosef(3, stderr, "Read %zd bytes from stdin\n",
+ ptr - bytes);
+
+ return ret;
+}
+
+/*
+ * Pattern source for the send buffer: the first call zeroes `len` bytes
+ * of `bytes`, later calls leave the buffer untouched.  stats_print() is
+ * invoked on every call.  Always returns 0.
+ */
+static ssize_t fill_pattern(struct rds_context *ctxt, char *bytes,
+			    ssize_t len)
+{
+	static int zeroed = 0;
+
+	stats_print();
+
+	if (zeroed)
+		return 0;
+
+	memset(bytes, 0, len);
+	zeroed = 1;
+
+	return 0;
+}
+
+/*
+ * Produce the payload for the next message.  Data comes from
+ * fill_stdin() when a filename was configured and from fill_pattern()
+ * otherwise; the chosen helper's return value is passed through.
+ */
+static ssize_t fill_buff(struct rds_context *ctxt, char *bytes, ssize_t len)
+{
+	/* Each possible method must handle calling stats_print() */
+	return ctxt->rc_filename ? fill_stdin(ctxt, bytes, len)
+				 : fill_pattern(ctxt, bytes, len);
+}
+
+/*
+ * Send one message on the endpoint's socket, retrying when sendmsg() is
+ * interrupted by a signal.
+ *
+ * Returns the sendmsg() result on success, or a negative errno value
+ * after reporting the failure.  Returns 0 if runningp() is already false
+ * on entry.
+ */
+static ssize_t send_buff(struct rds_endpoint *se, struct msghdr *msg)
+{
+	ssize_t sent = 0;
+
+	while (runningp()) {
+		stats_print();
+
+		sent = sendmsg(se->re_fd, msg, 0);
+		if (sent >= 0)
+			break;		/* message handed to the kernel */
+
+		sent = -errno;
+		if (sent == -EINTR)
+			continue;	/* interrupted: try again */
+
+		verbosef(0, stderr,
+			 "%s: Error from sendmsg: %s\n",
+			 progname, strerror(-sent));
+		break;
+	}
+
+	return sent;
+}
+
+
+/*
+ * Main transmit loop: repeatedly fill the message buffer, pick the next
+ * destination and send, until runningp() turns false, an error occurs,
+ * or ctxt->rc_total bytes have been sent (when rc_total is non-zero).
+ *
+ * Returns 0 on success and a negative errno value on failure.  The byte
+ * count of the last sendmsg() is deliberately not returned: main() uses
+ * this value as the process exit status, so a successful run must
+ * yield 0.
+ */
+static int wli_do_send(struct rds_context *ctxt)
+{
+	char bytes[ctxt->rc_msgsize];
+	int ret = 0;
+	struct rds_endpoint *de = NULL, *se = ctxt->rc_saddr;
+	struct iovec iov = {
+		.iov_base = bytes,
+		.iov_len = ctxt->rc_msgsize,
+	};
+	struct msghdr msg = {
+		.msg_name = NULL, /* Picked later */
+		.msg_namelen = sizeof(struct sockaddr_in),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = NULL,
+		.msg_controllen = 0,
+		.msg_flags = 0,
+	};
+
+	verbosef(2, stderr, "Starting send loop\n");
+
+	stats_start();
+
+	while (runningp()) {
+		ssize_t sent;
+
+		/* Calls stats_print() */
+		ret = fill_buff(ctxt, bytes, ctxt->rc_msgsize);
+		if (ret == -EINTR)
+			continue;
+		if (ret)
+			break;
+
+		de = pick_dest(ctxt, de);
+		verbosef(2, stderr, "Destination %s\n", de->re_name);
+
+		msg.msg_name = &de->re_addr;
+
+		/* Trim the final message so the total is not exceeded. */
+		if (ctxt->rc_total &&
+		    ((stats_get_send() + ctxt->rc_msgsize) > ctxt->rc_total))
+			iov.iov_len = ctxt->rc_total - stats_get_send();
+
+		/* Calls stats_print() */
+		sent = send_buff(se, &msg);
+		if (sent < 0) {
+			ret = sent;
+			break;
+		}
+
+		stats_add_send(sent);
+
+		if (ctxt->rc_total && (stats_get_send() >= ctxt->rc_total))
+			break;
+	}
+	verbosef(2, stderr, "Stopping send loop\n");
+
+	stats_total();
+
+	/* ret is 0 here unless fill_buff() or send_buff() reported an error. */
+	return ret;
+}
+
+
+/*
+ * rds-gen entry point: parse the options, bind the source endpoint,
+ * optionally redirect the input file onto standard input, then run the
+ * send loop.  The value returned by the failing step, or by
+ * wli_do_send(), becomes the exit status.
+ */
+int main(int argc, char *argv[])
+{
+	struct rds_context ctxt = {
+		.rc_msgsize = RDS_DEFAULT_MSG_SIZE,
+	};
+	struct rds_endpoint *dest;
+	char ipbuf[INET_ADDRSTRLEN];
+	int rc;
+
+	INIT_LIST_HEAD(&ctxt.rc_daddrs);
+
+	rc = parse_options(argc, argv, RDS_TOOL_BASE_OPTS RDS_GEN_OPTS,
+			   &ctxt);
+	if (rc != 0)
+		print_usage(rc);
+
+	/* At least one -d destination is mandatory. */
+	if (list_empty(&ctxt.rc_daddrs)) {
+		verbosef(0, stderr,
+			 "%s: Destination endpoint address required\n",
+			 progname);
+		print_usage(-EINVAL);
+	}
+
+	inet_ntop(PF_INET, &ctxt.rc_saddr->re_addr.sin_addr, ipbuf,
+		  INET_ADDRSTRLEN);
+	verbosef(2, stderr, "Binding endpoint %s:%d\n",
+		 ipbuf, ntohs(ctxt.rc_saddr->re_addr.sin_port));
+
+	rc = rds_bind(&ctxt);
+	if (rc != 0)
+		goto out;
+
+	if (ctxt.rc_filename != NULL) {
+		rc = dup_file(&ctxt, STDIN_FILENO, O_RDONLY);
+		if (rc != 0)
+			goto out;
+
+		/* Use a readable name for "-" in later diagnostics. */
+		if (strcmp(ctxt.rc_filename, "-") == 0)
+			ctxt.rc_filename = "<standard input>";
+	}
+
+	list_for_each_entry(dest, &ctxt.rc_daddrs, re_item) {
+		inet_ntop(PF_INET, &dest->re_addr.sin_addr, ipbuf,
+			  INET_ADDRSTRLEN);
+		verbosef(2, stderr,
+			 "Adding destination %s:%d\n", ipbuf,
+			 ntohs(dest->re_addr.sin_port));
+	}
+
+	rc = setup_signals();
+	if (rc != 0) {
+		verbosef(0, stderr, "%s: Unable to initialize signals\n",
+			 progname);
+		goto out;
+	}
+
+	rc = wli_do_send(&ctxt);
+
+out:
+	free(ctxt.rc_saddr->re_name);
+	free(ctxt.rc_saddr);
+
+	return rc;
+}
diff --git a/rds-info.1 b/rds-info.1
new file mode 100644
index 0000000..499b72c
--- /dev/null
+++ b/rds-info.1
@@ -0,0 +1,162 @@
+.Dd October 30, 2006
+.Dt RDS-INFO 1
+.Os
+.Sh NAME
+.Nm rds-info
+.Nd display information from the RDS kernel module
+.Pp
+.Sh SYNOPSIS
+.Nm rds-info
+.Op Fl v
+.Bk -words
+.Op Fl cknrstIT
+
+.Sh DESCRIPTION
+The
+.Nm
+utility presents various sources of information that
+the RDS kernel module maintains. When run without any optional arguments
+.Nm
+will output all the information it knows of. When options are specified then
+only the information associated with those options is displayed.
+
+The options are as follows:
+.Bl -tag -width Ds
+.It Fl v
+Requests verbose output. When this option is given, some classes of information
+will display additional data.
+
+.It Fl c
+Display global counters. Each counter increments as its event
+occurs. The counters may not be reset. The set of supported counters
+may change over time.
+
+.Bl -tag -width 4
+.It CounterName
+The name of the counter. These names come from the kernel and can change
+depending on the capability of the kernel module.
+.It Value
+The number of times that the counter has been incremented since the kernel
+module was loaded.
+.El
+
+.It Fl k
+Display all the RDS sockets in the system. There will always be one socket
+listed that is neither bound to nor connected to any addresses because
+.Nm
+itself uses an unbound socket to collect information.
+
+.Bl -tag -width 4
+.It BoundAddr, BPort
+The IP address and port that the socket is bound to. 0.0.0.0 0 indicates that
+the socket has not been bound.
+.It ConnAddr, CPort
+The IP address and port that the socket is connected to. 0.0.0.0 0 indicates
+that the socket has not been connected.
+.It SndBuf, RcvBuf
+The number of bytes of message payload which can be queued for sending or
+receiving on the socket, respectively.
+.It Inode
+The number of the inode object associated with the socket. Can be used to
+locate the process owning a given socket by searching /proc/*/fd for
+open files referencing a socket with this inode number.
+.El
+
+.It Fl n
+Display all RDS connections. RDS connections are maintained between
+nodes by transports.
+
+.Bl -tag -width 4
+.It LocalAddr
+The IP address of this node. For connections that originate and terminate on
+the same node the local address indicates which address initiated the
+connection establishment.
+.It RemoteAddr
+The IP address of the remote end of the connection.
+.It NextTX
+The sequence number that will be given to the next message that is sent
+over the connection.
+.It NextRX
+The sequence number that is expected from the next message to arrive over
+the connection. Any incoming messages with sequence numbers less than this
+will be dropped.
+.It Flg
+Flags which indicate the state of the connection.
+.Bl -tag -width 4
+.It s
+A process is currently sending a message down the connection.
+.It c
+The transport is attempting to connect to the remote address.
+.It C
+The connection to the remote host is connected and active.
+.El
+.El
+
+.It Fl r, Fl s, Fl t
+Display the messages in the receive, send, or retransmit queues respectively.
+.Bl -tag -width 4
+.It LocalAddr, LPort
+The local IP address and port on this node associated with the message. For
+sent messages this is the source address, for receive messages it is the
+destination address.
+.It RemoteAddr, RPort
+The remote IP address and port associated with the message. For sent messages
+this is the destination address, for receive messages it is the source address.
+.It Seq
+The sequence number of the message.
+.It Bytes
+The number of bytes in the message payload.
+.El
+
+The following information sources are dependent on specific transports which
+may not always be available.
+
+.It Fl I
+Display the IB connections which the IB transport is using to provide
+RDS connections.
+
+.Bl -tag -width 4
+.It LocalAddr
+The local IP address of this connection.
+.It RemoteAddr
+The remote IP address of this connection.
+.It LocalDev
+The local IB Global Identifier, printed in IPv6 address syntax.
+.It RemoteDev
+The remote IB Global Identifier, printed in IPv6 address syntax.
+.El
+
+If verbose output is requested, per-connection settings such as the
+maximum number of send and receive work requests will be displayed
+in addition.
+
+.It Fl T
+Display the TCP sockets which the TCP transport is using to provide
+RDS connections.
+
+.Bl -tag -width 4
+.It LocalAddr, LPort
+The local IP address and port of this socket.
+.It RemoteAddr, RPort
+The remote IP address and port that this socket is connected to.
+.It HdrRemain
+The number of bytes that must be read off the socket to complete the next
+full RDS header.
+.It DataRemain
+The number of bytes that must be read off the socket to complete the data
+payload of the message which is being received.
+.It SentNxt
+The TCP sequence number of the first byte of the last message that we sent
+down the connection.
+.It ExpectedUna
+The TCP sequence number of the byte past the last byte of the last message
+that we sent down the connection. When we see that the remote side has
+acked up to this byte then we know that the remote side has received all
+our RDS messages.
+.It SeenUna
+The TCP sequence number of the byte past the last byte which has been
+acked by the remote host.
+.El
+
+.El
+.Pp
diff --git a/rds-info.c b/rds-info.c
new file mode 100644
index 0000000..d90cc16
--- /dev/null
+++ b/rds-info.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <errno.h>
+#include <string.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "net/rds.h"
+#include "rdstool.h"
+
+/* Yield `letter` when the named RDS_INFO_CONNECTION_FLAG_* bit is set in conn.flags, '-' otherwise. */
+#define rds_conn_flag(conn, flag, letter) \
+	(((conn).flags & RDS_INFO_CONNECTION_FLAG_##flag) ? (letter) : '-')
+
+/* Smaller of a and b.  Each argument may be evaluated twice. */
+#define min(a, b) ((a) < (b) ? (a) : (b))
+/* Element count of a true array (not valid for pointers). */
+#define array_size(foo) (sizeof(foo) / sizeof((foo)[0]))
+
+/*
+ * Load one record into `var`: zero `var`, then copy at most sizeof(var)
+ * bytes of the `each`-byte record at `data` into it.  The expression
+ * always evaluates to 1 so it can sit in a loop condition.  Uses a GNU
+ * statement expression.
+ */
+#define copy_into(var, data, each) ({ \
+ int __ret = 1; \
+ memset(&var, 0, sizeof(var)); \
+ memcpy(&var, data, min(each, sizeof(var))); \
+ __ret; \
+})
+
+/*
+ * Iterate over a buffer of `len` bytes holding records of `each` bytes.
+ * Every pass copies the current record into `var`, then advances `data`
+ * by `each` and reduces `len` by min(len, each).  `data` and `len` are
+ * modified in place.
+ */
+#define for_each(var, data, each, len) \
+ for (;len > 0 && copy_into(var, data, each); \
+ data += each, len -= min(len, each))
+
+/* Incremented once per -v in main(); non-zero adds detail in print_ib_conns(). */
+static int opt_verbose = 0;
+
+/*
+ * inet_ntoa()-like formatter that hands out strings from a ring of
+ * eight static buffers, so up to eight results can be in use at once
+ * (e.g. several addresses in one printf call) before one is overwritten.
+ */
+static const char *paddr(int af, const void *addrp)
+{
+	static char ring[8][INET6_ADDRSTRLEN];
+	static int slot = 0;
+	char *out = ring[slot];
+
+	/* Advance to the next buffer for the following call. */
+	slot = (slot + 1) % 8;
+
+	inet_ntop(af, addrp, out, INET6_ADDRSTRLEN);
+
+	return out;
+}
+
+/* Format a 32-bit IPv4 address, passed by value, through paddr(). */
+static const char *ipv4addr(uint32_t addr)
+{
+	const void *raw = &addr;
+
+	return paddr(AF_INET, raw);
+}
+
+/* Format the address that `addr` points to as AF_INET6 through paddr(). */
+static const char *ipv6addr(const void *addr)
+{
+	const char *text = paddr(AF_INET6, addr);
+
+	return text;
+}
+
+/*
+ * Print the counter table: a heading, then one name/value line for each
+ * `each`-byte record in the `len`-byte buffer `data`.  `extra` is unused.
+ */
+static void print_counters(void *data, int each, socklen_t len, void *extra)
+{
+	struct rds_info_counter counter;
+
+	printf("\nCounters:\n%25s %16s\n", "CounterName", "Value");
+
+	for_each(counter, data, each, len) {
+		printf("%25s %16"PRIu64"\n", counter.name, counter.value);
+	}
+}
+
+/*
+ * Print the socket table from struct rds_info_socket_v1 records (the
+ * layout without an inode column).  `extra` is unused.
+ */
+static void print_sockets_v1(void *data, int each, socklen_t len, void *extra)
+{
+	struct rds_info_socket_v1 sock;
+
+	printf("\nRDS Sockets:\n%15s %5s %15s %5s %10s %10s\n",
+	       "BoundAddr", "BPort", "ConnAddr", "CPort", "SndBuf",
+	       "RcvBuf");
+
+	for_each(sock, data, each, len) {
+		const char *bound = ipv4addr(sock.bound_addr);
+		const char *peer = ipv4addr(sock.connected_addr);
+
+		printf("%15s %5u %15s %5u %10u %10u\n",
+		       bound, ntohs(sock.bound_port),
+		       peer, ntohs(sock.connected_port),
+		       sock.sndbuf, sock.rcvbuf);
+	}
+}
+
+/*
+ * Print the socket table, one line per `each`-byte record in `data`.
+ * When the record size equals sizeof(struct rds_info_socket_v1) the
+ * buffer is handed to print_sockets_v1() instead, which omits the inode
+ * column.  `extra` is unused.
+ */
+static void print_sockets(void *data, int each, socklen_t len, void *extra)
+{
+	struct rds_info_socket sk;
+
+	if (each == sizeof(struct rds_info_socket_v1)) {
+		print_sockets_v1(data, each, len, extra);
+		return;
+	}
+
+	printf("\nRDS Sockets:\n%15s %5s %15s %5s %10s %10s %8s\n",
+	       "BoundAddr", "BPort", "ConnAddr", "CPort", "SndBuf",
+	       "RcvBuf", "Inode");
+
+	for_each(sk, data, each, len) {
+		/* "ll" is the length modifier for unsigned long long; "L" is for long double. */
+		printf("%15s %5u %15s %5u %10u %10u %8llu\n",
+		       ipv4addr(sk.bound_addr),
+		       ntohs(sk.bound_port),
+		       ipv4addr(sk.connected_addr),
+		       ntohs(sk.connected_port),
+		       sk.sndbuf, sk.rcvbuf,
+		       (unsigned long long) sk.inum);
+	}
+}
+
+/*
+ * Print the RDS connection table, one line per record in `data`.  The
+ * last column shows one character per state flag ('s', 'c', 'C'), or
+ * '-' where the flag is clear.  `extra` is unused.
+ */
+static void print_conns(void *data, int each, socklen_t len, void *extra)
+{
+	struct rds_info_connection conn;
+
+	printf("\nRDS Connections:\n%15s %15s %16s %16s %3s\n",
+	       "LocalAddr", "RemoteAddr", "NextTX", "NextRX", "Flg");
+
+	for_each(conn, data, each, len) {
+		char sending = rds_conn_flag(conn, SENDING, 's');
+		char connecting = rds_conn_flag(conn, CONNECTING, 'c');
+		char connected = rds_conn_flag(conn, CONNECTED, 'C');
+
+		printf("%15s %15s %16"PRIu64" %16"PRIu64" %c%c%c\n",
+		       ipv4addr(conn.laddr),
+		       ipv4addr(conn.faddr),
+		       conn.next_tx_seq,
+		       conn.next_rx_seq,
+		       sending, connecting, connected);
+	}
+}
+
+/*
+ * Print one message queue, one line per record in `data`.  `extra` is
+ * the queue name string placed in the heading.
+ */
+static void print_msgs(void *data, int each, socklen_t len, void *extra)
+{
+	struct rds_info_message message;
+	char *queue = (char *)extra;
+
+	printf("\n%s Message Queue:\n%15s %5s %15s %5s %16s %10s\n",
+	       queue,
+	       "LocalAddr", "LPort", "RemoteAddr", "RPort", "Seq", "Bytes");
+
+	for_each(message, data, each, len) {
+		const char *local = ipv4addr(message.laddr);
+		const char *remote = ipv4addr(message.faddr);
+
+		printf("%15s %5u %15s %5u %16"PRIu64" %10u\n",
+		       local, ntohs(message.lport),
+		       remote, ntohs(message.fport),
+		       message.seq, message.len);
+	}
+}
+
+/*
+ * Print the TCP transport socket table, one line per record in `data`.
+ * `extra` is unused.
+ */
+static void print_tcp_socks(void *data, int each, socklen_t len, void *extra)
+{
+	struct rds_info_tcp_socket tcp;
+
+	printf("\nTCP Connections:\n"
+	       "%15s %5s %15s %5s %10s %10s %10s %10s %10s\n",
+	       "LocalAddr", "LPort", "RemoteAddr", "RPort",
+	       "HdrRemain", "DataRemain", "SentNxt", "ExpectUna", "SeenUna");
+
+	for_each(tcp, data, each, len) {
+		const char *local = ipv4addr(tcp.local_addr);
+		const char *peer = ipv4addr(tcp.peer_addr);
+
+		printf("%15s %5u %15s %5u %10"PRIu64" %10"PRIu64" %10u %10u %10u\n",
+		       local, ntohs(tcp.local_port),
+		       peer, ntohs(tcp.peer_port),
+		       tcp.hdr_rem, tcp.data_rem, tcp.last_sent_nxt,
+		       tcp.last_expected_una, tcp.last_seen_una);
+	}
+}
+
+/*
+ * Print the IB transport connection table, one line per record in
+ * `data`.  When opt_verbose is set, the work-request and FMR limits of
+ * each connection are appended to its line.  `extra` is unused.
+ */
+static void print_ib_conns(void *data, int each, socklen_t len, void *extra)
+{
+	struct rds_info_ib_connection ib;
+
+	printf("\nRDS IB Connections:\n%15s %15s %32s %32s\n",
+	       "LocalAddr", "RemoteAddr", "LocalDev", "RemoteDev");
+
+	for_each(ib, data, each, len) {
+		const char *src = ipv4addr(ib.src_addr);
+		const char *dst = ipv4addr(ib.dst_addr);
+		const char *src_gid = ipv6addr(ib.src_gid);
+		const char *dst_gid = ipv6addr(ib.dst_gid);
+
+		printf("%15s %15s %32s %32s", src, dst, src_gid, dst_gid);
+
+		if (opt_verbose != 0) {
+			printf(" send_wr=%u", ib.max_send_wr);
+			printf(", recv_wr=%u", ib.max_recv_wr);
+			printf(", send_sge=%u", ib.max_send_sge);
+			printf(", rdma_fmr_max=%u", ib.rdma_fmr_max);
+			printf(", rdma_fmr_size=%u", ib.rdma_fmr_size);
+		}
+
+		printf("\n");
+	}
+}
+
+/* Describes one information source that rds-info can query and print. */
+struct info {
+ /* option name passed to getsockopt(); 0 marks an unused table slot */
+ int opt_val;
+ /* short text shown in the usage listing */
+ char *description;
+ /* formatter called with the buffer, record size and byte length */
+ void (*print)(void *data, int each, socklen_t len, void *extra);
+ /* opaque argument forwarded to print() */
+ void *extra;
+ /* set to 1 in main() when this source's option letter was given */
+ int option_given;
+};
+
+/*
+ * Table of information sources, indexed by the option character that
+ * selects each one.  Slots without an initializer are zero-filled, so
+ * opt_val == 0 identifies characters that are not options.
+ */
+struct info infos[] = {
+ ['c'] = { RDS_INFO_COUNTERS, "statistic counters",
+ print_counters, NULL, 0 },
+ ['k'] = { RDS_INFO_SOCKETS, "sockets",
+ print_sockets, NULL, 0 },
+ ['n'] = { RDS_INFO_CONNECTIONS, "connections",
+ print_conns, NULL, 0 },
+ /* the three message queues share print_msgs(); extra names the queue */
+ ['r'] = { RDS_INFO_RECV_MESSAGES, "recv queue messages",
+ print_msgs, "Receive", 0 },
+ ['s'] = { RDS_INFO_SEND_MESSAGES, "send queue messages",
+ print_msgs, "Send", 0 },
+ ['t'] = { RDS_INFO_RETRANS_MESSAGES, "retransmit queue messages",
+ print_msgs, "Retransmit", 0 },
+ ['T'] = { RDS_INFO_TCP_SOCKETS, "TCP transport sockets",
+ print_tcp_socks, NULL, 0 },
+ ['I'] = { RDS_INFO_IB_CONNECTIONS, "IB transport connections",
+ print_ib_conns, NULL, 0 },
+};
+
+/*
+ * List the option letters that select individual information sources,
+ * then exit with status rc.  All of the text goes to the same stream:
+ * stderr when rc is non-zero, stdout otherwise.
+ */
+void print_usage(int rc)
+{
+	FILE *output = rc ? stderr : stdout;
+	int i;
+
+	verbosef(0, output, "The following options limit output to the given "
+		 "sources:\n");
+
+	for (i = 0; i < array_size(infos); i++) {
+		if (!infos[i].opt_val)
+			continue;
+		/* Use the selected stream here too, not unconditionally stdout. */
+		verbosef(0, output, " -%c %s\n", i, infos[i].description);
+	}
+
+	verbosef(0, output,
+		 "\n\nIf no options are given then all sources are used.\n");
+	exit(rc);
+}
+
+/*
+ * Intentionally empty.
+ * NOTE(review): presumably present only because shared code linked into
+ * this program references print_version() -- confirm against options.c.
+ */
+void print_version()
+{
+}
+
+/*
+ * rds-info entry point.
+ *
+ * Builds the getopt() string from the infos[] table, records which
+ * sources were requested, then queries each selected source through
+ * getsockopt() on an RDS socket and prints it.  With no source options
+ * every source is printed.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+int main(int argc, char **argv)
+{
+	char optstring[258] = "v+";
+	int given_options = 0;
+	socklen_t len = 0;
+	void *data = NULL;
+	int rc = 1;
+	int fd;
+	int each;
+	int c;
+	char *last;
+	int i;
+
+	/* quickly append all our info options to the optstring */
+	last = &optstring[strlen(optstring)];
+	for (i = 0; i < array_size(infos); i++) {
+		if (!infos[i].opt_val)
+			continue;
+		*last = (char)i;
+		last++;
+		*last = '\0';
+	}
+
+	while ((c = getopt(argc, argv, optstring)) != EOF) {
+		if (c == 'v') {
+			opt_verbose++;
+			continue;
+		}
+
+		if (c >= array_size(infos) || !infos[c].opt_val) {
+			verbosef(0, stderr, "%s: Invalid option \'-%c\'\n",
+				 progname, optopt);
+			print_usage(1);
+		}
+
+		infos[c].option_given = 1;
+		given_options++;
+	}
+
+	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
+	if (fd < 0) {
+		verbosef(0, stderr, "%s: Unable to create socket: %s\n",
+			 progname, strerror(errno));
+		return 1;
+	}
+
+	for (i = 0; i < array_size(infos); i++) {
+		if (!infos[i].opt_val ||
+		    (given_options && !infos[i].option_given))
+			continue;
+
+		/*
+		 * read in the info until we get a full snapshot: on ENOSPC
+		 * `len` holds the size to retry with, so grow the buffer
+		 * to that size and ask again.
+		 */
+		while ((each = getsockopt(fd, SOL_RDS, infos[i].opt_val, data,
+					  &len)) < 0) {
+			void *bigger;
+
+			if (errno != ENOSPC) {
+				verbosef(0, stderr,
+					 "%s: Unable to get statistics: %s\n",
+					 progname, strerror(errno));
+				goto out;
+			}
+
+			/* realloc(NULL, n) behaves like malloc(n). */
+			bigger = realloc(data, len);
+			if (bigger == NULL) {
+				verbosef(0, stderr,
+					 "%s: Unable to allocate memory "
+					 "for %u bytes of info: %s\n",
+					 progname, len, strerror(errno));
+				goto out;
+			}
+			data = bigger;
+		}
+
+		infos[i].print(data, each, len, infos[i].extra);
+
+		if (given_options && --given_options == 0)
+			break;
+	}
+
+	rc = 0;
+out:
+	free(data);
+	close(fd);
+
+	return rc;
+}
diff --git a/rds-ping.1 b/rds-ping.1
new file mode 100644
index 0000000..ae06787
--- /dev/null
+++ b/rds-ping.1
@@ -0,0 +1,69 @@
+.Dd Apr 22, 2008
+.Dt RDS-PING 1
+.Os
+.Sh NAME
+.Nm rds-ping
+.Nd test reachability of remote node over RDS
+.Pp
+.Sh SYNOPSIS
+.Nm rds-ping
+.Bk -words
+.Op Fl c Ar count
+.Op Fl i Ar interval
+.Op Fl I Ar local_addr
+.Ar remote_addr
+
+.Sh DESCRIPTION
+.Nm rds-ping
+is used to test whether a remote node is reachable over RDS.
+Its interface is designed to operate pretty much like the standard
+.Xr ping 8
+utility, even though the way it works is pretty different.
+.Pp
+.Nm rds-ping
+opens several RDS sockets and sends packets to port 0 on
+the indicated host. This is a special port number to which
+no socket is bound; instead, the kernel processes incoming
+packets and responds to them.
+.Sh OPTIONS
+The following options are available for use on the command line:
+.Bl -tag -width Ds
+.It Fl c Ar count
+Causes
+.Nm rds-ping
+to exit after sending (and receiving) the specified number of
+packets.
+.It Fl I Ar address
+By default,
+.Nm rds-ping
+will pick the local source address for the RDS socket based
+on routing information for the destination address (i.e. if
+packets to the given destination would be routed through interface
+.Nm ib0 ,
+then it will use the IP address of
+.Nm ib0
+as source address).
+Using the
+.Fl I
+option, you can override this choice.
+.It Fl i Ar timeout
+By default,
+.Nm rds-ping
+will wait for one second between sending packets. Use this option
+to specify a different interval. The timeout value is given in
+seconds, and can be a floating point number. Optionally, append
+.Nm msec
+or
+.Nm usec
+to specify a timeout in milliseconds or microseconds, respectively.
+.Pp
+Specifying a timeout considerably smaller than the packet round-trip
+time will produce unexpected results.
+.El
+.Sh AUTHORS
+.Nm rds-ping
+was written by Olaf Kirch <olaf.kirch at oracle.com>.
+.Sh SEE ALSO
+.Xr rds 7 ,
+.Xr rds-info 1 ,
+.Xr rds-stress 1 .
diff --git a/rds-ping.c b/rds-ping.c
new file mode 100644
index 0000000..e9c88fc
--- /dev/null
+++ b/rds-ping.c
@@ -0,0 +1,385 @@
+/*
+ * rds-ping utility
+ *
+ * Test reachability of a remote RDS node by sending a packet to port 0.
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <errno.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <sys/poll.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include "net/rds.h"
+
+#ifdef DYNAMIC_PF_RDS
+#include "pfhack.h"
+#endif
+
+/*
+ * Print a printf-style message to stderr and exit with status 1.
+ * No newline is appended; callers include it in the format.
+ * Written with GNU named variadic macro parameters.
+ */
+#define die(fmt...) do { \
+ fprintf(stderr, fmt); \
+ exit(1); \
+} while (0)
+
+/*
+ * Like die(), but appends ", errno: <number> (<strerror text>)" and a
+ * newline to the message.  `fmt` must be a string literal because it is
+ * concatenated with the suffix.
+ */
+#define die_errno(fmt, args...) do { \
+ fprintf(stderr, fmt ", errno: %d (%s)\n", ##args , errno,\
+ strerror(errno)); \
+ exit(1); \
+} while (0)
+
+/* Interval between pings (-i).  Defaults to exactly one second. */
+static struct timeval opt_wait = { .tv_sec = 1, .tv_usec = 0 };
+/* Number of pings to send (-c); 0 means no limit. */
+static unsigned long opt_count;
+/* Source address (-I); when left at 0, rds_socket() picks one. */
+static struct in_addr opt_srcaddr;
+/* Destination address taken from the command line. */
+static struct in_addr opt_dstaddr;
+
+/* For reasons of simplicity, RDS ping does not use a packet
+ * payload that is being echoed, the way ICMP does.
+ * Instead, we open a number of sockets on different ports, and
+ * match packet sequence numbers with ports.
+ */
+#define NSOCKETS 8
+
+/* Per-socket bookkeeping for the ping most recently sent on it. */
+struct socket {
+ /* RDS socket descriptor returned by rds_socket() */
+ int fd;
+ /* sequence number of the last ping sent on this socket */
+ unsigned int sent_id;
+ /* time at which that ping was sent */
+ struct timeval sent_ts;
+ /* reports printed for that ping so far; non-zero triggers "DUP!" */
+ unsigned int nreplies;
+};
+
+
+static int do_ping(void);
+static void report_packet(struct socket *sp, const struct timeval *now,
+ const struct in_addr *from, int err);
+static void usage(const char *complaint);
+static int rds_socket(struct in_addr *src, struct in_addr *dst);
+static int parse_timeval(const char *, struct timeval *);
+static int parse_long(const char *ptr, unsigned long *);
+static int parse_addr(const char *ptr, struct in_addr *);
+
+/*
+ * rds-ping entry point: parse -c, -i and -I, require exactly one
+ * destination argument, then run do_ping() and return its result.
+ */
+int
+main(int argc, char **argv)
+{
+	int opt;
+
+	for (;;) {
+		opt = getopt(argc, argv, "c:i:I:");
+		if (opt == -1)
+			break;
+
+		if (opt == 'c') {
+			if (!parse_long(optarg, &opt_count))
+				die("Bad packet count <%s>\n", optarg);
+		} else if (opt == 'I') {
+			if (!parse_addr(optarg, &opt_srcaddr))
+				die("Unknown source address <%s>\n", optarg);
+		} else if (opt == 'i') {
+			if (!parse_timeval(optarg, &opt_wait))
+				die("Bad wait time <%s>\n", optarg);
+		} else {
+			usage("Unknown option");
+		}
+	}
+
+	/* Exactly one positional argument, the destination, must remain. */
+	if (argc != optind + 1)
+		usage("Missing destination address");
+
+	if (!parse_addr(argv[optind], &opt_dstaddr))
+		die("Cannot parse destination address <%s>\n", argv[optind]);
+
+	return do_ping();
+}
+
+/*
+ * returns a - b in usecs; the result is negative when a is earlier
+ * than b.
+ */
+static inline long
+usec_sub(const struct timeval *a, const struct timeval *b)
+{
+	long secs = (long)(a->tv_sec - b->tv_sec);
+	long usecs = (long)(a->tv_usec - b->tv_usec);
+
+	/*
+	 * Use a signed multiplier: an unsigned one would turn the whole
+	 * expression unsigned and make negative differences depend on
+	 * wrap-around.
+	 */
+	return secs * 1000000L + usecs;
+}
+
+/*
+ * Ping opt_dstaddr and report each reply.
+ *
+ * NSOCKETS sockets are used in rotation, one ping per opt_wait
+ * interval.  Replies are matched to pings by the socket they arrive on.
+ * The loop ends once opt_count replies have been received, or when the
+ * next send time arrives after opt_count pings have been sent.  With
+ * opt_count == 0 it runs until the process is killed.
+ *
+ * Returns 0 if at least one reply was received, 1 otherwise.
+ */
+static int
+do_ping(void)
+{
+	struct sockaddr_in sin;
+	unsigned int sent = 0, nrecv = 0;
+	struct timeval next_ts;
+	struct socket socks[NSOCKETS];
+	struct pollfd pfd[NSOCKETS];
+	int i, next = 0;
+
+	/*
+	 * Start from zeroed bookkeeping so that a reply arriving on a
+	 * socket that has not sent anything yet is reported with defined
+	 * values.
+	 */
+	memset(socks, 0, sizeof(socks));
+	memset(pfd, 0, sizeof(pfd));
+
+	for (i = 0; i < NSOCKETS; ++i) {
+		int fd;
+
+		fd = rds_socket(&opt_srcaddr, &opt_dstaddr);
+
+		socks[i].fd = fd;
+		pfd[i].fd = fd;
+		pfd[i].events = POLLIN;
+	}
+
+	/* sin_port stays 0: pings are addressed to port 0. */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_addr = opt_dstaddr;
+
+	gettimeofday(&next_ts, NULL);
+	while (1) {
+		struct timeval now;
+		long deadline;
+		int ret;
+
+		/* Fast way out - if we have received all packets, bail now.
+		 * If we're still waiting for some to come back, we need
+		 * to do the poll() below */
+		if (opt_count && nrecv >= opt_count)
+			break;
+
+		gettimeofday(&now, NULL);
+		if (timercmp(&now, &next_ts, >=)) {
+			struct socket *sp = &socks[next];
+			int err = 0;
+
+			if (opt_count && sent >= opt_count)
+				break;
+
+			timeradd(&next_ts, &opt_wait, &next_ts);
+			if (sendto(sp->fd, NULL, 0, 0,
+				   (struct sockaddr *) &sin, sizeof(sin)) < 0)
+				err = errno;
+			sp->sent_id = ++sent;
+			sp->sent_ts = now;
+			sp->nreplies = 0;
+			next = (next + 1) % NSOCKETS;
+
+			if (err) {
+				static unsigned int nerrs = 0;
+
+				report_packet(sp, NULL, NULL, err);
+				if (err == EINVAL && nerrs++ == 0)
+					printf(" Maybe your kernel does not support rds ping yet\n");
+			}
+		}
+
+		/*
+		 * If we are running behind schedule the next send time is
+		 * already in the past.  Clamp to zero, because a negative
+		 * timeout makes poll() wait indefinitely.
+		 */
+		deadline = usec_sub(&next_ts, &now);
+		if (deadline < 0)
+			deadline = 0;
+
+		ret = poll(pfd, NSOCKETS, deadline / 1000);
+		if (ret < 0) {
+			if (errno == EINTR)
+				continue;
+			die_errno("poll");
+		}
+		if (ret == 0)
+			continue;
+
+		for (i = 0; i < NSOCKETS; ++i) {
+			struct socket *sp = &socks[i];
+			struct sockaddr_in from;
+			/* value-result argument: reset before every call */
+			socklen_t alen = sizeof(from);
+
+			if (!(pfd[i].revents & POLLIN))
+				continue;
+
+			ret = recvfrom(sp->fd, NULL, 0, MSG_DONTWAIT,
+				       (struct sockaddr *) &from, &alen);
+			gettimeofday(&now, NULL);
+
+			if (ret < 0) {
+				if (errno != EAGAIN &&
+				    errno != EINTR)
+					report_packet(sp, &now, NULL, errno);
+			} else {
+				report_packet(sp, &now, &from.sin_addr, 0);
+				nrecv++;
+			}
+		}
+	}
+
+	/* Program exit code: signal success if we received any response. */
+	return nrecv == 0;
+}
+
+/*
+ * Print one result line for the ping last sent on `sp`.
+ *
+ * The line contains the sequence number, the round-trip time when `now`
+ * is given, the sender address when `from_addr` is given and differs
+ * from opt_dstaddr, "DUP!" when this ping was already reported, and the
+ * error text when `err` is non-zero.  The report counter of `sp` is
+ * incremented afterwards.
+ */
+static void
+report_packet(struct socket *sp, const struct timeval *now,
+	      const struct in_addr *from_addr, int err)
+{
+	int other_sender;
+
+	other_sender = from_addr != NULL &&
+		       from_addr->s_addr != opt_dstaddr.s_addr;
+
+	printf(" %3u:", sp->sent_id);
+
+	if (now != NULL)
+		printf(" %ld usec", usec_sub(now, &sp->sent_ts));
+
+	if (other_sender)
+		printf(" (%s)", inet_ntoa(*from_addr));
+
+	if (sp->nreplies != 0)
+		fputs(" DUP!", stdout);
+
+	if (err != 0)
+		printf(" ERROR: %s", strerror(err));
+
+	putchar('\n');
+
+	sp->nreplies += 1;
+}
+
+/*
+ * Create an RDS socket bound to address *src with port 0.
+ *
+ * When *src is 0 the local address is guessed first: a UDP socket is
+ * connected to *dst and the address reported by getsockname() is stored
+ * in *src.  Any failure ends the program through die_errno().
+ *
+ * Returns the bound socket descriptor.
+ */
+static int
+rds_socket(struct in_addr *src, struct in_addr *dst)
+{
+	struct sockaddr_in local;
+	int fd;
+
+	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
+	if (fd < 0)
+		die_errno("unable to create RDS socket");
+
+	/* Guess the local source addr if not given. */
+	if (src->s_addr == 0) {
+		struct sockaddr_in probe;
+		socklen_t probe_len = sizeof(probe);
+		int ufd = socket(PF_INET, SOCK_DGRAM, 0);
+
+		if (ufd < 0)
+			die_errno("unable to create UDP socket");
+
+		memset(&probe, 0, sizeof(probe));
+		probe.sin_family = AF_INET;
+		probe.sin_addr = *dst;
+		probe.sin_port = htons(1);
+		if (connect(ufd, (struct sockaddr *) &probe, sizeof(probe)) < 0)
+			die_errno("unable to connect to %s",
+				  inet_ntoa(*dst));
+
+		if (getsockname(ufd, (struct sockaddr *) &probe, &probe_len) < 0)
+			die_errno("getsockname failed");
+
+		*src = probe.sin_addr;
+		close(ufd);
+	}
+
+	memset(&local, 0, sizeof(local));
+	local.sin_family = AF_INET;
+	local.sin_addr = *src;
+	local.sin_port = 0;
+
+	if (bind(fd, (struct sockaddr *) &local, sizeof(local)))
+		die_errno("bind() failed");
+
+	return fd;
+}
+
+/*
+ * Print `complaint` followed by the option summary to stderr and exit
+ * with status 1.  The summary covers every option accepted by main()'s
+ * getopt() string ("c:i:I:").
+ */
+static void
+usage(const char *complaint)
+{
+	fprintf(stderr,
+		"%s\nUsage: rds-ping [options] dst_addr\n"
+		"Options:\n"
+		" -c count limit packet count\n"
+		" -i interval wait time between packets, in seconds\n"
+		" -I interface source IP address\n",
+		complaint);
+	exit(1);
+}
+
+/*
+ * Parse a time span into *ret.
+ *
+ * The input is a floating point number of seconds, optionally followed
+ * by "ms"/"msec" (milliseconds) or "us"/"usec" (microseconds).
+ *
+ * Returns 1 on success.  Returns 0, leaving *ret untouched, when the
+ * input has no number, has an unknown suffix, is negative or NaN, or is
+ * too large to convert to long.
+ */
+static int
+parse_timeval(const char *ptr, struct timeval *ret)
+{
+	double seconds;
+	char *endptr;
+
+	seconds = strtod(ptr, &endptr);
+	if (endptr == ptr)
+		return 0;	/* nothing was converted */
+
+	if (!strcmp(endptr, "ms") || !strcmp(endptr, "msec"))
+		seconds *= 1e-3;
+	else if (!strcmp(endptr, "us") || !strcmp(endptr, "usec"))
+		seconds *= 1e-6;
+	else if (*endptr)
+		return 0;
+
+	/*
+	 * !(x >= 0) also catches NaN.  (~0UL >> 1) is the largest long;
+	 * converting anything at or above it would overflow.
+	 */
+	if (!(seconds >= 0.0) || seconds >= (double)(~0UL >> 1))
+		return 0;
+
+	ret->tv_sec = (long) seconds;
+	seconds -= ret->tv_sec;
+
+	ret->tv_usec = (long) (seconds * 1e6);
+	return 1;
+}
+
+/*
+ * Parse an unsigned count into *ret.
+ *
+ * The number uses strtoull() base detection (decimal, 0x hex, leading-0
+ * octal) and may carry one suffix: k/K (x 2^10), m/M (x 2^20) or
+ * g/G (x 2^30).
+ *
+ * Returns 1 on success.  Returns 0, leaving *ret untouched, when the
+ * input has no digits, is negative, has trailing characters, or does
+ * not fit in an unsigned long after scaling.
+ */
+static int
+parse_long(const char *ptr, unsigned long *ret)
+{
+	unsigned long long val;
+	unsigned int shift = 0;
+	char *endptr;
+
+	errno = 0;
+	val = strtoull(ptr, &endptr, 0);
+
+	/* strtoull() silently negates "-N", so reject a minus sign here. */
+	if (endptr == ptr || errno == ERANGE || strchr(ptr, '-'))
+		return 0;
+
+	switch (*endptr) {
+	case 'k': case 'K':
+		shift = 10;
+		endptr++;
+		break;
+
+	case 'm': case 'M':
+		shift = 20;
+		endptr++;
+		break;
+
+	case 'g': case 'G':
+		shift = 30;
+		endptr++;
+		break;
+
+	default:
+		break;
+	}
+
+	if (*endptr)
+		return 0;
+
+	/* (unsigned long)-1 is the largest unsigned long. */
+	if (val > ((unsigned long)-1 >> shift))
+		return 0;
+
+	*ret = (unsigned long)(val << shift);
+	return 1;
+}
+
+/*
+ * Resolve `ptr` with gethostbyname() and store the first address in
+ * *ret.  Returns 1 on success, 0 when the lookup fails or the result is
+ * not an AF_INET address of the expected size.
+ */
+static int
+parse_addr(const char *ptr, struct in_addr *ret)
+{
+	struct hostent *host = gethostbyname(ptr);
+
+	if (host == NULL)
+		return 0;
+
+	if (host->h_addrtype != AF_INET || host->h_length != sizeof(*ret))
+		return 0;
+
+	memcpy(ret, host->h_addr, sizeof(*ret));
+	return 1;
+}
+
+/*
+ * These stubs are unfortunate; options.c should be removed.
+ * NOTE(review): presumably they exist only because options.c references
+ * print_usage() and print_version() -- confirm before deleting them.
+ */
+void print_usage(int durr) { }
+void print_version() { }
diff --git a/rds-rdma.7 b/rds-rdma.7
new file mode 100644
index 0000000..20b1575
--- /dev/null
+++ b/rds-rdma.7
@@ -0,0 +1,427 @@
+.TH "RDS zerocopy" 7
+.SH NAME
+RDS-rdma \- Zerocopy Interface for RDMA over RDS
+.SH DESCRIPTION
+This manual page describes the zerocopy interface of RDS, which
+was added in RDSv3. For a description of the basic RDS interface,
+please refer to
+.BR rds (7).
+.PP
+The principal mode of operation for RDS zerocopy is like this:
+one participant (the client) wishes to initiate a direct transfer
+to or from some area of memory in its process address space.
+This memory does not have to be aligned.
+.PP
+The client obtains a handle for this region of memory, and
+passes it to the other participant (the server). This is called
+the RDMA cookie. To the application, the cookie is an opaque 64bit
+data type.
+.PP
+The client sends this handle to
+the server application, along with other details of the RDMA
+request (such as which data to transfer to that memory area).
+Throughout the following discussion, we will refer to this
+message as the RDMA request.
+.PP
+The server uses this RDMA cookie to initiate the requested RDMA
+transfer. The RDMA transfer is combined atomically with a
+normal RDS message, which is delivered to the client. This
+message is called the RDMA ACK throughout the following. Atomic
+in this context means that either both the RDMA succeeds and the
+RDMA ACK is delivered, or neither succeeds.
+.PP
+Thus, when the client receives the RDMA ACK, it knows that
+the RDMA has completed successfully. It can then release the
+RDMA cookie for this memory region, if it wishes to.
+.PP
+RDMA operations are not reliable, in the sense that unlike normal
+RDS messages, RDS RDMA operations may fail, and get
+dropped.
+.\"-------------------------------
+.SH INTERFACE
+The interface is currently based on control messages (ancillary
+data) sent or received via the
+.BR sendmsg (2)
+and
+.BR recvmsg (2)
+system calls. Optionally, an older interface can be used that
+is based on the
+.BR setsockopt (2)
+system call. However, we recommend using control messages, as
+this reduces the number of system calls required.
+.\"-------------------------------
+.SS Control message interface
+With the control message interface, the RDMA cookie is passed to
+the server out-of-band, included in an extension header attached
+to the RDS message.
+.PP
+The following outlines the mode of operation; the data
+types used will be specified in detail in a subsequent section.
+.PP
+Initially, the client will send RDMA requests along with a
+.B RDS_CMSG_RDMA_MAP
+control message. The control message contains the address and
+length of the memory region for which to obtain a handle, some
+flags, and a pointer to a memory location (in the caller's address
+space) where the kernel will store the RDMA cookie.
+.PP
+Alternatively, if the application has already obtained a RDMA cookie
+for the memory range it wants to RDMA to/from, it can hand this
+cookie to the kernel using the
+.B RDS_CMSG_RDMA_DEST
+control message.
+.PP
+Either way, the kernel will include the resulting RDMA cookie
+in an extension header that is transmitted as part of the RDMA
+request to the server.
+.PP
+When the server receives the RDMA request, the kernel will deliver the
+cookie wrapped inside a
+.B RDS_CMSG_RDMA_DEST
+control message.
+.PP
+The server then initiates the data transfer by sending the RDMA ACK message
+along with a
+.B RDS_CMSG_RDMA_ARGS
+control message. This message contains the RDMA cookie, and the local
+memory to copy to or from.
+.PP
+The server process may request a notification when an RDMA operation
+completes. Notifications are delivered as
+.B RDS_CMSG_RDMA_STATUS
+control messages. When an application calls
+.BR recvmsg (2),
+it will either receive a regular RDS message (possibly with other RDMA
+related control messages), or an empty message with one or more
+status control messages.
+.PP
+In addition, when an RDMA operation fails for some reason and is
+discarded, the
+application can ask to receive notifications for failed messages as
+well, regardless of whether it asked for success notification of an
+individual message or not. This behavior is turned on by setting the
+.B RDS_RECVERR
+socket option.
+.\"-------------------------------
+.SS Setsockopt interface
+In addition to the control message interface, RDS allows a process
+to register and release memory ranges for RDMA through calls to
+.BR setsockopt (2).
+.TP
+.B RDS_GET_MR
+To obtain a RDMA cookie for a given memory range, the application can
+use
+.BR setsockopt " with " RDS_GET_MR .
+This operates essentially the same way as the
+.B RDS_CMSG_RDMA_MAP
+control message: the argument contains the address and length of the
+memory range to be registered, and a pointer to a RDMA cookie variable,
+in which the system call will store the cookie for the registered
+range.
+.TP
+.B RDS_FREE_MR
+Memory ranges can be released by calling
+.BR setsockopt " with " RDS_FREE_MR ,
+giving the RDMA cookie and additional flags as arguments.
+.TP
+.B RDS_RECVERR
+This is a boolean option which can be set as well as queried
+(using
+.BR getsockopt ).
+When enabled, RDS will send RDMA notification messages to
+the application for any RDMA operation that fails. This
+option defaults to off.
+.PP
+For all of these calls, the
+.B level
+argument to
+.B setsockopt
+is
+.BR SOL_RDS .
+.PP
+.\"-------------------------------
+.SH RDMA MACROS AND TYPES
+.fi
+.TP
+.B RDMA cookie
+.nf
+typedef u_int64_t rds_rdma_cookie_t
+.fi
+.IP
+This encapsulates a memory location in the client process. In the
+current implementation, it contains the R_Key of the remote memory
+region, and the offset into it (so that the application does not
+have to worry about alignment).
+.IP
+The RDMA cookie is used in several struct types described below.
+The
+.BR RDS_CMSG_RDMA_DEST
+control message contains a rds_rdma_cookie_t all by itself as payload.
+.TP
+.B Mapping arguments
+The following data type is used with
+.B RDS_CMSG_RDMA_MAP
+control messages and with the
+.B RDS_GET_MR
+socket option:
+.IP
+.nf
+struct rds_iovec {
+ u_int64_t addr;
+ u_int64_t bytes;
+};
+
+struct rds_get_mr_args {
+ struct rds_iovec vec;
+ u_int64_t cookie_addr;
+ uint64_t flags;
+};
+.fi
+.IP
+The
+.B cookie_addr
+specifies a memory location where to store the RDMA cookie.
+.IP
+The
+.B flags
+value is a bitwise OR of any of the following flags:
+.RS
+.TP
+.B RDS_RDMA_USE_ONCE
+This tells the kernel that the allocated RDMA cookie is to be used
+exactly once. When the RDMA ACK message arrives, the kernel will
+automatically unbind the memory area and release any resources
+associated with the cookie.
+.IP
+If this flag is not set, it is the application's responsibility to
+release the memory region at a later time using the
+.BR RDS_FREE_MR
+socket option.
+.TP
+.B RDS_RDMA_INVALIDATE
+Normally, RDMA memory mappings are invalidated lazily, as this
+requires some relatively costly synchronization with the HCA. However,
+this means that the server application can continue to access the
+registered memory for some indeterminate amount of time.
+If this flag is set, the RDS code will invalidate
+the mapping at the time it is released (either upon arrival of the
+RDMA ACK, if
+.B USE_ONCE
+was specified; or when the application destroys it using
+.BR FREE_MR ).
+.RE
+.TP
+.B RDMA Operation
+RDMA operations are initiated by the server using the
+.BR RDS_CMSG_RDMA_ARGS
+control message, which takes the following data as payload:
+.IP
+.nf
+struct rds_rdma_args {
+ rds_rdma_cookie_t cookie;
+ struct rds_iovec remote_vec;
+ u_int64_t local_vec_addr;
+ u_int64_t nr_local;
+ u_int64_t flags;
+ u_int32_t user_token;
+};
+.fi
+.IP
+The
+.B cookie
+argument contains the RDMA cookie received from the client.
+The local memory is given via an array of
+.BR rds_iovec s.
+The array address is given in
+.BR local_vec_addr ,
+and its number of elements is given in
+.BR nr_local .
+.IP
+The struct member
+.B remote_vec
+specifies a location relative to the memory area identified
+by the cookie:
+.BR remote_vec . addr
+is an offset into that region, and
+.BR remote_vec . bytes
+is the length of the memory window to copy to/from.
+This length must match the size of the local memory area,
+i.e. the sum of bytes in all members of the local iovec.
+.IP
+The flags field contains the bitwise OR of any of the following
+flags:
+.RS
+.TP
+.B RDS_RDMA_READWRITE
+If set, an RDMA WRITE is initiated from the server's memory
+to the client's. If not set, RDS will do a RDMA READ from the
+client's memory to the server's memory.
+.TP
+.B RDS_RDMA_FENCE
+By default, Infiniband makes no guarantee about the ordering of
+an RDMA READ with respect to subsequent SEND operations. Setting
+this flag asks that the RDMA READ should be fenced off the subsequent
+RDS ACK message. Setting this flag requires an additional round-trip
+of the IB fabric, but it is a good idea to set this flag
+by default, unless you are really sure you do not want it.
+.TP
+.B RDS_RDMA_NOTIFY_ME
+This flag requests a notification upon completion of the RDMA
+operation (successful or otherwise). The notification will contain
+the value of the
+.B user_token
+field passed in by the application. This allows the application to
+release resources (such as buffers) associated with the RDMA transfer.
+.RE
+.IP
+The
+.B user_token
+can be used to pass an application specific identifier to the
+kernel. This token is returned to the application when a status
+notification is generated (see the following section).
+.TP
+.B RDMA Notification
+The RDS kernel code is able to notify the server application when
+an RDMA operation completes. These notifications are delivered
+via
+.B RDS_CMSG_RDMA_STATUS
+control messages.
+.IP
+By default, no notifications are generated. There are two ways an
+application can request them. On one hand, status notifications can
+be enabled on a per-operation basis by setting the
+.B RDS_RDMA_NOTIFY_ME
+flag in the RDMA arguments. On the other hand, the application can
+request notifications for all RDMA operations that fail by setting
+the
+.B RDS_RECVERR
+socket option (see above).
+In both cases, the format of the notification is the same; and at
+most one notification will be sent per completed operation.
+.IP
+The message format is this:
+.IP
+.nf
+struct rds_rdma_notify {
+ u_int32_t user_token;
+ int32_t status;
+};
+.fi
+.IP
+The
+.B user_token
+field contains the value previously given to the kernel in the
+.BR RDS_CMSG_RDMA_ARGS
+control message. The
+.BR status
+field contains a status value, with 0 indicating success, and
+non-zero indicating an error.
+.IP
+The following status codes are currently defined:
+.RS
+.TP
+.B RDS_RDMA_SUCCESS
+The RDMA operation succeeded.
+.TP
+.B RDS_RDMA_REMOTE_ERROR
+The RDMA operation failed due to a remote access error. This is
+usually due to an invalid R_key, offset or transfer size.
+.TP
+.B RDS_RDMA_CANCELED
+The RDMA operation was canceled by the application.
+(This error code is not yet generated).
+.TP
+.B RDS_RDMA_DROPPED
+RDMA operations were discarded after the connection broke and
+was re-established. The RDMA operation may have been processed
+partially.
+.TP
+.B RDS_RDMA_OTHER_ERROR
+Any other failure.
+.RE
+.TP
+.B RDMA setsockopt arguments
+When using the
+.B RDS_GET_MR
+socket option to register a memory range, the application passes
+a pointer to a
+.B struct rds_get_mr_args
+variable, described above.
+.IP
+The
+.B RDS_FREE_MR
+call takes an argument of type
+.BR "struct rds_free_mr_args" :
+.IP
+.nf
+struct rds_free_mr_args {
+ rds_rdma_cookie_t cookie;
+ u_int64_t flags;
+};
+.fi
+.IP
+.B cookie
+specifies the RDMA cookie to be released. RDMA access to the memory
+range will usually not be revoked instantly, because the operation is
+rather costly. However, if the
+.B flags
+argument contains
+.BR RDS_RDMA_INVALIDATE ,
+RDS will invalidate the indicated mapping immediately,
+as described in section
+.B "Mapping arguments"
+above.
+.IP
+If the
+.B cookie
+argument is 0, and
+.BR RDS_RDMA_INVALIDATE
+is set, RDS will invalidate old memory mappings on all devices.
+.\"-------------------------------
+.SH ERRORS
+In addition to the usual error codes returned by
+.BR sendmsg ", " recvmsg " and " setsockopt ,
+RDS returns the following error codes:
+.TP
+.BR EAGAIN
+RDS was unable to map a memory range because the limit was
+exceeded (returned by
+.BR RDS_CMSG_RDMA_MAP " and " RDS_GET_MR ).
+.TP
+.BR EINVAL
+When sending a message, there were conflicting control messages
+(e.g. two
+.B RDMA_MAP
+messages, or a
+.BR RDMA_MAP " and a " RDMA_DEST
+message).
+.IP
+In a
+.BR RDS_CMSG_RDMA_MAP " or " RDS_GET_MR
+operation, the application specified a memory range greater than the
+maximum size supported.
+.IP
+When setting up an RDMA operation with
+.BR RDS_CMSG_RDMA_ARGS ,
+the size of the local memory (given in the
+.BR rds_iovec )
+did not match the size of the remote memory range.
+.TP
+.B EBUSY
+RDS was unable to obtain a DMA mapping for the indicated memory.
+.\"-------------------------------
+.SH LIMITS
+Currently, the following limits apply
+.IP \(bu
+The maximum size of a zerocopy transfer is 1MB. This can be
+adjusted via the
+.B fmr_message_size
+module parameter.
+.IP \(bu
+The maximum number of memory ranges that can be mapped is
+limited to 2048 at the moment. This can be adjusted via the
+.B fmr_pool_size
+module parameter. However, the actual limit imposed by the
+hardware may in fact be lower.
+.SH AUTHORS
+RDS was written and is Copyright (C) 2007-2008 by Oracle, Inc.
diff --git a/rds-sink.1 b/rds-sink.1
new file mode 100644
index 0000000..05c9d73
--- /dev/null
+++ b/rds-sink.1
@@ -0,0 +1 @@
+.so man1/rds-gen.1
diff --git a/rds-sink.c b/rds-sink.c
new file mode 100644
index 0000000..2d47ade
--- /dev/null
+++ b/rds-sink.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * rds-sink.c: Collect some RDS packets.
+ */
+#define _LARGEFILE64_SOURCE
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <errno.h>
+#include <inttypes.h>
+
+#include "kernel-list.h"
+#include "rdstool.h"
+
+/*
+ * Print the rds-sink command line synopsis and exit with status rc.
+ * The text goes to stdout when rc is 0 and to stderr otherwise.
+ * Continuation lines are padded by the length of the program name so
+ * that they line up under the first option. Does not return.
+ */
+void print_usage(int rc)
+{
+	FILE *stream = stdout;
+	int pad = strlen(progname);
+
+	if (rc)
+		stream = stderr;
+
+	verbosef(0, stream,
+		 "Usage: %s -s <source_ip>:<source_port>\n"
+		 " %*s [-f <output_file>] [-i <interval>]\n"
+		 " %*s [-v ...] [-q ...]\n"
+		 " %s -h\n"
+		 " %s -V\n",
+		 progname, pad, "", pad, "", progname, progname);
+
+	exit(rc);
+}
+
+/*
+ * Print the program name and version line on stdout, then exit with
+ * status 0. Does not return.
+ * NOTE(review): "VERSION" is part of the string literal, so the word
+ * itself is printed unless something substitutes it at build time -
+ * confirm that this is intended.
+ */
+void print_version()
+{
+ verbosef(0, stdout, "%s version VERSION\n", progname);
+
+ exit(0);
+}
+
+/*
+ * Write len bytes starting at bytes to standard output.
+ *
+ * Short writes are continued and writes interrupted by a signal are
+ * retried, for as long as runningp() holds. When ctxt->rc_filename is
+ * NULL nothing is written and the data is discarded.
+ *
+ * Returns 0 when the loop finished without error, -EPIPE when write()
+ * made no progress, or the negated errno of a failed write().
+ */
+static int empty_buff(struct rds_context *ctxt, char *bytes, ssize_t len)
+{
+	char *cursor = bytes;
+	ssize_t remaining = ctxt->rc_filename ? len : 0;
+	int ret = 0;
+
+	while (remaining && runningp()) {
+		stats_print();
+
+		ret = write(STDOUT_FILENO, cursor, remaining);
+		if (ret > 0) {
+			/* Progress: account for it and go on with the rest. */
+			stats_add_write(ret);
+			cursor += ret;
+			remaining -= ret;
+			ret = 0;
+			continue;
+		}
+
+		if (ret == 0) {
+			verbosef(0, stderr,
+				 "%s: Unexpected end of file writing to %s\n",
+				 progname, ctxt->rc_filename);
+			ret = -EPIPE;
+			break;
+		}
+
+		ret = -errno;
+		if (ret == -EINTR)
+			continue;
+
+		verbosef(0, stderr,
+			 "%s: Error writing to %s: %s\n",
+			 progname, ctxt->rc_filename,
+			 strerror(-ret));
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Receive one message on the endpoint's socket with recvmsg().
+ *
+ * A call interrupted by a signal is restarted while runningp() holds.
+ * Returns the value of recvmsg() on success or a negated errno on
+ * failure; failures other than EINTR are logged. If runningp() is
+ * already false on entry, 0 is returned without receiving anything.
+ */
+static ssize_t recv_buff(struct rds_endpoint *e, struct msghdr *msg,
+			 int flags)
+{
+	ssize_t nbytes = 0;
+
+	while (runningp()) {
+		stats_print();
+
+		nbytes = recvmsg(e->re_fd, msg, flags);
+		if (nbytes >= 0)
+			break;
+
+		nbytes = -errno;
+		if (nbytes != -EINTR) {
+			verbosef(0, stderr,
+				 "%s: Error from recvmsg: %s\n",
+				 progname, strerror(-nbytes));
+			break;
+		}
+	}
+
+	return nbytes;
+}
+
+/*
+ * Receive loop of rds-sink.
+ *
+ * For every message the size is first probed with a zero-length
+ * MSG_PEEK|MSG_TRUNC receive, the receive buffer is grown if the
+ * message does not fit, the message is then read for real and handed
+ * to empty_buff(). The loop runs until runningp() turns false or one
+ * of the steps fails.
+ *
+ * Returns 0 on a clean stop or a negative errno value on failure.
+ * The receive buffer is owned by this function and released before
+ * it returns.
+ */
+static int wli_do_recv(struct rds_context *ctxt)
+{
+	struct rds_endpoint *e = ctxt->rc_saddr;
+	ssize_t alloced = 0;
+	ssize_t ret = 0;
+	struct iovec iov = {
+		.iov_base = NULL,
+	};
+	struct msghdr msg = {
+		.msg_name = &e->re_addr,
+		.msg_namelen = sizeof(struct sockaddr_in),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+
+	verbosef(2, stderr, "Starting receive loop\n");
+
+	stats_start();
+
+	while (runningp()) {
+		/* Calls stats_print() */
+		iov.iov_len = 0;
+		ret = recv_buff(e, &msg, MSG_PEEK|MSG_TRUNC);
+		if (ret < 0)
+			break;
+
+		if (ret > alloced) {
+			void *grown;
+
+			verbosef(3, stderr,
+				 "Growing buffer to %zd bytes\n",
+				 ret);
+			/*
+			 * Keep the old pointer until realloc() succeeds:
+			 * on failure the old block is still allocated and
+			 * must stay reachable so it can be freed below.
+			 */
+			grown = realloc(iov.iov_base, ret);
+			if (grown == NULL) {
+				ret = -ENOMEM;
+				break;
+			}
+			iov.iov_base = grown;
+			alloced = ret;
+		}
+
+		/* Calls stats_print() */
+		iov.iov_len = ret;
+		ret = recv_buff(e, &msg, 0);
+		if (ret < 0)
+			break;
+
+		stats_add_recv(ret);
+
+		/* Calls stats_print() */
+		ret = empty_buff(ctxt, iov.iov_base, ret);
+		if (ret)
+			break;
+	}
+	verbosef(2, stderr, "Stopping receive loop\n");
+
+	stats_total();
+
+	/* The buffer is not used after the loop; do not leak it. */
+	free(iov.iov_base);
+
+	return ret;
+}
+
+/*
+ * rds-sink entry point.
+ *
+ * Parses the command line, binds the source endpoint, sets up the
+ * output via dup_file() when an output file name is configured,
+ * installs the signal handlers and then runs the receive loop.
+ *
+ * Returns 0 on success, otherwise the non-zero status of the step
+ * that failed. A command line error ends in print_usage(), which
+ * exits the process itself.
+ */
+int main(int argc, char *argv[])
+{
+	int rc;
+	char ipbuf[INET_ADDRSTRLEN];
+	struct rds_context ctxt = {
+		.rc_filename = "-",
+	};
+
+
+	INIT_LIST_HEAD(&ctxt.rc_daddrs);
+
+	rc = parse_options(argc, argv, RDS_TOOL_BASE_OPTS RDS_SINK_OPTS,
+			   &ctxt);
+	if (rc)
+		print_usage(rc);
+
+	inet_ntop(PF_INET, &ctxt.rc_saddr->re_addr.sin_addr, ipbuf,
+		  INET_ADDRSTRLEN);
+	verbosef(2, stderr, "Binding endpoint %s:%d\n",
+		 ipbuf, ntohs(ctxt.rc_saddr->re_addr.sin_port));
+
+	rc = rds_bind(&ctxt);
+	if (rc)
+		goto out;
+
+	if (ctxt.rc_filename) {
+		rc = dup_file(&ctxt, STDOUT_FILENO, O_CREAT|O_WRONLY);
+		if (rc)
+			goto out;
+		/* Use a readable name for stdout in later messages. */
+		if (!strcmp(ctxt.rc_filename, "-"))
+			ctxt.rc_filename = "<standard output>";
+	}
+
+	/*
+	 * Keep the status of setup_signals(): without the assignment
+	 * rc is always 0 here and the check below can never fire.
+	 */
+	rc = setup_signals();
+	if (rc) {
+		verbosef(0, stderr, "%s: Unable to initialize signals\n",
+			 progname);
+		goto out;
+	}
+
+	rc = wli_do_recv(&ctxt);
+
+out:
+	free(ctxt.rc_saddr->re_name);
+	free(ctxt.rc_saddr);
+
+	return rc;
+}
diff --git a/rds-stress.1 b/rds-stress.1
new file mode 100644
index 0000000..ec99887
--- /dev/null
+++ b/rds-stress.1
@@ -0,0 +1,174 @@
+.Dd May 15, 2007
+.Dt RDS-STRESS 1
+.Os
+.Sh NAME
+.Nm rds-stress
+.Nd send messages between processes over RDS sockets
+.Pp
+.Sh SYNOPSIS
+.Nm rds-stress
+.Bk -words
+.Op Fl p Ar port_number
+.Op Fl r Ar receive_address
+.Op Fl s Ar send_address
+.Op Fl a Ar ack_bytes
+.Op Fl q Ar request_bytes
+.Op Fl D Ar rdma_bytes
+.Op Fl d Ar queue_depth
+.Op Fl t Ar nr_tasks
+.Op Fl c
+.Op Fl R
+.Op Fl V
+.Op Fl v
+
+.Sh DESCRIPTION
+.Nm rds-stress
+sends messages between groups of tasks, usually running on separate
+machines.
+.Pp
+First a passive receiving instance is started.
+.Pp
+.Dl $ rds-stress
+.Pp
+Then an active sending instance is started, giving it
+the address and port at which it will find a listening
+passive receiver. In addition, it is given configuration options which
+both instances will use.
+.Pp
+.Dl $ rds-stress -s recvhost -p 4000 -t 1 -d 1
+.Pp
+The active sender will parse the options, connect to the passive receiver, and
+send the options over this connection. From this point on both instances
+exhibit the exact same behaviour.
+.Pp
+They will create a number of child tasks as specified by the -t option. Once
+the children are created the parent sleeps for a second at a time, printing a
+summary of statistics at each interval.
+.Pp
+Each child will open an RDS socket, each binding to a port number in order
+after the port number given on the command line. The first child would bind to
+port 4001 in our example. Each child sets the send and receive buffers to
+exactly fit the number of messages, requests and acks, that will be in flight
+as determined by the command line arguments.
+.Pp
+The children then enter their loop. They will keep a number of sent messages
+outstanding as specified by the -d option. When they reach this limit they
+will wait to receive acks which will allow them to send again. As they receive
+messages from their peers they immediately send acks.
+.Pp
+Every second, the parent process will display statistics of the ongoing
+stress test. The output is described in section OUTPUT below.
+.Pp
+If the -T option is given, the test will terminate after the specified time,
+and a summary is printed.
+.Pp
+Each child maintains outstanding messages to all other children of the other instance.
+They do not send to their siblings.
+.Sh OPTIONS
+The following options are available for use on the command line:
+.Bl -tag -width Ds
+.It Fl p Ar port_number
+Each parent binds a TCP socket to this port number and their respective
+address. They will trade the negotiated options over this socket. Each
+child will bind an RDS socket to the range of ports immediately following
+this port number, for as many children as there are.
+.It Fl s Ar send_address
+A connection attempt is made to this address. Once it's complete and the
+options are sent over it then children will be created and work will proceed.
+.It Fl r Ar receive_address
+This specifies the address that messages will be sent from. If -s is not
+specified then rds-stress waits for a connection on this address before
+proceeding.
+.Pp
+If this option is not given, rds-stress will choose an appropriate address.
+The passive process will accept connections on all local interfaces, and
+obtain the address once the control connection is established.
+The active process will choose a local address based on the interface through
+which it connects to the destination address.
+.It Fl a Ar ack_bytes
+This specifies the size of the ack messages, in bytes. There is a minimum size
+which depends on the format of the ack messages, which may change over time.
+See section "Message Sizes" below.
+.It Fl q Ar request_bytes
+This specifies the size of the request messages, in bytes.
+It also has a minimum size which may change over time.
+See section "Message Sizes" below.
+.It Fl D Ar rdma_bytes
+RDSv3 is capable of transmitting part of a message via RDMA directly from
+application buffer to application buffer. This option enables RDMA support
+in rds-stress: request packets include parameters for an RDMA READ or WRITE
+operation, which the receiving process executes at the time the ACK packet
+is sent.
+See section "Message Sizes" below.
+.It Fl d Ar queue_depth
+Each child will try to maintain this many sent messages outstanding to each
+of its peers on the remote address.
+.It Fl t Ar nr_tasks
+Each parent will create this many children tasks.
+.It Fl T Ar seconds
+Specify the duration of the test run. After the specified number of seconds,
+all processes on both ends of the connection will terminate, and the
+active instance will print a summary. By default, rds-stress will keep
+on sending and receiving messages.
+.It Fl z
+This flag can be used in conjunction with -T. It suppresses the ongoing
+display of statistics, and prints a summary only.
+.It Fl c
+This causes rds-stress to create child tasks which just consume CPU cycles.
+One task is created for each CPU in the system. First each child observes the
+maximum rate at which it can consume cycles. This means that this option
+should only be given on an idle system. rds-stress can then calculate the CPU
+use of the system by observing the lesser rate at which the children consume
+cycles. This option is *not* shared between the active and passive instances.
+It must be specified on each rds-stress command line.
+.It Fl R
+This tells the rds-stress parent process to run with SCHED_RR priority,
+giving it precedence over the child processes. This is useful when running
+with lots of tasks, where there is a risk of the child processes starving
+the parent, and skewing the results.
+.It Fl v
+With this option enabled, packets are filled with a pattern that is
+verified by the receiver. This check can help detect data corruption
+occurring under high load.
+.El
+.Pp
+
+.Ss Message Sizes
+Options which set a message size (such as -a) specify a number of bytes
+by default. By appending \fBK\fP, \fBM\fP, or \fBG\fP, you can specify the size
+in kilobytes, megabytes or gigabytes, respectively. For instance,
+the following will run rds-stress with a message and ACK size of 1024
+bytes, and an RDMA message size of 1048576 bytes:
+.Pp
+.Dl rds-stress ... -q 1K -a 1K -D 1M
+.Pp
+.Pp
+.Sh OUTPUT
+Each parent outputs columns of statistics at a regular interval:
+.Bl -tag -width Ds
+.It tsks
+The number of child tasks which are running.
+.It tx/s
+The number of sendmsg() calls that all children are executing, per second.
+.It tx+rx K/s
+The total number of bytes that are flowing through sendmsg() and recvmsg() for all children.
+This includes both request and ack messages.
+.It rw+rr K/s
+The total number of bytes that are being transferred via RDMA READs and
+WRITEs for all children.
+.It tx us/c
+The average number of microseconds spent in sendmsg() calls.
+.It rtt us
+The average round trip time for a request and ack message pair. This measures
+the total time between when a task sends a request and when it finally receives
+the ack for that message. Because it includes the time it takes for the
+receiver to wake up, receive the message, and send an ack, it can grow to be
+quite large under load.
+.It cpu %
+This is the percentage of available CPU resources on this machine that are being
+consumed since rds-stress started running. It will show -1.00 if -c is not
+given. It is calculated based on the amount of CPU resources that CPU soaking
+tasks are able to consume. This lets it measure CPU use by the system, say in
+interrupt handlers, that task-based CPU accounting does not include.
+For this to work rds-stress must be started with -c on an idle system.
+.El
diff --git a/rds-stress.c b/rds-stress.c
new file mode 100644
index 0000000..dfe68c5
--- /dev/null
+++ b/rds-stress.c
@@ -0,0 +1,2715 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <errno.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/uio.h>
+#include <netdb.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <time.h>
+#include <inttypes.h>
+#include <syscall.h>
+#include <sys/stat.h>
+#include <sys/poll.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <getopt.h>
+#include <byteswap.h>
+#include "net/rds.h"
+
+#ifdef DYNAMIC_PF_RDS
+#include "pfhack.h"
+#endif
+
+/*
+ *
+ * TODO
+ * - checksum the data some day.
+ * - use poll to wait instead of blocking recvmsg? doesn't seem great.
+ * - measure us/call of nonblocking recvmsg
+ * - do something about receiver congestion
+ * - notice when parent tcp socket dies
+ * - should the parent be at a higher priority?
+ * - catch ctl-c
+ * - final stats summary page
+ */
+
+/*
+ * Run-time settings of one rds-stress invocation.  The struct is packed,
+ * so its members are laid out back to back without padding; do not
+ * reorder or resize members without checking every user of the layout.
+ * NOTE(review): the packing suggests the struct is exchanged as raw bytes
+ * between the two peers - confirm against the control connection code.
+ */
+struct options {
+ uint32_t req_depth; /* max requests a task keeps pending (see send_anything) */
+ uint32_t req_size; /* bytes per request message (see send_one) */
+ uint32_t ack_size; /* bytes per ack message (see send_ack) */
+ uint32_t rdma_size; /* bytes per RDMA transfer; 0 means no RDMA (see send_one) */
+ uint32_t send_addr;
+ uint32_t receive_addr; /* host byte order (check_rdma_support applies htonl) */
+ uint16_t starting_port; /* host byte order (check_rdma_support applies htons) */
+ uint16_t nr_tasks;
+ uint32_t run_time;
+ uint8_t summary_only;
+ uint8_t rtprio;
+ uint8_t tracing; /* non-zero enables the trace() macro */
+ uint8_t verify; /* non-zero fills and checks message/RDMA payload patterns */
+ uint8_t show_params;
+ uint8_t show_perfdata;
+ uint8_t use_cong_monitor; /* request RDS_CONG_MONITOR in rds_socket() */
+ uint8_t rdma_use_once; /* add RDS_RDMA_USE_ONCE when mapping MRs */
+ uint8_t rdma_use_get_mr; /* obtain keys via RDS_GET_MR in rdma_build_req() */
+ uint8_t rdma_use_fence; /* add RDS_RDMA_FENCE to RDMA transfers */
+ uint8_t rdma_cache_mrs; /* keep keys across requests instead of freeing them */
+ uint8_t rdma_key_o_meter;
+ uint8_t suppress_warnings; /* silence the socket buffer size warnings */
+
+ uint32_t rdma_alignment; /* byte offset added to the RDMA buffer base */
+ uint32_t connect_retries;
+} __attribute__((packed));
+
+/* Settings in effect for this process; consulted by trace() and the send/verify helpers. */
+static struct options opt;
+/* NOTE(review): presumably the descriptor of the control connection; not used in this part of the file. */
+static int control_fd;
+
+/*
+ * Running statistics for one measured quantity; updated by stat_inc().
+ * A min of zero means "no sample yet" (see minz()).
+ */
+struct counter {
+ uint64_t nr; /* number of samples */
+ uint64_t sum; /* sum of all samples */
+ uint64_t min; /* smallest sample seen, 0 if none */
+ uint64_t max; /* largest sample seen */
+};
+
+/*
+ * Indices into the cur[] and last[] counter arrays of struct child_control.
+ * S__LAST is not a statistic; it only provides the array size.
+ */
+enum {
+ S_REQ_TX_BYTES = 0,
+ S_REQ_RX_BYTES,
+ S_ACK_TX_BYTES,
+ S_ACK_RX_BYTES,
+ S_RDMA_WRITE_BYTES,
+ S_RDMA_READ_BYTES,
+ S_SENDMSG_USECS,
+ S_RTT_USECS,
+ S__LAST
+};
+
+/* Number of statistics tracked per child. */
+#define NR_STATS S__LAST
+
+/*
+ * Parents share a mapped array of these with their children. Each child
+ * gets one. It's used to communicate between the child and the parent
+ * simply.
+ *
+ * cur[] holds the counters that the send/receive helpers update through
+ * stat_inc().
+ * NOTE(review): last[] presumably holds the previous snapshot used by the
+ * parent to compute per-interval deltas - confirm in the reporting code.
+ */
+struct child_control {
+ pid_t pid;
+ int ready;
+ struct timeval start;
+ struct counter cur[NR_STATS];
+ struct counter last[NR_STATS];
+} __attribute__((aligned (256))); /* arbitrary */
+
+/*
+ * Per-process bookkeeping for a CPU soak task.
+ * NOTE(review): presumably one entry per soaker started for the -c option
+ * described in usage(); the code using these fields is outside this part
+ * of the file.
+ */
+struct soak_control {
+ pid_t pid;
+ uint64_t per_sec;
+ uint64_t counter;
+ uint64_t last;
+ struct timeval start;
+} __attribute__((aligned (256))); /* arbitrary */
+
+/* Forward declaration; the definition is further down in this file. */
+void stop_soakers(struct soak_control *soak_arr);
+
+/*
+ * Requests tend to be larger and we try to keep a certain number of them
+ * in flight at a time. Acks are sent in response to requests and tend
+ * to be smaller.
+ *
+ * These are the values of the 'op' member of struct header.
+ */
+#define OP_REQ 1
+#define OP_ACK 2
+
+/* Values of the 'rdma_op' member of struct header; 0 means "no RDMA". */
+#define RDMA_OP_READ 1
+#define RDMA_OP_WRITE 2
+#define RDMA_OP_TOGGLE(x) (3 - (x)) /* read becomes write and vice versa */
+
+/*
+ * Every message sent with sendmsg gets a header. This lets the receiver
+ * verify that it got what was sent.
+ *
+ * The struct is packed and is copied to/from the wire by encode_hdr()
+ * and decode_hdr(); address and port members always stay in network
+ * byte order, everything else is converted.
+ */
+struct header {
+ uint32_t seq;
+ uint32_t from_addr;
+ uint32_t to_addr;
+ uint16_t from_port;
+ uint16_t to_port;
+ uint16_t index;
+ uint8_t op; /* OP_REQ or OP_ACK */
+
+ /* RDMA related.
+ * rdma_op must stay the first of the RDMA members: BASIC_HEADER_SIZE
+ * is defined as its offset, and check_hdr compares only the bytes
+ * that precede it.
+ */
+ uint8_t rdma_op; /* 0, RDMA_OP_READ or RDMA_OP_WRITE */
+ uint64_t rdma_addr;
+ uint64_t rdma_phyaddr;
+ uint64_t rdma_pattern;
+ uint64_t rdma_key;
+ uint32_t rdma_size;
+
+ uint8_t data[0]; /* payload follows the header */
+} __attribute__((packed));
+
+/* Smallest legal message: a bare header without payload. */
+#define MIN_MSG_BYTES (sizeof(struct header))
+/*
+ * Size of the non-RDMA part of the header, i.e. everything in front of
+ * rdma_op.  offsetof() replaces the former null-pointer dereference
+ * trick, which is undefined behaviour in ISO C.
+ */
+#define BASIC_HEADER_SIZE offsetof(struct header, rdma_op)
+
+/*
+ * Print a printf-style message to stderr and terminate with exit status 1.
+ * The caller supplies the trailing newline.
+ */
+#define die(fmt...) do { \
+ fprintf(stderr, fmt); \
+ exit(1); \
+} while (0)
+
+/*
+ * Like die(), but appends the current errno value, its strerror() text and
+ * a newline.  fmt must be a string literal because it is concatenated
+ * with the suffix at compile time.
+ */
+#define die_errno(fmt, args...) do { \
+ fprintf(stderr, fmt ", errno: %d (%s)\n", ##args , errno,\
+ strerror(errno)); \
+ exit(1); \
+} while (0)
+
+/* Number of memory regions currently held; adjusted by get_rdma_key() and free_rdma_key(). */
+static int mrs_allocated = 0;
+
+/* printf-style debug output to stderr, emitted only when opt.tracing is set. */
+#define trace(fmt...) do { \
+ if (opt.tracing) \
+ fprintf(stderr, fmt); \
+} while (0)
+
+/*
+ * Smaller / larger of two values.  Every argument and the whole expansion
+ * are parenthesized so that expressions such as min(x | y, z) group as
+ * intended.  Arguments are still evaluated twice; do not pass expressions
+ * with side effects.
+ */
+#define min(a,b) ((a) < (b) ? (a) : (b))
+#define max(a,b) ((a) > (b) ? (a) : (b))
+
+/* NOTE(review): presumably the system page size, set during start-up; the assignment is outside this part of the file. */
+static unsigned long sys_page_size;
+
+/* This macro converts a pointer to an integer that can be stored in
+ a uint64_t without producing warnings on either 32bit or 64bit
+ platforms. The cast itself goes to unsigned long (pointer sized
+ on the supported platforms); the widening to uint64_t happens
+ implicitly on assignment. At least with gcc, that is.
+ */
+#define ptr64(p) ((unsigned long) (p))
+
+/*
+ * Minimum of two values where zero stands for "undefined": if either
+ * argument is zero the other one is returned unchanged.
+ */
+static inline uint64_t minz(uint64_t a, uint64_t b)
+{
+	if (a != 0 && b != 0)
+		return min(a, b);
+
+	return a == 0 ? b : a;
+}
+
+/*
+ * Parse an unsigned number (any base strtoull() accepts with base 0)
+ * followed by an optional K, M or G suffix (case insensitive) that
+ * multiplies it by 2^10, 2^20 or 2^30.
+ *
+ * ptr: the string to parse; it must contain nothing but the number
+ *      and the optional suffix.
+ * max: largest value the caller accepts.
+ *
+ * Returns the parsed value.  Terminates the program through die() if the
+ * string has no digits, has trailing garbage, overflows an unsigned
+ * long long (with or without the suffix applied) or exceeds max.
+ */
+static unsigned long long parse_ull(char *ptr, unsigned long long max)
+{
+	unsigned long long val;
+	unsigned int shift = 0;
+	char *endptr;
+
+	errno = 0;
+	val = strtoull(ptr, &endptr, 0);
+	/* No digits consumed (e.g. "" or a bare "K") or out of range. */
+	if (endptr == ptr || errno == ERANGE)
+		die("invalid number '%s'\n", ptr);
+
+	switch (*endptr) {
+	case 'k': case 'K':
+		shift = 10;
+		break;
+
+	case 'm': case 'M':
+		shift = 20;
+		break;
+
+	case 'g': case 'G':
+		shift = 30;
+		break;
+	}
+
+	if (shift) {
+		/* Reject values whose top bits would be shifted out. */
+		if (val > (~0ULL >> shift))
+			die("invalid number '%s'\n", ptr);
+		val <<= shift;
+		endptr++;
+	}
+
+	if (!*endptr && val <= max)
+		return val;
+
+	die("invalid number '%s'\n", ptr);
+}
+
+/*
+ * Resolve a host name or dotted quad to an IPv4 address.
+ *
+ * Returns the address in host byte order.  Terminates the program through
+ * die() if the name cannot be resolved to an IPv4 address.
+ */
+static uint32_t parse_addr(char *ptr)
+{
+	struct hostent *host = gethostbyname(ptr);
+	uint32_t net_addr;
+
+	if (!host ||
+	    host->h_addrtype != AF_INET ||
+	    host->h_length != sizeof(net_addr))
+		die("invalid host name or dotted quad '%s'\n", ptr);
+
+	memcpy(&net_addr, host->h_addr, sizeof(net_addr));
+	return ntohl(net_addr);
+}
+
+/*
+ * Print the command line help to stdout and exit with status 2.
+ * The only format argument is the default ack length, MIN_MSG_BYTES; it
+ * is cast to unsigned int so that it matches the %u conversion.
+ */
+static void usage(void)
+{
+	printf(
+	"\n"
+	"Send & Recv parameters:\n"
+	" -r [addr] use this local address\n"
+	" -p [port, 4000] starting port number\n"
+	"\n"
+	"Send parameters:\n"
+	" -s [addr] send to this address (required)\n"
+	" -a [bytes, %u] ack message length\n"
+	" -q [bytes, 1024] request message length\n"
+	" -d [depth, 1] request pipeline depth, nr outstanding\n"
+	" -t [nr, 1] number of child tasks\n"
+	" -T [seconds, 0] runtime of test, 0 means infinite\n"
+	" -D [bytes] RDMA size (RDSv3 only)\n"
+	"\n"
+	"Optional flags:\n"
+	" -c measure cpu use with per-cpu soak processes\n"
+	" -V trace execution\n"
+	" -z print a summary at end of test only\n"
+	"\n"
+	"Example:\n"
+	" recv$ rds-stress\n"
+	" send$ rds-stress -s recv -q 4096 -t 2 -d 2\n"
+	"\n", (unsigned int) MIN_MSG_BYTES);
+
+	exit(2);
+}
+
+/*
+ * Switch the calling process to the SCHED_RR real-time class with
+ * priority 1.  Terminates the program through die_errno() on failure.
+ */
+static void set_rt_priority(void)
+{
+	struct sched_param param;
+
+	memset(&param, 0, sizeof(param));
+	param.sched_priority = 1;
+
+	if (sched_setscheduler(0, SCHED_RR, &param) < 0)
+		die_errno("sched_setscheduler(SCHED_RR) failed");
+}
+
+/* Lets a child notice that its parent has died: the child passes the
+ * pid it expects as parent, and the program is terminated through die()
+ * once getppid() reports somebody else.
+ * kill(0) would also work, but it yields false positives when the parent
+ * is a zombie (which happens if a script parses the output of rds-stress
+ * and the parent dies).
+ */
+static void check_parent(pid_t pid)
+{
+	pid_t current_parent = getppid();
+
+	if (current_parent == pid)
+		return;
+
+	die("parent %u exited\n", pid);
+}
+
+/*
+ * put a pattern in the message so the remote side can verify that it's
+ * what was expected.
+ */
+static unsigned char * msg_pattern;
+
+/*
+ * Allocate msg_pattern, large enough for the bigger of the request and
+ * ack sizes, and fill it with a pseudo random byte sequence.
+ * Terminates the program through die_errno() if the allocation fails.
+ */
+static void init_msg_pattern(struct options *opts)
+{
+	unsigned int max_size = max(opts->req_size, opts->ack_size);
+	unsigned int i, k = 11;
+
+	msg_pattern = malloc(max_size);
+	/* malloc(0) may legitimately return NULL, so only a non-empty
+	 * request counts as a failure. */
+	if (!msg_pattern && max_size)
+		die_errno("malloc(%u) for message pattern failed", max_size);
+
+	/* k = 41 * (k + 3) is a generator of Z(256). Adding
+	 * (i >> 8) makes sure the pattern is shifted by 1 in
+	 * every successive 256 byte block, so that we can detect
+	 * swapped blocks. */
+	for (i = 0; i < max_size; i++, k = 41 * (k + 3) + (i >> 8))
+		msg_pattern[i] = k;
+}
+
+/*
+ * 64bit counterparts of htonl()/ntohl(): swap the bytes on little endian
+ * hosts, do nothing otherwise.
+ */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define htonll(x) bswap_64(x)
+#define ntohll(x) bswap_64(x)
+#else
+#define htonll(x) (x)
+#define ntohll(x) (x)
+#endif
+
+/*
+ * Convert a header from host representation (hdr) to wire representation
+ * (dst).  dst is zeroed first.  Address and port members are stored in
+ * network byte order at all times and are copied as they are; single
+ * byte members need no conversion.
+ */
+static void encode_hdr(struct header *dst, const struct header *hdr)
+{
+	memset(dst, 0, sizeof(*dst));
+
+	/* Copied verbatim: already network byte order, or one byte wide. */
+	dst->from_addr = hdr->from_addr;
+	dst->to_addr = hdr->to_addr;
+	dst->from_port = hdr->from_port;
+	dst->to_port = hdr->to_port;
+	dst->op = hdr->op;
+	dst->rdma_op = hdr->rdma_op;
+
+	/* 16 and 32 bit members */
+	dst->index = htons(hdr->index);
+	dst->seq = htonl(hdr->seq);
+	dst->rdma_size = htonl(hdr->rdma_size);
+
+	/* 64 bit members */
+	dst->rdma_addr = htonll(hdr->rdma_addr);
+	dst->rdma_phyaddr = htonll(hdr->rdma_phyaddr);
+	dst->rdma_pattern = htonll(hdr->rdma_pattern);
+	dst->rdma_key = htonll(hdr->rdma_key);
+}
+
+/*
+ * Convert a header from wire representation (hdr) to host representation
+ * (dst); the inverse of encode_hdr().  dst is zeroed first.  Address and
+ * port members stay in network byte order.
+ */
+static void decode_hdr(struct header *dst, const struct header *hdr)
+{
+	memset(dst, 0, sizeof(*dst));
+
+	/* Copied verbatim: kept in network byte order, or one byte wide. */
+	dst->from_addr = hdr->from_addr;
+	dst->to_addr = hdr->to_addr;
+	dst->from_port = hdr->from_port;
+	dst->to_port = hdr->to_port;
+	dst->op = hdr->op;
+	dst->rdma_op = hdr->rdma_op;
+
+	/* 16 and 32 bit members */
+	dst->index = ntohs(hdr->index);
+	dst->seq = ntohl(hdr->seq);
+	dst->rdma_size = ntohl(hdr->rdma_size);
+
+	/* 64 bit members */
+	dst->rdma_addr = ntohll(hdr->rdma_addr);
+	dst->rdma_phyaddr = ntohll(hdr->rdma_phyaddr);
+	dst->rdma_pattern = ntohll(hdr->rdma_pattern);
+	dst->rdma_key = ntohll(hdr->rdma_key);
+}
+
+/*
+ * Write the wire form of hdr to the start of message.  When verification
+ * is enabled the remainder of the message (bytes minus the header size)
+ * is filled from msg_pattern.
+ */
+static void fill_hdr(void *message, uint32_t bytes, struct header *hdr)
+{
+	encode_hdr(message, hdr);
+
+	if (opt.verify) {
+		unsigned char *payload = (unsigned char *) message + sizeof(*hdr);
+		size_t payload_len = bytes - sizeof(*hdr);
+
+		memcpy(payload, msg_pattern, payload_len);
+	}
+}
+
+/* inet_ntoa() formats into one static buffer, so using it twice in a
+ * single printf (as check_hdr does) would print the same text twice.
+ * This wrapper copies the result into one of two static buffers and
+ * alternates between them on every call, which keeps the two most
+ * recent results valid.
+ *
+ * val is an IPv4 address in network byte order.
+ */
+static char *inet_ntoa_32(uint32_t val)
+{
+	static char ring[2][64];
+	static unsigned int slot = 0;
+	struct in_addr in;
+	char *out;
+
+	in.s_addr = val;
+
+	slot = !slot;
+	out = ring[slot];
+	/* At most 63 bytes are written, so the zero initialised last
+	 * byte always terminates the string. */
+	strncpy(out, inet_ntoa(in), 63);
+
+	return out;
+}
+
+/*
+ * Compare incoming message header with expected header. All header fields
+ * are in host byte order except for address and port fields.
+ *
+ * message: the received message, header in wire format first.
+ * bytes:   total length of the message.
+ * hdr:     the header we expect, in host representation.
+ *
+ * Only the first BASIC_HEADER_SIZE bytes (the members in front of
+ * rdma_op) are compared.  When opt.verify is set, the payload behind the
+ * header is additionally compared with msg_pattern.
+ *
+ * Returns 0 if everything matches, 1 after printing a report otherwise.
+ */
+static int check_hdr(void *message, uint32_t bytes, const struct header *hdr)
+{
+ struct header msghdr;
+
+ decode_hdr(&msghdr, message);
+ if (memcmp(&msghdr, hdr, BASIC_HEADER_SIZE)) {
+/* Expands to the three printf arguments of one table row:
+ * expected value, equality marker, received value. */
+#define bleh(var, disp) \
+ disp(hdr->var), \
+ msghdr.var == hdr->var ? " =" : "!=", \
+ disp(msghdr.var)
+
+ /*
+ * This is printed as one GIANT printf() so that it serializes
+ * with stdout() and we don't get things stomping on each
+ * other
+ */
+ printf( "An incoming message had a header which\n"
+ "didn't contain the fields we expected:\n"
+ " member expected eq got\n"
+ " seq %15u %s %15u\n"
+ " from_addr %15s %s %15s\n"
+ " from_port %15u %s %15u\n"
+ " to_addr %15s %s %15s\n"
+ " to_port %15u %s %15u\n"
+ " index %15u %s %15u\n"
+ " op %15u %s %15u\n",
+ bleh(seq, /**/),
+ bleh(from_addr, inet_ntoa_32),
+ bleh(from_port, ntohs),
+ bleh(to_addr, inet_ntoa_32),
+ bleh(to_port, ntohs),
+ bleh(index, /**/),
+ bleh(op, /**/));
+#undef bleh
+
+ return 1;
+ }
+
+ if (opt.verify
+ && memcmp(message + sizeof(*hdr), msg_pattern, bytes - sizeof(*hdr))) {
+ unsigned char *p = message + sizeof(*hdr);
+ unsigned int i, count = 0, total = bytes - sizeof(*hdr);
+ int offset = -1;
+
+ /* Count the differing bytes and remember where the first one is. */
+ for (i = 0; i < total; ++i) {
+ if (p[i] != msg_pattern[i]) {
+ if (offset < 0)
+ offset = i;
+ count++;
+ }
+ }
+
+ printf("An incoming message has a corrupted payload at offset %u; "
+ "%u out of %u bytes corrupted\n",
+ offset, count, total);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Record one sample in a counter: bump the sample count, add the value
+ * to the sum and update the extremes.  The minimum treats zero as "not
+ * set yet" (see minz()).
+ */
+void stat_inc(struct counter *ctr, uint64_t val)
+{
+	uint64_t prev_min = ctr->min;
+	uint64_t prev_max = ctr->max;
+
+	ctr->nr += 1;
+	ctr->sum += val;
+	ctr->min = minz(val, prev_min);
+	ctr->max = max(val, prev_max);
+}
+
+/*
+ * Compare two points in time.  Returns a - b in microseconds: negative
+ * if a is earlier than b, zero if equal, positive if later.
+ */
+int64_t tv_cmp(const struct timeval *a, const struct timeval *b)
+{
+	int64_t lhs_usecs;
+	int64_t rhs_usecs;
+
+	lhs_usecs = ((uint64_t)a->tv_sec * 1000000ULL) + a->tv_usec;
+	rhs_usecs = ((uint64_t)b->tv_sec * 1000000ULL) + b->tv_usec;
+
+	return lhs_usecs - rhs_usecs;
+}
+
+/*
+ * Elapsed time from b to a in microseconds.  The result is unsigned, so
+ * a is expected not to be earlier than b.
+ */
+uint64_t usec_sub(struct timeval *a, struct timeval *b)
+{
+	uint64_t whole_secs = (uint64_t)(a->tv_sec - b->tv_sec) * 1000000ULL;
+
+	return whole_secs + a->tv_usec - b->tv_usec;
+}
+
+/*
+ * Create a socket, enable SO_REUSEADDR on it and bind it to *sin.
+ * Returns the descriptor; any failure terminates the program through
+ * die_errno().
+ */
+static int bound_socket(int domain, int type, int protocol,
+			struct sockaddr_in *sin)
+{
+	int reuse = 1;
+	int sock = socket(domain, type, protocol);
+
+	if (sock < 0)
+		die_errno("socket(%d, %d, %d) failed", domain, type, protocol);
+
+	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)) != 0)
+		die_errno("setsockopt(SO_REUSEADDR) failed");
+
+	if (bind(sock, (struct sockaddr *)sin, sizeof(struct sockaddr_in)) != 0)
+		die_errno("bind() failed");
+
+	return sock;
+}
+
+/*
+ * Query the address a socket is bound to.  The full socket address is
+ * stored in *sin; the IPv4 address is returned in host byte order.
+ * Terminates the program through die_errno() if getsockname() fails.
+ */
+static uint32_t get_local_address(int fd, struct sockaddr_in *sin)
+{
+	socklen_t sin_len = sizeof(*sin);
+	int rc = getsockname(fd, (struct sockaddr *) sin, &sin_len);
+
+	if (rc != 0)
+		die_errno("getsockname failed");
+
+	return ntohl(sin->sin_addr.s_addr);
+}
+
+/*
+ * Create the RDS socket used for the actual test traffic.
+ *
+ * The socket is bound to *sin, its send and receive buffers are sized for
+ * all tasks at full pipeline depth, congestion monitoring is enabled if
+ * requested (and silently turned off in *opts when the kernel does not
+ * know the option), and the socket is made non-blocking.
+ *
+ * Returns the descriptor.  Unexpected failures terminate the program
+ * through die_errno(); this includes a failing fcntl(), because the send
+ * paths rely on EAGAIN from a non-blocking socket.
+ */
+static int rds_socket(struct options *opts, struct sockaddr_in *sin)
+{
+	int bytes;
+	int fd;
+	int val;
+	socklen_t optlen;
+
+	fd = bound_socket(PF_RDS, SOCK_SEQPACKET, 0, sin);
+
+	bytes = opts->nr_tasks * opts->req_depth *
+		(opts->req_size + opts->ack_size) * 2;
+
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &bytes, sizeof(bytes)))
+		die_errno("setsockopt(SNDBUF, %d) failed", bytes);
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, sizeof(bytes)))
+		die_errno("setsockopt(RCVBUF, %d) failed", bytes);
+
+	/* Read the sizes back and warn if we got less than we asked for. */
+	optlen = sizeof(val);
+	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &optlen))
+		die_errno("getsockopt(SNDBUF) failed");
+	if (val / 2 < bytes && !opts->suppress_warnings)
+		fprintf(stderr,
+			"getsockopt(SNDBUF) returned %d, we wanted %d * 2\n",
+			val, bytes);
+
+	optlen = sizeof(val);
+	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &optlen))
+		die_errno("getsockopt(RCVBUF) failed");
+	if (val / 2 < bytes && !opts->suppress_warnings)
+		fprintf(stderr,
+			"getsockopt(RCVBUF) returned %d, we need %d * 2\n",
+			val, bytes);
+
+	val = 1;
+	if (opts->use_cong_monitor
+	 && setsockopt(fd, SOL_RDS, RDS_CONG_MONITOR, &val, sizeof(val))) {
+		if (errno != ENOPROTOOPT)
+			die_errno("setsockopt(RDS_CONG_MONITOR) failed");
+		printf("Kernel does not support congestion monitoring; disabled\n");
+		opts->use_cong_monitor = 0;
+	}
+
+	if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
+		die_errno("fcntl(O_NONBLOCK) failed");
+
+	return fd;
+}
+
+/*
+ * Probe whether the kernel's RDS implementation knows the RDMA socket
+ * options, by issuing RDS_FREE_MR with all-zero arguments on a
+ * throw-away socket.
+ *
+ * Returns 1 if RDMA is supported, or if no local address is known yet
+ * and the probe therefore cannot be run; returns 0 if the kernel answers
+ * ENOPROTOOPT.  Any other error terminates the program.
+ */
+static int check_rdma_support(struct options *opts)
+{
+	struct sockaddr_in sin;
+	struct rds_free_mr_args args;
+	int fd, okay = 0;
+
+	/* We need a local address to bind to. If the user
+	 * didn't specify the -r option, we tell him to go on for
+	 * now - he'll call back once more later. */
+	if (opts->receive_addr == 0)
+		return 1;
+
+	/* Zero the whole address so that bind() sees no stack garbage
+	 * in the padding member. */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_port = htons(opts->starting_port);
+	sin.sin_addr.s_addr = htonl(opts->receive_addr);
+
+	fd = bound_socket(AF_RDS, SOCK_SEQPACKET, 0, &sin);
+
+	memset(&args, 0, sizeof(args));
+	if (setsockopt(fd, SOL_RDS, RDS_FREE_MR, &args, sizeof(args)) >= 0) {
+		okay = 1;
+	} else if (errno == ENOPROTOOPT) {
+		okay = 0;
+	} else {
+		die_errno("%s: RDS_FREE_MR failed with unexpected error",
+				__FUNCTION__);
+	}
+	close(fd);
+
+	return okay;
+}
+
+/*
+ * Register a memory region with the kernel through RDS_GET_MR and return
+ * the cookie the kernel hands back.  The region is always registered
+ * read/write, and as use-once if opt.rdma_use_once is set.
+ * Terminates the program through die_errno() on failure.
+ */
+static uint64_t get_rdma_key(int fd, uint64_t addr, uint32_t size)
+{
+	struct rds_get_mr_args req;
+	uint64_t key = 0;
+
+	req.vec.addr = addr;
+	req.vec.bytes = size;
+	/* The kernel stores the cookie through this address. */
+	req.cookie_addr = ptr64(&key);
+	req.flags = RDS_RDMA_READWRITE; /* for now, always assume r/w */
+	if (opt.rdma_use_once)
+		req.flags |= RDS_RDMA_USE_ONCE;
+
+	if (setsockopt(fd, SOL_RDS, RDS_GET_MR, &req, sizeof(req)) != 0)
+		die_errno("setsockopt(RDS_GET_MR) failed (%u allocated)", mrs_allocated);
+
+	trace("RDS get_rdma_key() = %Lx\n",
+			(unsigned long long) key);
+
+	mrs_allocated += 1;
+	return key;
+}
+
+/*
+ * Release a memory region previously obtained with get_rdma_key() (or
+ * mapped by sendmsg) through RDS_FREE_MR.  The request is sent with no
+ * flags; RDS_FREE_MR_ARGS_INVALIDATE is deliberately not used.
+ * Terminates the program through die_errno() on failure.
+ */
+static void free_rdma_key(int fd, uint64_t key)
+{
+	struct rds_free_mr_args req;
+
+	trace("RDS free_rdma_key(%Lx)\n", (unsigned long long) key);
+
+	req.flags = 0;
+	req.cookie = key;
+
+	if (setsockopt(fd, SOL_RDS, RDS_FREE_MR, &req, sizeof(req)) != 0)
+		die_errno("setsockopt(RDS_FREE_MR) failed");
+
+	mrs_allocated -= 1;
+}
+
+/*
+ * RDMA key-o-meter. We track how frequently the kernel
+ * re-issues R_Keys
+ *
+ * The key_o_meter data structures are shared between the processes
+ * without any locking. We don't care much for locking here...
+ */
+/* Upper bound on the keys recorded per trace set, summed over all tasks. */
+#define RDMA_MAX_TRACKED_KEYS (32*1024)
+/* One recorded key together with the time it was handed out. */
+struct rdma_key_stamp {
+ uint32_t r_key;
+ struct timeval issued;
+};
+/* The keys recorded by one task: count valid entries out of max. */
+struct rdma_key_trace {
+ uint32_t count, max;
+ struct rdma_key_stamp *entry;
+};
+/*
+ * Two arrays of per-task traces: tasks append to 'current' while
+ * rdma_key_o_meter_check() evaluates 'idle' and then swaps the two.
+ */
+struct rdma_key_o_meter {
+ struct rdma_key_trace *current;
+ struct rdma_key_trace *idle;
+};
+/* NULL unless rdma_key_o_meter_init() was called. */
+static struct rdma_key_o_meter *rdma_key_o_meter;
+/* Index of this process' trace; set by rdma_key_o_meter_set_self(). */
+static unsigned int rdma_key_task;
+
+/*
+ * Set up the key-o-meter for nr_tasks tasks.
+ *
+ * One shared anonymous mapping holds, in this order: the
+ * rdma_key_o_meter itself, nr_tasks 'current' traces, nr_tasks 'idle'
+ * traces and the key stamps.  Every trace gets an equal share of
+ * RDMA_MAX_TRACKED_KEYS stamps.
+ *
+ * nr_tasks must not be zero.  Terminates the program through
+ * die_errno() if the mapping cannot be created.
+ */
+static void rdma_key_o_meter_init(unsigned int nr_tasks)
+{
+	struct rdma_key_trace *kt;
+	struct rdma_key_stamp *ks;
+	uint32_t max;
+	unsigned int i, size;
+	void *base;
+
+	size = sizeof(struct rdma_key_o_meter)
+			+ 2 * nr_tasks * sizeof(*kt)
+			+ 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks);
+	/* Anonymous mappings take no file; portable code passes fd -1. */
+	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
+	if (base == MAP_FAILED)
+		die_errno("rdma_key_o_meter_init: mmap failed");
+
+	rdma_key_o_meter = (struct rdma_key_o_meter *) base;
+	base = rdma_key_o_meter + 1;
+
+	rdma_key_o_meter->current = (struct rdma_key_trace *) base;
+	base = rdma_key_o_meter->current + nr_tasks;
+
+	rdma_key_o_meter->idle = (struct rdma_key_trace *) base;
+	base = rdma_key_o_meter->idle + nr_tasks;
+
+	ks = (struct rdma_key_stamp *) base;
+	max = RDMA_MAX_TRACKED_KEYS / nr_tasks;
+	/* The idle traces directly follow the current ones in memory, so
+	 * a single loop over 2 * nr_tasks entries initializes both. */
+	for (i = 0, kt = rdma_key_o_meter->current; i < 2 * nr_tasks; ++i, ++kt) {
+		kt->count = 0;
+		kt->max = max;
+		kt->entry = ks + i * max;
+	}
+}
+
+/* Called in a child process: remembers which per-task trace of the
+ * key-o-meter this process has to use in rdma_key_o_meter_add(). */
+static void rdma_key_o_meter_set_self(unsigned int task_idx)
+{
+	rdma_key_task = task_idx;
+}
+
+/*
+ * Record a freshly issued key, stamped with the current time, in this
+ * task's current trace.  Does nothing if the key-o-meter is not set up
+ * or the trace is already full.
+ */
+static void rdma_key_o_meter_add(uint32_t key)
+{
+	struct rdma_key_trace *trace_set;
+	struct rdma_key_stamp *stamp;
+
+	if (rdma_key_o_meter == NULL)
+		return;
+
+	trace_set = &rdma_key_o_meter->current[rdma_key_task];
+	if (trace_set->count >= trace_set->max)
+		return;
+
+	stamp = &trace_set->entry[trace_set->count];
+	stamp->r_key = key;
+	gettimeofday(&stamp->issued, NULL);
+	trace_set->count++;
+}
+
+/*
+ * qsort() comparison for struct rdma_key_stamp: orders by key first and
+ * by time of issue second.
+ *
+ * tv_cmp() returns a 64bit microsecond difference.  Returning it
+ * directly would truncate it to int, which can flip the sign or yield 0
+ * for stamps that are far apart, so it is reduced to -1/0/1 here.
+ */
+static int rdma_key_stamp_compare(const void *p1, const void *p2)
+{
+	const struct rdma_key_stamp *ks1 = p1, *ks2 = p2;
+	int64_t delta;
+
+	if (ks1->r_key < ks2->r_key)
+		return -1;
+	if (ks1->r_key > ks2->r_key)
+		return 1;
+
+	delta = tv_cmp(&ks1->issued, &ks2->issued);
+	return (delta > 0) - (delta < 0);
+}
+
+/*
+ * Evaluate the idle key traces of all tasks: collect the recorded keys,
+ * sort them, and report how many keys were issued more than once
+ * together with the minimum and average time between two issues of the
+ * same key.  Afterwards the idle traces are emptied and the current and
+ * idle sets are swapped.
+ *
+ * Does nothing if the key-o-meter is not set up.
+ *
+ * Note: the sort buffer lives on the stack and holds
+ * RDMA_MAX_TRACKED_KEYS entries.
+ */
+static void rdma_key_o_meter_check(unsigned int nr_tasks)
+{
+	struct rdma_key_stamp *ks, sorted[RDMA_MAX_TRACKED_KEYS];
+	struct rdma_key_trace *kt;
+	unsigned int i, j, count = 0;
+	unsigned int reissued = 0;
+	double min_elapsed = 0, avg_elapsed = 0;
+
+	if (!rdma_key_o_meter)
+		return;
+
+	/* Extract keys from all tasks and sort them. */
+	kt = rdma_key_o_meter->idle;
+	for (i = 0; i < nr_tasks; ++i, ++kt) {
+		ks = kt->entry;
+
+		for (j = 0; j < kt->count; ++j)
+			sorted[count++] = *ks++;
+		kt->count = 0;
+	}
+	qsort(sorted, count, sizeof(*sorted), rdma_key_stamp_compare);
+
+	/* Now see how many were reissued: after sorting, equal keys are
+	 * adjacent and ordered by time of issue. */
+	ks = sorted;
+	for (i = 0; i + 1 < count; ++i, ++ks) {
+		double elapsed;
+
+		if (ks[0].r_key != ks[1].r_key)
+			continue;
+		elapsed = 1e-6 * usec_sub(&ks[1].issued, &ks[0].issued);
+		if (reissued == 0 || elapsed < min_elapsed)
+			min_elapsed = elapsed;
+		avg_elapsed += elapsed;
+		/* Count the pair; without this the report below could
+		 * never be printed. */
+		reissued++;
+	}
+
+	if (reissued)
+		printf(" *** %u R_Keys were re-issued; min distance=%f sec, avg distance=%f sec\n",
+				reissued, min_elapsed, avg_elapsed / reissued);
+
+	/* Swap current and idle */
+	kt = rdma_key_o_meter->current;
+	rdma_key_o_meter->current = rdma_key_o_meter->idle;
+	rdma_key_o_meter->idle = kt;
+}
+
+/*
+ * Fill a buffer with copies of a 64bit pattern.  Words are written as
+ * long as their start lies below buf + size, so size should be a
+ * multiple of 8.
+ */
+static void rds_fill_buffer(void *buf, size_t size, uint64_t pattern)
+{
+	uint64_t *limit = (uint64_t *) (buf + size);
+	uint64_t *cursor;
+
+	for (cursor = (uint64_t *) buf; cursor < limit; cursor++)
+		*cursor = pattern;
+}
+
+/* Debugging aid, compiled out: prints a buffer as 64bit hex words, four per line. */
+#if 0
+static void rds_dump_buffer(const void *buf, size_t size)
+{
+ const uint64_t *pos;
+ unsigned int i, count;
+
+ pos = (const uint64_t *) buf;
+
+ count = size / sizeof(uint64_t);
+ pos = (const uint64_t *) buf;
+
+ printf("rds_dump_buffer(%p, %u)\n", buf, (int) size);
+ for (i = 0; i < count; ++i) {
+ if ((i % 4) == 0)
+ printf("\n%08x:", i);
+ printf(" %016Lx", (unsigned long long) *pos++);
+ }
+}
+#endif
+
+/*
+ * Check that every complete 64bit word in a buffer of 'size' bytes
+ * equals 'pattern'.  The outcome is only reported through trace(): one
+ * line per mismatching word, or a single "pass" line if all words match.
+ */
+static void rds_compare_buffer(uint64_t *addr, int size, uint64_t pattern)
+{
+	int mismatch_seen = 0;
+	int d;
+
+	for (d = 0; d < size / sizeof(uint64_t); d++) {
+		if (addr[d] != pattern) {
+			mismatch_seen = 1;
+			trace("compare fail pattern offset %u: expected %Lx got %Lx\n",
+					8 * d,
+					(unsigned long long) pattern,
+					(unsigned long long) addr[d]);
+
+#if 0
+			rds_dump_buffer(addr, size);
+			die("compare pass\n");
+#endif
+		}
+	}
+
+	if (mismatch_seen)
+		return;
+
+	trace("compare pass pattern %Lx addr %p\n",
+			(unsigned long long) pattern, addr);
+}
+
+/*
+ * State of the conversation with one peer task.  The per-slot arrays
+ * (send_time, ack_header and the RDMA arrays) are indexed by queue
+ * index, which runs from 0 to req_depth - 1.
+ */
+struct task {
+ unsigned int nr; /* task number, used to build RDMA user tokens */
+ unsigned int pending; /* requests sent and counted by send_one() */
+ unsigned int unacked; /* received requests we still have to ack */
+ struct sockaddr_in src_addr; /* same for all tasks */
+ struct sockaddr_in dst_addr;
+ unsigned char congested;
+ unsigned char drain_rdmas; /* cleared by rdma_mark_completed() */
+ uint32_t send_seq; /* sequence number of the next message we send */
+ uint32_t recv_seq;
+ uint16_t send_index; /* queue index of the next request */
+ uint16_t recv_index;
+ struct timeval * send_time; /* per slot: when the request was sent */
+ struct header * ack_header; /* per slot: header of the ack to send */
+
+ /* RDMA related stuff */
+ uint64_t ** local_buf; /* per slot: local end of RDMA transfers we issue */
+ uint64_t ** rdma_buf; /* per slot: buffer offered to the peer */
+ uint64_t * rdma_req_key; /* per slot: key from get_rdma_key(), 0 if none */
+ uint8_t * rdma_inflight; /* per slot: 1 while an RDMA we issued is pending */
+ uint32_t buffid;
+ uint8_t rdma_next_op; /* RDMA_OP_READ or RDMA_OP_WRITE, alternating */
+};
+
+/*
+ * Allocate the RDMA buffers of all tasks in one anonymous mapping and
+ * hand out two buffers of opts->rdma_size bytes (rdma_buf and local_buf)
+ * to every queue slot of every task.  t points to the first element of
+ * an array of opts->nr_tasks tasks whose per-slot arrays are already
+ * allocated.  The per-slot keys and in-flight flags are reset.
+ *
+ * The mapping is one page larger than needed because the buffers start
+ * opts->rdma_alignment bytes into it.
+ * NOTE(review): this assumes rdma_alignment never exceeds the page size.
+ *
+ * Terminates the program through die_errno() if the mapping fails.
+ */
+static void alloc_rdma_buffers(struct task *t, struct options *opts)
+{
+	unsigned int i, j;
+	size_t len;
+	caddr_t base;
+
+	/* We use mmap here rather than malloc, because it is always
+	 * page aligned.  The product is computed in size_t so that it
+	 * does not wrap at 4GB on 64bit platforms. */
+	len = (size_t) 2 * opts->nr_tasks * opts->req_depth * opts->rdma_size
+		+ sys_page_size;
+	/* Anonymous mappings take no file; portable code passes fd -1. */
+	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+	if (base == MAP_FAILED)
+		die_errno("alloc_rdma_buffers: mmap failed");
+	memset(base, 0x2f, len);
+	base += opts->rdma_alignment;
+
+	for (i = 0; i < opts->nr_tasks; ++i, ++t) {
+		for (j = 0; j < opts->req_depth; ++j) {
+			t->rdma_buf[j] = (uint64_t *) base;
+			base += opts->rdma_size;
+
+			t->local_buf[j] = (uint64_t *) base;
+			base += opts->rdma_size;
+
+			t->rdma_req_key[j] = 0;
+			t->rdma_inflight[j] = 0;
+		}
+	}
+}
+
+/*
+ * Fill in the RDMA part of a request header for the task's current send
+ * slot.
+ *
+ * If keys are obtained through GET_MR and the slot has none, one is
+ * fetched first.  The operation alternates between read and write from
+ * request to request.  With verification enabled, the buffer is filled
+ * with the pattern for a read (the peer will fetch it) and cleared for a
+ * write (the peer will deposit the pattern).
+ *
+ * req_depth is unused.
+ */
+static void rdma_build_req(int fd, struct header *hdr, struct task *t,
+		unsigned int rdma_size, unsigned int req_depth)
+{
+	uint64_t *buf = t->rdma_buf[t->send_index];
+	uint64_t *key_slot = &t->rdma_req_key[t->send_index];
+	int is_read;
+
+	if (opt.rdma_use_get_mr && *key_slot == 0)
+		*key_slot = get_rdma_key(fd, ptr64(buf), rdma_size);
+
+	/* We alternate between RDMA READ and WRITEs */
+	hdr->rdma_op = t->rdma_next_op;
+	t->rdma_next_op = RDMA_OP_TOGGLE(t->rdma_next_op);
+
+	hdr->rdma_key = *key_slot;
+	hdr->rdma_size = rdma_size;
+	hdr->rdma_phyaddr = 0;
+	hdr->rdma_addr = ptr64(buf);
+	/* sequence number in the upper half, our pid below */
+	hdr->rdma_pattern = (((uint64_t) t->send_seq) << 32) | getpid();
+
+	is_read = (RDMA_OP_READ == hdr->rdma_op);
+
+	if (opt.verify)
+		rds_fill_buffer(buf, rdma_size, is_read ? hdr->rdma_pattern : 0);
+
+	if (is_read)
+		trace("Requesting RDMA read for pattern %Lx "
+				"local addr to rdma read %p\n",
+				(unsigned long long) hdr->rdma_pattern,
+				buf);
+	else
+		trace("Requesting RDMA write for pattern %Lx "
+				"local addr to rdma write %p\n",
+				(unsigned long long) hdr->rdma_pattern,
+				buf);
+}
+
+/*
+ * Sanity check the RDMA part of an incoming request: the size must be
+ * the one we were configured with and the operation must be a read or a
+ * write.  Anything else terminates the program through die().
+ */
+static void rdma_validate(const struct header *in_hdr, struct options *opts)
+{
+	unsigned long rdma_size = in_hdr->rdma_size;
+	int is_write = (in_hdr->rdma_op == RDMA_OP_WRITE);
+
+	if (rdma_size != opts->rdma_size)
+		die("Unexpected RDMA size %lu in request\n", rdma_size);
+
+	if (!is_write && in_hdr->rdma_op != RDMA_OP_READ)
+		die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);
+
+	trace("RDS received request to issue rdma %s len %lu rva %Lx key %Lx pattern %Lx\n",
+		is_write ? "write to" : "read from",
+		rdma_size,
+		(unsigned long long) in_hdr->rdma_addr,
+		(unsigned long long) in_hdr->rdma_key,
+		(unsigned long long) in_hdr->rdma_pattern);
+}
+
+/*
+ * Copy the RDMA description of a request (in_hdr) into the header of
+ * the ack that answers it (hdr).  The addresses are the remote side's:
+ * they name the memory we will rdma to or from.
+ */
+static void rdma_build_ack(struct header *hdr, const struct header *in_hdr)
+{
+	/* what to do, and how much */
+	hdr->rdma_op = in_hdr->rdma_op;
+	hdr->rdma_size = in_hdr->rdma_size;
+
+	/* where on the remote side */
+	hdr->rdma_addr = in_hdr->rdma_addr;
+	hdr->rdma_phyaddr = in_hdr->rdma_phyaddr;
+	hdr->rdma_key = in_hdr->rdma_key;
+
+	/* what the data should look like */
+	hdr->rdma_pattern = in_hdr->rdma_pattern;
+}
+
+/*
+ * Build the token that identifies one queue slot of one task in RDMA
+ * completion notifications; rdma_mark_completed() splits it up again.
+ */
+static inline unsigned int rdma_user_token(struct task *t, unsigned int qindex)
+{
+	unsigned int task_base = t->nr * opt.req_depth;
+
+	return task_base + qindex;
+}
+
+/*
+ * Handle the completion notification of an RDMA operation.  The token
+ * (see rdma_user_token()) selects the task and the queue slot.  A
+ * non-zero status is reported on stdout.  In any case the slot is marked
+ * as no longer in flight and the task's drain flag is cleared.
+ */
+static void rdma_mark_completed(struct task *tasks, unsigned int token, int status)
+{
+	struct task *owner;
+	unsigned int slot;
+
+	trace("RDS rdma completion for token %x\n", token);
+
+	owner = &tasks[token / opt.req_depth];
+	slot = token % opt.req_depth;
+
+	if (status != 0) {
+		const char *reason = "unknown error";
+
+		switch (status) {
+		case RDS_RDMA_REMOTE_ERROR:
+			reason = "remote error";
+			break;
+		case RDS_RDMA_CANCELED:
+			reason = "operation was cancelled";
+			break;
+		case RDS_RDMA_DROPPED:
+			reason = "operation was dropped";
+			break;
+		case RDS_RDMA_OTHER_ERROR:
+			reason = "other error";
+			break;
+		}
+
+		printf("%s:%u: RDMA op %u failed: %s\n",
+				inet_ntoa(owner->dst_addr.sin_addr),
+				ntohs(owner->dst_addr.sin_port),
+				slot, reason);
+	}
+
+	owner->rdma_inflight[slot] = 0;
+	owner->drain_rdmas = 0;
+}
+
+/* NOTE(review): not referenced in this part of the file; presumably an upper bound on iovec entries per message. */
+#define MSG_MAXIOVLEN 2
+
+/*
+ * Attach a single SOL_RDS control message of the given type to an
+ * outgoing message.  The payload is copied into one static buffer, so
+ * the result is only valid until the next call, and a second call on
+ * the same msghdr replaces the first control message.
+ */
+static void rdma_put_cmsg(struct msghdr *msg, int type,
+			const void *ptr, size_t size)
+{
+	static char ctlbuf[1024];
+	struct cmsghdr *hdr;
+
+	msg->msg_controllen = CMSG_SPACE(size);
+	msg->msg_control = ctlbuf;
+
+	hdr = CMSG_FIRSTHDR(msg);
+	hdr->cmsg_len = CMSG_LEN(size);
+	hdr->cmsg_type = type;
+	hdr->cmsg_level = SOL_RDS;
+
+	memcpy(CMSG_DATA(hdr), ptr, size);
+}
+
+/*
+ * This sets up all the fields for an RDMA transfer.
+ * The request is passed as a control message along with
+ * the ACK packet.
+ *
+ * msg:        the outgoing message the control message is attached to.
+ * hdr:        ack header describing the transfer (size, key, remote
+ *             address, operation, pattern).
+ * user_token: token that comes back in the completion notification.
+ * local_buf:  local buffer to transfer from or to.
+ *
+ * The local iovec is static because the kernel reads it during
+ * sendmsg(), after this function has returned.
+ */
+static void rdma_build_cmsg_xfer(struct msghdr *msg, const struct header *hdr,
+		unsigned int user_token, void *local_buf)
+{
+	static struct rds_iovec iov;
+	struct rds_rdma_args args;
+	unsigned int rdma_size;
+
+	rdma_size = hdr->rdma_size;
+
+	trace("RDS issuing rdma for token %x key %Lx len %u local_buf %p\n",
+			user_token,
+			(unsigned long long) hdr->rdma_key,
+			rdma_size, local_buf);
+
+	/* rdma args */
+	memset(&args, 0, sizeof(args));
+
+	/* Set up the iovec pointing to the RDMA buffer.  ptr64() is used
+	 * like everywhere else in this file: a direct cast to uint64_t
+	 * is a pointer-to-wider-integer cast on 32bit platforms. */
+	args.local_vec_addr = ptr64(&iov);
+	args.nr_local = 1;
+	iov.addr = ptr64(local_buf);
+	iov.bytes = rdma_size;
+
+	/* The remote could either give us a physical address, or
+	 * an index into a zero-based FMR. Either way, we just copy it.
+	 */
+	args.remote_vec.addr = hdr->rdma_phyaddr;
+	args.remote_vec.bytes = rdma_size;
+	args.cookie = hdr->rdma_key;
+
+	/* read or write */
+	switch (hdr->rdma_op) {
+	case RDMA_OP_WRITE:
+		args.flags = RDS_RDMA_READWRITE;
+
+		if (opt.verify)
+			rds_fill_buffer(local_buf, rdma_size, hdr->rdma_pattern);
+		break;
+
+	case RDMA_OP_READ:
+		args.flags = 0;
+		break;
+	}
+
+	/* Fence off subsequent SENDs - this is the default */
+	if (opt.rdma_use_fence)
+		args.flags |= RDS_RDMA_FENCE;
+
+	args.flags |= RDS_RDMA_NOTIFY_ME;
+	args.user_token = user_token;
+
+	rdma_put_cmsg(msg, RDS_CMSG_RDMA_ARGS, &args, sizeof(args));
+}
+
+/*
+ * Attach an RDS_CMSG_RDMA_DEST control message carrying the given
+ * cookie to an outgoing message.
+ */
+static void rdma_build_cmsg_dest(struct msghdr *msg, rds_rdma_cookie_t rdma_dest)
+{
+	const void *payload = &rdma_dest;
+
+	rdma_put_cmsg(msg, RDS_CMSG_RDMA_DEST, payload, sizeof(rdma_dest));
+}
+
+/*
+ * Attach an RDS_CMSG_RDMA_MAP control message to an outgoing message,
+ * asking sendmsg to map the buffer at addr/size on the fly.  The
+ * resulting cookie is stored through *cookie.  The mapping is always
+ * read/write, and use-once if opt.rdma_use_once is set.
+ */
+static void rdma_build_cmsg_map(struct msghdr *msg, uint64_t addr, uint32_t size,
+			rds_rdma_cookie_t *cookie)
+{
+	struct rds_get_mr_args map_req;
+
+	map_req.vec.addr = addr;
+	map_req.vec.bytes = size;
+	map_req.cookie_addr = ptr64(cookie);
+
+	map_req.flags = RDS_RDMA_READWRITE; /* for now, always assume r/w */
+	if (opt.rdma_use_once)
+		map_req.flags |= RDS_RDMA_USE_ONCE;
+
+	rdma_put_cmsg(msg, RDS_CMSG_RDMA_MAP, &map_req, sizeof(map_req));
+}
+
+/*
+ * Process the RDMA part of an ack we received for one of our requests.
+ *
+ * The memory region is released unless it was mapped use-once or keys
+ * are cached.  The transferred bytes are accounted from our point of
+ * view: a write by the remote node put data into our buffer and counts
+ * as RDMA read (data in); a read by the remote node fetched data from
+ * our buffer and counts as RDMA write (data out).
+ */
+static void rdma_process_ack(int fd, struct header *hdr,
+		struct child_control *ctl)
+{
+	trace("RDS rcvd rdma %s ACK for request key %Lx len %u local addr %Lx\n",
+		RDMA_OP_WRITE == hdr->rdma_op ? "write" : "read",
+		(unsigned long long) hdr->rdma_key,
+		hdr->rdma_size,
+		(unsigned long long) hdr->rdma_addr);
+
+	/* Need to free the MR unless allocated with use_once */
+	if (!opt.rdma_use_once && !opt.rdma_cache_mrs)
+		free_rdma_key(fd, hdr->rdma_key);
+
+	if (hdr->rdma_op == RDMA_OP_WRITE) {
+		/* The remote node wrote our buffer; it must now hold the
+		 * pattern announced in the header. */
+		stat_inc(&ctl->cur[S_RDMA_READ_BYTES], hdr->rdma_size);
+
+		if (opt.verify) {
+			/* This funny looking cast avoids compile warnings
+			 * on 32bit platforms. */
+			rds_compare_buffer((void *)(unsigned long) hdr->rdma_addr,
+					hdr->rdma_size,
+					hdr->rdma_pattern);
+		}
+	} else if (hdr->rdma_op == RDMA_OP_READ) {
+		stat_inc(&ctl->cur[S_RDMA_WRITE_BYTES], hdr->rdma_size);
+	}
+}
+
+/*
+ * Initialize a header for a message from task t: everything is zeroed
+ * (so the header carries no RDMA operation), then the operation, the
+ * queue index, the task's current send sequence number and both
+ * endpoints are filled in.
+ */
+static void build_header(struct task *t, struct header *hdr,
+			unsigned int op, unsigned int qindex)
+{
+	memset(hdr, 0, sizeof(*hdr));
+
+	/* endpoints, kept in network byte order */
+	hdr->from_addr = t->src_addr.sin_addr.s_addr;
+	hdr->from_port = t->src_addr.sin_port;
+	hdr->to_addr = t->dst_addr.sin_addr.s_addr;
+	hdr->to_port = t->dst_addr.sin_port;
+
+	hdr->seq = t->send_seq;
+	hdr->index = qindex;
+	hdr->op = op;
+}
+
+/*
+ * Send one message of 'size' bytes, described by hdr, to the task's
+ * destination.
+ *
+ * The message is assembled in a stack buffer: the encoded header plus,
+ * with verification enabled, the payload pattern.  Requests that carry
+ * an RDMA operation get a control message that passes or creates the
+ * memory region; acks that carry one get the control message that makes
+ * the kernel perform the transfer.
+ *
+ * Returns the number of bytes sent on success; the task's send sequence
+ * number is then advanced.  Returns a negative value with errno set to
+ * EAGAIN or ENOBUFS if the socket cannot take the message now, or to
+ * EBADSLT if the slot's previous RDMA has not completed yet.  Any other
+ * error, as well as a short send, terminates the program.
+ *
+ * Side effect on *hdr: seq is refreshed, and rdma_key is reset to 0
+ * after it was moved into the control message.
+ */
+static int send_packet(int fd, struct task *t,
+ struct header *hdr, unsigned int size)
+{
+ unsigned char buf[size], *rdma_flight_recorder = NULL;
+ rds_rdma_cookie_t cookie = 0;
+ struct msghdr msg;
+ struct iovec iov;
+ ssize_t ret;
+
+ /* Make sure we always have the current sequence number.
+ * When we send ACK packets, the seq that gets filled in is
+ * stale. */
+ hdr->seq = t->send_seq;
+ fill_hdr(buf, size, hdr);
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_name = (struct sockaddr *) &t->dst_addr;
+ msg.msg_namelen = sizeof(t->dst_addr);
+
+ msg.msg_iovlen = 1;
+ msg.msg_iov = &iov;
+ iov.iov_base = buf;
+ iov.iov_len = size;
+
+ /* If this is a REQ packet in which we pass the MR to the
+ * peer, extract the RDMA cookie and pass it on in the control
+ * message for now. */
+ if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
+ if (hdr->rdma_key != 0) {
+ /* We used GET_MR to obtain a key */
+ rdma_build_cmsg_dest(&msg, hdr->rdma_key);
+ cookie = hdr->rdma_key;
+ hdr->rdma_key = 0;
+ } else {
+ /* Use the RDMA_MAP cmsg to have sendmsg do the
+ * mapping on the fly. */
+ rdma_build_cmsg_map(&msg, hdr->rdma_addr,
+ hdr->rdma_size, &cookie);
+ }
+ }
+
+ /* If this is an ACK packet with RDMA, build the cmsg
+ * header that goes with it. */
+ if (hdr->op == OP_ACK && hdr->rdma_op != 0) {
+ unsigned int qindex = hdr->index;
+
+ if (t->rdma_inflight[qindex] != 0) {
+ /* It is unlikely but (provably) possible for
+ * new requests to arrive before the RDMA notification.
+ * That's because RDMA notifications are triggered
+ * by the RDS ACK processing, which happens after new
+ * messages were queued on the socket.
+ *
+ * We return one of the more obscure error messages,
+ * which we recognize and handle in the top loop. */
+ trace("Drain RDMA 0x%x\n", rdma_user_token(t, qindex));
+ errno = EBADSLT;
+ return -1;
+ }
+ rdma_build_cmsg_xfer(&msg, hdr,
+ rdma_user_token(t, qindex),
+ t->local_buf[qindex]);
+ /* Only mark the slot as in flight once sendmsg has succeeded. */
+ rdma_flight_recorder = &t->rdma_inflight[qindex];
+ }
+
+ ret = sendmsg(fd, &msg, 0);
+ if (ret < 0) {
+ /* A full socket is reported to the caller; everything else is fatal. */
+ if (errno != EAGAIN && errno != ENOBUFS)
+ die_errno("sendto() failed");
+ return ret;
+ }
+ if (ret != size)
+ die("sendto() truncated - %zd", ret);
+
+ if (rdma_flight_recorder)
+ *rdma_flight_recorder = 1;
+ if (cookie) {
+ /* We just happen to know that the r_key is in the
+ * lower 32bit of the cookie */
+ rdma_key_o_meter_add(cookie);
+ }
+ t->send_seq++;
+ return ret;
+}
+
+/*
+ * Send the next request of task t.
+ *
+ * RDMA is added to the request only if an RDMA size is configured and
+ * the task's send sequence number is above 10.  On success the send time
+ * is remembered for the slot, the slot's key is forgotten unless keys
+ * are cached, the byte and sendmsg-time statistics are updated and the
+ * task moves on to the next queue slot.
+ *
+ * Returns the number of bytes sent, or the negative result of
+ * send_packet() with errno set.
+ */
+static int send_one(int fd, struct task *t,
+		struct options *opts,
+		struct child_control *ctl)
+{
+	struct timeval before;
+	struct timeval after;
+	struct header req;
+	uint16_t slot;
+	int sent;
+
+	build_header(t, &req, OP_REQ, t->send_index);
+	if (opts->rdma_size && t->send_seq > 10)
+		rdma_build_req(fd, &req, t,
+				opts->rdma_size,
+				opts->req_depth);
+
+	gettimeofday(&before, NULL);
+	sent = send_packet(fd, t, &req, opts->req_size);
+	gettimeofday(&after, NULL);
+
+	if (sent < 0)
+		return sent;
+
+	slot = t->send_index;
+	t->send_time[slot] = before;
+	if (!opts->rdma_cache_mrs)
+		t->rdma_req_key[slot] = 0; /* we consumed this key */
+
+	stat_inc(&ctl->cur[S_REQ_TX_BYTES], sent);
+	stat_inc(&ctl->cur[S_SENDMSG_USECS],
+			usec_sub(&after, &before));
+
+	t->pending++;
+	t->send_index = (slot + 1) % opts->req_depth;
+	return sent;
+}
+
+/*
+ * Send the ACK that was prepared in t->ack_header[qindex].
+ *
+ * On success the ACK TX byte counter is bumped and, when the ACK header
+ * carries an RDMA read or write operation, the matching RDMA byte
+ * counter is credited with opts->rdma_size.
+ *
+ * Returns the number of bytes sent, or a negative value when
+ * send_packet() failed.
+ */
+static int send_ack(int fd, struct task *t, unsigned int qindex,
+		struct options *opts,
+		struct child_control *ctl)
+{
+	struct header *ack = &t->ack_header[qindex];
+	ssize_t sent = send_packet(fd, t, ack, opts->ack_size);
+
+	if (sent < 0)
+		return sent;
+	if (sent != opts->ack_size)
+		die_errno("sendto() returned %zd", sent);
+
+	stat_inc(&ctl->cur[S_ACK_TX_BYTES], sent);
+
+	/* Separate counters are kept for RDMA writes and RDMA reads. */
+	if (ack->rdma_op == RDMA_OP_WRITE)
+		stat_inc(&ctl->cur[S_RDMA_WRITE_BYTES], opts->rdma_size);
+	else if (ack->rdma_op == RDMA_OP_READ)
+		stat_inc(&ctl->cur[S_RDMA_READ_BYTES], opts->rdma_size);
+
+	return sent;
+}
+
+/*
+ * Send ACKs for every request of task @t that has not been acknowledged
+ * yet, oldest first.
+ *
+ * Returns 0 once t->unacked reaches zero.  Returns -1 with errno set to
+ * EAGAIN if ACKs are outstanding but @can_send is false, or -1 with the
+ * errno left by send_ack() if sending fails.
+ */
+static int ack_anything(int fd, struct task *t,
+		struct options *opts,
+		struct child_control *ctl,
+		int can_send)
+{
+	uint16_t slot;
+
+	while (t->unacked) {
+		/* The oldest unacked request sits 'unacked' slots behind
+		 * the receive index in the circular queue. */
+		slot = (t->recv_index - t->unacked + opts->req_depth) % opts->req_depth;
+		if (!can_send) {
+			errno = EAGAIN;
+			return -1;
+		}
+		if (send_ack(fd, t, slot, opts, ctl) < 0)
+			return -1;
+		t->unacked -= 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Flush pending ACKs for task @t, then issue new requests until the
+ * number of outstanding requests reaches opts->req_depth.
+ *
+ * Returns 0 when nothing is left to send.  Returns -1 when an ACK or a
+ * request could not be sent; errno is EAGAIN if @can_send was false,
+ * otherwise whatever the failing send left behind.
+ */
+static int send_anything(int fd, struct task *t,
+		struct options *opts,
+		struct child_control *ctl,
+		int can_send)
+{
+	if (ack_anything(fd, t, opts, ctl, can_send) < 0)
+		return -1;
+
+	for (;;) {
+		if (t->pending >= opts->req_depth)
+			return 0;
+		if (!can_send) {
+			errno = EAGAIN;
+			return -1;
+		}
+		if (send_one(fd, t, opts, ctl) < 0)
+			return -1;
+	}
+}
+
+/*
+ * Receive one message from @fd without blocking and process the RDS
+ * control messages attached to it.
+ *
+ * @buffer/@size: where the payload is stored.
+ * @cookie:       filled in when an RDS_CMSG_RDMA_DEST cmsg is present.
+ * @sin:          receives the sender's address.
+ * @tstamp:       receives the time right after recvmsg() returned.
+ * @tasks:        task array; congestion updates clear the 'congested'
+ *                flag of matching tasks and RDMA status notifications
+ *                are passed to rdma_mark_completed().
+ *
+ * Returns the recvmsg() result: negative on error, otherwise the payload
+ * length.  Dies on short payloads, short addresses and undersized cmsgs.
+ */
+static int recv_message(int fd,
+		void *buffer, size_t size,
+		rds_rdma_cookie_t *cookie,
+		struct sockaddr_in *sin,
+		struct timeval *tstamp,
+		struct task *tasks)
+{
+	struct cmsghdr *cmsg;
+	char cmsgbuf[256];
+	struct msghdr msg;
+	struct iovec iov;
+	ssize_t ret;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_name = (struct sockaddr *) sin;
+	msg.msg_namelen = sizeof(struct sockaddr_in);
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsgbuf;
+	msg.msg_controllen = sizeof(cmsgbuf);
+	iov.iov_base = buffer;
+	iov.iov_len = size;
+
+	ret = recvmsg(fd, &msg, MSG_DONTWAIT);
+	gettimeofday(tstamp, NULL);
+
+	if (ret < 0)
+		return ret;
+	/* A zero-length message is accepted here; anything else must at
+	 * least hold a full header. */
+	if (ret && (size_t) ret < sizeof(struct header))
+		die("recvmsg() returned short data: %zd", ret);
+	if (msg.msg_namelen < sizeof(struct sockaddr_in))
+		die("socklen = %d < sizeof(sin) (%zu)\n",
+		    msg.msg_namelen, sizeof(struct sockaddr_in));
+
+	/* See if the message comes with a RDMA destination */
+	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+		struct rds_rdma_notify notify;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_CONG_UPDATE:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(uint64_t)))
+				die("RDS_CMSG_CONG_UPDATE data too small");
+			else {
+				unsigned int i, port;
+				uint64_t mask;
+
+				/* Un-congest every task whose destination
+				 * port is covered by the update mask. */
+				memcpy(&mask, CMSG_DATA(cmsg), sizeof(mask));
+				for (i = 0; i < opt.nr_tasks; ++i) {
+					port = ntohs(tasks[i].dst_addr.sin_port);
+					if (mask & RDS_CONG_MONITOR_MASK(port))
+						tasks[i].congested = 0;
+				}
+			}
+			break;
+		case RDS_CMSG_RDMA_DEST:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)))
+				die("RDS_CMSG_RDMA_DEST data too small");
+			memcpy(cookie, CMSG_DATA(cmsg), sizeof(*cookie));
+			break;
+
+		case RDS_CMSG_RDMA_STATUS:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(notify)))
+				die("RDS_CMSG_RDMA_STATUS data too small");
+			memcpy(&notify, CMSG_DATA(cmsg), sizeof(notify));
+			rdma_mark_completed(tasks, notify.user_token, notify.status);
+			break;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Receive and process a single message on @fd.
+ *
+ * The sender's port selects the task.  The message size must match its
+ * op (request or ACK), and the header must match the expected sequence
+ * number, addresses and queue index.  For an ACK the round-trip time is
+ * recorded, t->pending is decremented and any RDMA key is passed to
+ * rdma_process_ack().  For a request the ACK header is built right away
+ * and queued by incrementing t->unacked.
+ *
+ * Returns the recv_message() result: negative on error, 0 when only
+ * control messages were received, otherwise the message length.
+ */
+static int recv_one(int fd, struct task *tasks,
+		struct options *opts,
+		struct child_control *ctl)
+{
+	char buf[max(opts->req_size, opts->ack_size)];
+	rds_rdma_cookie_t rdma_dest = 0;
+	struct sockaddr_in sin;
+	struct header hdr, in_hdr;
+	struct timeval tstamp;
+	struct task *t;
+	uint16_t expect_index;
+	int task_index;
+	ssize_t ret;
+
+	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks);
+	if (ret < 0)
+		return ret;
+
+	/* If we received only RDMA completions or cong updates,
+	 * ret will be 0 */
+	if (ret == 0)
+		return 0;
+
+	/* Map the source port to a task.  A port below the task range
+	 * yields a negative index, which must be rejected as well or it
+	 * would index in front of the tasks array. */
+	task_index = ntohs(sin.sin_port) - opts->starting_port - 1;
+	if (task_index < 0 || task_index >= opts->nr_tasks)
+		die("received bad task index %d\n", task_index);
+	t = &tasks[task_index];
+
+	/* make sure the incoming message's size matches its op */
+	decode_hdr(&in_hdr, (struct header *) buf);
+	switch(in_hdr.op) {
+	case OP_REQ:
+		stat_inc(&ctl->cur[S_REQ_RX_BYTES], ret);
+		if (ret != opts->req_size)
+			die("req size %zd, not %u\n", ret,
+			    opts->req_size);
+		expect_index = t->recv_index;
+		break;
+	case OP_ACK:
+		stat_inc(&ctl->cur[S_ACK_RX_BYTES], ret);
+		if (ret != opts->ack_size)
+			die("ack size %zd, not %u\n", ret,
+			    opts->ack_size);
+
+		/* This ACK should be for the oldest outstanding REQ */
+		expect_index = (t->send_index - t->pending + opts->req_depth) % opts->req_depth;
+		break;
+	default:
+		die("unknown op %u\n", in_hdr.op);
+	}
+
+	/*
+	 * Verify that the incoming header indicates that this
+	 * is the next in-order message to us.  We can't predict
+	 * op.
+	 */
+	hdr.op = in_hdr.op;
+	hdr.seq = t->recv_seq;
+	hdr.from_addr = sin.sin_addr.s_addr;
+	hdr.from_port = sin.sin_port;
+	hdr.to_addr = t->src_addr.sin_addr.s_addr;
+	hdr.to_port = t->src_addr.sin_port;
+	hdr.index = expect_index;
+
+	if (check_hdr(buf, ret, &hdr))
+		die("header from %s:%u to id %u bogus\n",
+		    inet_ntoa(sin.sin_addr), ntohs(sin.sin_port),
+		    ntohs(t->src_addr.sin_port));
+
+	if (hdr.op == OP_ACK) {
+		stat_inc(&ctl->cur[S_RTT_USECS],
+			 usec_sub(&tstamp, &t->send_time[expect_index]));
+		t->pending -= 1;
+
+		if (in_hdr.rdma_key)
+			rdma_process_ack(fd, &in_hdr, ctl);
+	} else {
+		struct header *ack_hdr;
+
+		/* Build the ACK header right away */
+		ack_hdr = &t->ack_header[t->recv_index];
+		build_header(t, ack_hdr, OP_ACK, t->recv_index);
+
+		/* The RDMA is performed at the time the ACK
+		 * message is sent. We need to mirror all
+		 * RDMA related header fields in our response
+		 * anyway, so that's a good place for send_ack
+		 * to pick them up from.
+		 */
+		if (rdma_dest)
+			in_hdr.rdma_key = rdma_dest;
+		if (in_hdr.rdma_key) {
+			rdma_validate(&in_hdr, opts);
+			rdma_build_ack(ack_hdr, &in_hdr);
+		}
+
+		t->unacked += 1;
+		t->recv_index = (t->recv_index + 1) % opts->req_depth;
+	}
+	t->recv_seq++;
+
+	return ret;
+}
+
+/*
+ * Main loop of one worker child.
+ *
+ * Sets up one struct task per peer task, opens the RDS socket through
+ * rds_socket(), flags itself ready in @ctl and waits for the parent to
+ * publish a start time.  It then loops: poll the socket, drain incoming
+ * messages, and try to send ACKs and requests for every task.
+ * This function never returns; the loop only ends when a helper exits.
+ */
+static void run_child(pid_t parent_pid, struct child_control *ctl,
+ struct options *opts, uint16_t id)
+{
+ struct sockaddr_in sin;
+ struct pollfd pfd;
+ int fd;
+ uint16_t i;
+ ssize_t ret;
+ struct task tasks[opts->nr_tasks];
+ struct timeval start;
+
+ /* Our own address: port starting_port + 1 + id on the receive address. */
+ sin.sin_family = AF_INET;
+ sin.sin_port = htons(opts->starting_port + 1 + id);
+ sin.sin_addr.s_addr = htonl(opts->receive_addr);
+
+ /* give main display thread a little edge? */
+ nice(5);
+
+ /* Per-task state. The per-slot arrays (one entry per request in
+ * flight) are carved from this function's stack with alloca(). */
+ memset(tasks, 0, sizeof(tasks));
+ for (i = 0; i < opts->nr_tasks; i++) {
+ tasks[i].nr = i;
+ tasks[i].src_addr = sin;
+ tasks[i].dst_addr.sin_family = AF_INET;
+ tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
+ tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
+ tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
+ tasks[i].rdma_req_key = alloca(opts->req_depth * sizeof(uint64_t));
+ tasks[i].rdma_inflight = alloca(opts->req_depth * sizeof(uint8_t));
+ tasks[i].rdma_buf = alloca(opts->req_depth * sizeof(uint64_t *));
+ tasks[i].local_buf = alloca(opts->req_depth * sizeof(uint64_t *));
+ tasks[i].ack_header = alloca(opts->req_depth * sizeof(struct header));
+ /* Odd tasks start with an RDMA read, even tasks with a write. */
+ tasks[i].rdma_next_op = (i & 1)? RDMA_OP_READ : RDMA_OP_WRITE;
+ }
+
+ if (opts->rdma_size)
+ alloc_rdma_buffers(tasks, opts);
+
+ fd = rds_socket(opts, &sin);
+
+ /* Tell the parent we are set up, then wait for it to fill in the
+ * start time. */
+ ctl->ready = 1;
+
+ while (ctl->start.tv_sec == 0) {
+ check_parent(parent_pid);
+ sleep(1);
+ }
+
+ /* sleep until we're supposed to start */
+ gettimeofday(&start, NULL);
+ if (tv_cmp(&start, &ctl->start) < 0)
+ usleep(usec_sub(&ctl->start, &start));
+
+ sin.sin_family = AF_INET;
+
+ pfd.fd = fd;
+ pfd.events = POLLIN | POLLOUT;
+ while (1) {
+ struct task *t;
+ int can_send;
+
+ check_parent(parent_pid);
+
+ ret = poll(&pfd, 1, -1);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ die_errno("poll failed");
+ }
+
+ /* Only wait for writability again if a send fails below. */
+ pfd.events = POLLIN;
+
+ if (pfd.revents & POLLIN) {
+ /* Drain the socket until recv_one() reports an error. */
+ while (recv_one(fd, tasks, opts, ctl) >= 0)
+ ;
+ }
+
+ /* keep the pipeline full */
+ can_send = !!(pfd.revents & POLLOUT);
+ for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
+ if (opt.use_cong_monitor && t->congested)
+ continue;
+ if (t->drain_rdmas)
+ continue;
+ if (send_anything(fd, t, opts, ctl, can_send) < 0) {
+ pfd.events |= POLLOUT;
+
+ /* If the send queue is full, we will see EAGAIN.
+ * If a particular destination is congested, the
+ * kernel will return ENOBUFS. In the former case,
+ * there's no point in trying other destinations;
+ * in the latter case we certainly want to try
+ * sending to other tasks.
+ *
+ * It would be nice if we could map the congestion
+ * map into user space :-)
+ */
+ if (errno == ENOBUFS)
+ t->congested = 1;
+ else if (errno == EBADSLT)
+ t->drain_rdmas = 1;
+ else
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * Fork one worker child per task and wait until all of them are ready.
+ *
+ * The per-child control structures live in an anonymous shared mapping
+ * so that parent and children see each other's updates.  Each child
+ * closes its copy of the control connection and enters run_child().
+ *
+ * Returns the array of opts->nr_tasks control structures.  Dies if the
+ * mapping or a fork fails, or if a child exits before reporting ready.
+ */
+static struct child_control *start_children(struct options *opts)
+{
+	struct child_control *ctl;
+	pid_t parent = getpid();
+	pid_t pid;
+	size_t len;
+	uint32_t i;
+
+	len = opts->nr_tasks * sizeof(*ctl);
+	/* Anonymous mappings take fd -1; some systems require it. */
+	ctl = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED,
+		   -1, 0);
+	if (ctl == MAP_FAILED)
+		die("mmap of %u child control structs failed", opts->nr_tasks);
+
+	memset(ctl, 0, len);
+
+	init_msg_pattern(opts);
+
+	if (opts->rdma_key_o_meter)
+		rdma_key_o_meter_init(opts->nr_tasks);
+
+	for (i = 0; i < opts->nr_tasks; i++) {
+		pid = fork();
+		if (pid == -1)
+			die_errno("forking child nr %u failed", i);
+		if (pid == 0) {
+			/* Only the first child prints warnings. */
+			opts->suppress_warnings = (i > 0);
+			if (control_fd >= 0) {
+				close(control_fd);
+				control_fd = -1;
+			}
+			rdma_key_o_meter_set_self(i);
+			run_child(parent, ctl + i, opts, i);
+			exit(0);
+		}
+		ctl[i].pid = pid;
+	}
+
+	/* Wait for each child in turn to report ready, polling once per
+	 * second.  Any child being reaped in the meantime is fatal. */
+	i = 0;
+	while (i < opts->nr_tasks) {
+		if (ctl[i].ready) {
+			i++;
+			continue;
+		}
+		pid = waitpid(-1, NULL, WNOHANG);
+		if (pid)
+			die("child %u (pid %u) exited\n", i, pid);
+		sleep(1);
+	}
+
+	return ctl;
+}
+
+/* Mean of the samples in @ctr (sum / nr), or 0.0 when it holds none. */
+static double avg(struct counter *ctr)
+{
+	if (!ctr->nr)
+		return 0.0;
+
+	return (double)ctr->sum / (double)ctr->nr;
+}
+
+/* Total message bytes in @disp: requests and ACKs, sent and received. */
+static double throughput(struct counter *disp)
+{
+	return disp[S_REQ_TX_BYTES].sum
+		+ disp[S_REQ_RX_BYTES].sum
+		+ disp[S_ACK_TX_BYTES].sum
+		+ disp[S_ACK_RX_BYTES].sum;
+}
+
+/* Total RDMA bytes in @disp: writes plus reads. */
+static double throughput_rdma(struct counter *disp)
+{
+	return disp[S_RDMA_WRITE_BYTES].sum
+		+ disp[S_RDMA_READ_BYTES].sum;
+}
+
+/*
+ * Compute per-interval statistics over the first @nr_tasks children.
+ *
+ * @disp is zeroed, then for every child the difference between its
+ * current counters and the values saved by the previous snapshot is
+ * added to the nr/sum fields.  The current counters are copied first so
+ * that a consistent set is used for both the delta and the new 'last'.
+ *
+ * NOTE(review): min/max are assigned rather than merged, so after the
+ * loop they reflect only the last child - confirm whether intended.
+ */
+void stat_snapshot(struct counter *disp, struct child_control *ctl,
+		uint16_t nr_tasks)
+{
+	struct counter now[NR_STATS];
+	struct child_control *child;
+	uint16_t task, stat;
+
+	memset(disp, 0, sizeof(now));
+
+	for (task = 0; task < nr_tasks; task++) {
+		child = &ctl[task];
+		memcpy(now, child->cur, sizeof(now));
+
+		for (stat = 0; stat < NR_STATS; stat++) {
+			disp[stat].nr += now[stat].nr - child->last[stat].nr;
+			disp[stat].sum += now[stat].sum - child->last[stat].sum;
+			disp[stat].min = minz(now[stat].min, child->last[stat].min);
+			disp[stat].max = max(now[stat].max, child->last[stat].max);
+		}
+
+		memcpy(child->last, now, sizeof(now));
+	}
+}
+
+/*
+ * Fold the NR_STATS counters in @cur into the running totals in @accum:
+ * nr and sum are added, min and max are merged via minz() and max().
+ */
+void stat_accumulate(struct counter *accum, const struct counter *cur)
+{
+	uint16_t idx;
+
+	for (idx = 0; idx < NR_STATS; idx++) {
+		accum[idx].nr += cur[idx].nr;
+		accum[idx].sum += cur[idx].sum;
+		accum[idx].min = minz(accum[idx].min, cur[idx].min);
+		accum[idx].max = max(accum[idx].max, cur[idx].max);
+	}
+}
+
+/*
+ * Sum the current counters of the first @nr_tasks children into @disp.
+ * @disp is zeroed first; nr and sum are added up, min and max are
+ * merged across children via minz() and max().
+ */
+void stat_total(struct counter *disp, struct child_control *ctl,
+		uint16_t nr_tasks)
+{
+	struct counter *total;
+	struct counter *cur;
+	uint16_t task, stat;
+
+	memset(disp, 0, NR_STATS * sizeof(struct counter));
+
+	for (task = 0; task < nr_tasks; task++) {
+		cur = ctl[task].cur;
+		for (stat = 0; stat < NR_STATS; stat++) {
+			total = &disp[stat];
+			total->nr += cur[stat].nr;
+			total->sum += cur[stat].sum;
+			total->min = minz(total->min, cur[stat].min);
+			total->max = max(total->max, cur[stat].max);
+		}
+	}
+}
+
+/*
+ * Estimate CPU use from the soaker processes.
+ *
+ * Walks @soak_arr up to the first entry whose per_sec is zero.  For
+ * each soaker, per_sec is added to the capacity and the progress of its
+ * counter since the previous call (capped at per_sec) to the soaked
+ * amount; the counter value is remembered for the next call.
+ *
+ * Returns the share of capacity that was not soaked up, as a
+ * percentage, or -1.0 when @soak_arr is NULL.
+ */
+static double cpu_use(struct soak_control *soak_arr)
+{
+	struct soak_control *cur = soak_arr;
+	uint64_t capacity = 0;
+	uint64_t soaked = 0;
+	uint64_t seen;
+	uint64_t delta;
+
+	if (soak_arr == NULL)
+		return -1.0;
+
+	while (cur && cur->per_sec) {
+		capacity += cur->per_sec;
+		/* Read the shared counter once per soaker. */
+		seen = cur->counter;
+		delta = seen - cur->last;
+		soaked += min(cur->per_sec, delta);
+		cur->last = seen;
+		cur++;
+	}
+
+	return (double)(capacity - soaked) * 100 / (double)capacity;
+}
+
+/*
+ * Sample system-wide CPU time and interrupt counts from /proc/stat.
+ *
+ * With @initialize set, only the CSV column names are printed and the
+ * sample is stored as the baseline.  Otherwise the difference to the
+ * previous sample is printed as CSV fields: user+nice, system,
+ * idle+iowait and irq+softirq as percentages of all ticks elapsed in
+ * the interval, followed by the number of interrupts.
+ *
+ * If /proc/stat cannot be opened a message is printed once and all
+ * later calls return immediately.
+ */
+static void
+get_stats(int initialize)
+{
+#define NTIMES 8
+	struct sys_stats {
+		/* Where we spent our time */
+		unsigned long long times[NTIMES];
+		unsigned long long other;
+
+		/* Interrupt count */
+		unsigned long long intr;
+	};
+	static struct sys_stats prev, current;
+	static int disable = 0;
+	char buffer[2048];
+	FILE *fp;
+
+	if (disable)
+		return;
+	if ((fp = fopen("/proc/stat", "r")) == NULL) {
+		fprintf(stderr, "Cannot open /proc/stat (%s) - "
+				"not printing cpu stats\n",
+				strerror(errno));
+		disable = 1;
+		return;
+	}
+
+	memset(&current, 0, sizeof(current));
+	while (fgets(buffer, sizeof(buffer), fp)) {
+		if (!strncmp(buffer, "cpu ", 4)) {
+			char *s = buffer + 4;
+			int j;
+
+			/* The first NTIMES fields are kept individually,
+			 * any further fields are lumped into 'other'. */
+			for (j = 0; 1; ++j) {
+				unsigned long long v;
+
+				while (*s == ' ')
+					++s;
+				if (!isdigit((unsigned char) *s))
+					break;
+				v = strtoull(s, &s, 10);
+				if (j < NTIMES)
+					current.times[j] = v;
+				else
+					current.other += v;
+			}
+		} else if (!strncmp(buffer, "intr ", 5)) {
+			sscanf(buffer + 5, "%llu", &current.intr);
+		}
+	}
+	fclose(fp);
+
+	if (initialize) {
+		printf(",user:percent,system:percent,idle:percent"
+		       ",irq:percent,intr:count");
+	} else {
+		struct sys_stats sys;
+		unsigned long long sum;
+		double scale;
+		int j;
+
+		/* Percentages are relative to the ticks that elapsed in
+		 * this interval, so sum up the deltas, not the absolute
+		 * counters. */
+		sum = sys.other = current.other - prev.other;
+		for (j = 0; j < NTIMES; ++j) {
+			sys.times[j] = current.times[j] - prev.times[j];
+			sum += sys.times[j];
+		}
+		sys.intr = current.intr - prev.intr;
+
+		scale = sum? 100.0 / sum : 0;
+
+		/* Magic procfs offsets
+		 *  0	user
+		 *  1	nice
+		 *  2	system
+		 *  3	idle
+		 *  4	iowait
+		 *  5	irq
+		 *  6	softirq
+		 */
+		printf(",%f,%f,%f,%f,%llu",
+			(sys.times[0] + sys.times[1]) * scale,
+			sys.times[2] * scale,
+			(sys.times[3] + sys.times[4]) * scale,
+			(sys.times[5] + sys.times[6]) * scale,
+			sys.intr);
+	}
+	prev = current;
+}
+
+/*
+ * Fetch the RDS counters via getsockopt(RDS_INFO_COUNTERS) and print
+ * them as CSV fields, followed by the output of get_stats().
+ *
+ * With @initialize set, the counter names are printed, each tagged
+ * ":bytes" when the name contains "_bytes" and ":count" otherwise.
+ * Otherwise the change of every counter since the previous call is
+ * printed, scaled to a per-second rate.
+ *
+ * The socket and all buffers are created on first use and kept in
+ * static variables for later calls.
+ */
+static void
+get_perfdata(int initialize)
+{
+	static struct timeval last_ts, now;
+	static struct rds_info_counter *prev, *ctr;
+	static unsigned char *curr = NULL;
+	static socklen_t buflen = 0;
+	static int sock_fd = -1;
+	int i, count, item_size;
+
+	if (sock_fd < 0) {
+		sock_fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
+		if (sock_fd < 0)
+			die_errno("Unable to create socket");
+	}
+
+	/* We should only loop once on the first call; after that the
+	 * buffer requirements for RDS counters should not change. */
+	while ((item_size = getsockopt(sock_fd, SOL_RDS, RDS_INFO_COUNTERS, curr, &buflen)) < 0) {
+		unsigned char *grown;
+
+		if (errno != ENOSPC)
+			die_errno("getsockopt(RDS_INFO_COUNTERS) failed");
+		grown = realloc(curr, buflen);
+		if (!grown)
+			die_errno("Cannot allocate buffer for stats counters");
+		curr = grown;
+	}
+
+	/* item_size is used as a divisor and as a copy length below, so
+	 * it must be positive and fit into our counter struct. */
+	if (item_size == 0 || (size_t) item_size > sizeof(*ctr))
+		die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zu)\n",
+		    item_size, sizeof(*ctr));
+	count = buflen / item_size;
+
+	if (prev == NULL) {
+		/* First call - allocate buffer */
+		size_t slots = count > 0 ? (size_t) count : 1;
+
+		prev = calloc(slots, sizeof(*ctr));
+		ctr = calloc(slots, sizeof(*ctr));
+		if (!prev || !ctr)
+			die_errno("Cannot allocate buffer for stats counters");
+	}
+
+	/* The kernel's items may be smaller than our struct; copy each
+	 * one into its own full-sized slot. */
+	for (i = 0; i < count; ++i)
+		memcpy(ctr + i, curr + i * item_size, item_size);
+
+	gettimeofday(&now, NULL);
+
+	if (initialize) {
+		for (i = 0; i < count; ++i) {
+			printf(",%s", ctr[i].name);
+			if (strstr((char *) ctr[i].name, "_bytes"))
+				printf(":bytes");
+			else
+				printf(":count");
+		}
+	} else {
+		double scale;
+
+		scale = 1e6 / usec_sub(&now, &last_ts);
+		for (i = 0; i < count; ++i) {
+			printf(",%f",
+				(ctr[i].value - prev[i].value) * scale);
+		}
+	}
+
+	memcpy(prev, ctr, count * sizeof(*ctr));
+	last_ts = now;
+
+	get_stats(initialize);
+}
+
+/*
+ * Wait for any child process, passing @wflags on to waitpid().
+ *
+ * Returns 0 if waitpid() reported no child (possible with WNOHANG) and
+ * 1 if a child exited with status 0 or was terminated by SIGTERM.
+ * Every other outcome, including a waitpid() failure, ends in die().
+ */
+static int reap_one_child(int wflags)
+{
+	int status;
+	pid_t child = waitpid(-1, &status, wflags);
+
+	if (child < 0)
+		die("waitpid returned %u", child);
+	if (child == 0)
+		return 0;
+
+	if (WIFEXITED(status)) {
+		if (WEXITSTATUS(status) != 0)
+			die("child pid %u exited with status %d\n",
+			    child, WEXITSTATUS(status));
+		return 1;
+	}
+	if (WIFSIGNALED(status)) {
+		if (WTERMSIG(status) != SIGTERM)
+			die("child pid %u exited with signal %d\n",
+			    child, WTERMSIG(status));
+		return 1;
+	}
+	die("child pid %u wait status %d\n", child, status);
+}
+
+/*
+ * Release the worker children, report statistics about once a second
+ * while they run, then stop them and print a summary line.
+ *
+ * @opts:     run options (task count, run time, output mode, ...).
+ * @ctl:      shared per-child control blocks from start_children().
+ * @soak_arr: soaker control blocks used for CPU accounting; may be NULL.
+ * @active:   non-zero on the initiating side, which sleeps between
+ *            samples and honours opts->run_time; zero on the passive
+ *            side, which stops as soon as poll() reports an event on
+ *            the control connection.
+ */
+static void release_children_and_wait(struct options *opts,
+ struct child_control *ctl,
+ struct soak_control *soak_arr,
+ int active)
+{
+ struct counter disp[NR_STATS];
+ struct counter summary[NR_STATS];
+ struct timeval start, end, now, first_ts, last_ts;
+ double cpu_total = 0;
+ uint16_t i, cpu_samples = 0;
+ uint16_t nr_running;
+
+ /* Publish a start time two seconds from now to all children. */
+ gettimeofday(&start, NULL);
+ start.tv_sec += 2;
+ for (i = 0; i < opts->nr_tasks; i++)
+ ctl[i].start = start;
+
+ /* Allow for a 4 second delay: 2 seconds for the children
+ * to come up, and 2 more of burn-in time
+ */
+ printf("Starting up"); fflush(stdout);
+ for (i = 0; i < 4; ++i) {
+ sleep(1);
+ /* Results are discarded; these calls only advance the
+ * 'last' baselines kept for the next snapshot. */
+ stat_snapshot(disp, ctl, opts->nr_tasks);
+ cpu_use(soak_arr);
+ printf(".");
+ fflush(stdout);
+ }
+ printf("\n");
+
+ /* Only the active side enforces a run time limit. */
+ gettimeofday(&first_ts, NULL);
+ if (opts->run_time && active) {
+ end = first_ts;
+ end.tv_sec += opts->run_time;
+ } else {
+ timerclear(&end);
+ }
+
+ nr_running = opts->nr_tasks;
+ memset(summary, 0, sizeof(summary));
+
+ if (opts->rtprio)
+ set_rt_priority();
+
+ /* Prime the perf data counters and display the CSV header line
+ * You can filter the CSV data from the rds-stress output by
+ * grepping for the "::" marker.
+ */
+ if (opt.show_perfdata) {
+ printf("::");
+ printf("nr_tasks:count"
+ ",req_size:bytes"
+ ",ack_size:bytes"
+ ",rdma_size:bytes");
+
+ printf(",req_sent:count"
+ ",thruput:kB/s"
+ ",thruput_rdma:kB/s"
+ ",tx_delay:microseconds"
+ ",rtt:microseconds"
+ ",cpu:percent");
+ get_perfdata(1);
+ printf("\n");
+ } else {
+ printf("%4s %6s %10s %10s %7s %8s %5s\n",
+ "tsks", "tx/s", "tx+rx K/s", "rw+rr K/s",
+ "tx us/c", "rtt us", "cpu %");
+ }
+
+ last_ts = first_ts;
+ while (nr_running) {
+ double cpu;
+
+ if (active) {
+ sleep(1);
+ } else {
+ struct pollfd pfd;
+
+ /* Wait up to one second; any event on the control
+ * connection ends the reporting loop. */
+ pfd.fd = control_fd;
+ pfd.events = POLLIN|POLLHUP;
+ if (poll(&pfd, 1, 1000) == 1)
+ break;
+ }
+
+ /* XXX big bug, need to mark some ctl elements dead */
+ stat_snapshot(disp, ctl, nr_running);
+ gettimeofday(&now, NULL);
+ cpu = cpu_use(soak_arr);
+
+ if (!opts->summary_only) {
+ double scale;
+
+ /* Every loop takes a little more than one second;
+ * and system load can actually introduce latencies.
+ * So try to measure the actual time elapsed as precise
+ * as possible, and scale all values by its inverse.
+ */
+ scale = 1e6 / usec_sub(&now, &last_ts);
+
+ if (!opt.show_perfdata) {
+ printf("%4u %6"PRIu64" %10.2f %10.2f %7.2f %8.2f %5.2f\n",
+ nr_running,
+ disp[S_REQ_TX_BYTES].nr,
+ scale * throughput(disp) / 1024.0,
+ scale * throughput_rdma(disp) / 1024.0,
+ scale * avg(&disp[S_SENDMSG_USECS]),
+ scale * avg(&disp[S_RTT_USECS]),
+ scale * cpu);
+ } else {
+ printf("::");
+ printf("%u,%u,%u,%u,",
+ opts->nr_tasks, opts->req_size,
+ opts->ack_size, opts->rdma_size);
+
+ printf("%Lu,%f,%f,%f,%f,%f",
+ (unsigned long long) disp[S_REQ_TX_BYTES].nr,
+ scale * throughput(disp) / 1024.0,
+ scale * throughput_rdma(disp) / 1024.0,
+ scale * avg(&disp[S_SENDMSG_USECS]),
+ scale * avg(&disp[S_RTT_USECS]),
+ cpu >= 0? scale * cpu : 0);
+
+ /* Print RDS perf counters etc */
+ get_perfdata(0);
+ printf("\n");
+ }
+
+ rdma_key_o_meter_check(opts->nr_tasks);
+ }
+
+ /* Fold this interval into the totals for the summary line. */
+ stat_accumulate(summary, disp);
+ cpu_total += cpu;
+ cpu_samples++;
+ last_ts = now;
+
+ if (timerisset(&end) && timercmp(&now, &end, >=))
+ break;
+
+ /* see if any children have finished or died.
+ * This is a bit touchy - we should really be
+ * able to tell an exited soaker from an exiting
+ * RDS child. */
+ if (reap_one_child(WNOHANG))
+ nr_running--;
+ }
+
+ close(control_fd);
+ control_fd = -1;
+
+ /* Children still running at this point are terminated. */
+ if (nr_running) {
+ for (i = 0; i < opts->nr_tasks; i++)
+ kill(ctl[i].pid, SIGTERM);
+ stop_soakers(soak_arr);
+ }
+
+ while (nr_running && reap_one_child(0))
+ nr_running--;
+
+ rdma_key_o_meter_check(opts->nr_tasks);
+
+ stat_total(disp, ctl, opts->nr_tasks);
+ if (!opts->summary_only)
+ printf("---------------------------------------------\n");
+ {
+ double scale;
+
+ /* Scale by the time between the first and the last sample. */
+ scale = 1e6 / usec_sub(&last_ts, &first_ts);
+
+ /* NOTE(review): the RDMA column is computed from disp (the
+ * stat_total() result) while the other columns use summary -
+ * confirm that this is intended. */
+ printf("%4u %6lu %10.2f %10.2f %7.2f %8.2f %5.2f (average)\n",
+ opts->nr_tasks,
+ (long) (scale * summary[S_REQ_TX_BYTES].nr),
+ scale * throughput(summary) / 1024.0,
+ scale * throughput_rdma(disp) / 1024.0,
+ avg(&summary[S_SENDMSG_USECS]),
+ avg(&summary[S_RTT_USECS]),
+ soak_arr? scale * cpu_total : -1.0);
+ }
+}
+
+/*
+ * Connect the control socket @fd to the peer at @sin.
+ *
+ * When the connection is refused or the host or network is unreachable,
+ * the attempt is repeated once per second, up to opt.connect_retries
+ * times.  Any other error, or running out of retries, is fatal.
+ * Progress is printed to stdout.
+ */
+static void peer_connect(int fd, const struct sockaddr_in *sin)
+{
+	int retries = 0;
+
+	printf("connecting to %s:%u",
+			inet_ntoa(sin->sin_addr),
+			ntohs(sin->sin_port));
+	fflush(stdout);
+
+	while (connect(fd, (struct sockaddr *) sin, sizeof(*sin))) {
+		/* Capture errno before any stdio call can overwrite it. */
+		int err = errno;
+
+		if (retries == 0)
+			printf(": %s", strerror(err));
+
+		switch (err) {
+		case ECONNREFUSED:
+		case EHOSTUNREACH:
+		case ENETUNREACH:
+			if (retries >= opt.connect_retries)
+				break;
+			if (retries++ == 0)
+				printf(" - retrying");
+			printf(".");
+			fflush(stdout);
+			sleep(1);
+			continue;
+		}
+
+		printf("\n");
+		die("connect(%s) failed", inet_ntoa(sin->sin_addr));
+	}
+	printf("\n");
+}
+
+/*
+ * Write exactly @size bytes from @ptr to @fd, repeating write() after
+ * partial writes.  Dies if write() fails.
+ */
+static void peer_send(int fd, const void *ptr, size_t size)
+{
+	/* Pointer arithmetic on void * is a GNU extension; walk the
+	 * buffer through a char pointer instead. */
+	const char *cursor = ptr;
+	ssize_t ret;
+
+	while (size) {
+		ret = write(fd, cursor, size);
+		if (ret < 0)
+			die_errno("Cannot send to peer");
+		size -= ret;
+		cursor += ret;
+	}
+}
+
+/*
+ * Read exactly @size bytes from @fd into @ptr, repeating read() after
+ * partial reads.  Dies if read() fails or the peer closes the
+ * connection before all bytes have arrived.
+ */
+static void peer_recv(int fd, void *ptr, size_t size)
+{
+	/* Pointer arithmetic on void * is a GNU extension; walk the
+	 * buffer through a char pointer instead. */
+	char *cursor = ptr;
+	ssize_t ret;
+
+	while (size) {
+		ret = read(fd, cursor, size);
+		if (ret < 0)
+			die_errno("Cannot recv from peer");
+		if (ret == 0)
+			die("Peer unexpectedly closed connection\n");
+		size -= ret;
+		cursor += ret;
+	}
+}
+
+/*
+ * Convert the options in @src into the form sent to the peer and store
+ * them in @dst: multi-byte fields go through htonl()/htons(), byte
+ * sized fields are copied unchanged.  Every field must be listed here
+ * and in decode_options().
+ */
+static void encode_options(struct options *dst, const struct options *src)
+{
+	/* Fields converted with htonl() */
+	dst->req_depth = htonl(src->req_depth);
+	dst->req_size = htonl(src->req_size);
+	dst->ack_size = htonl(src->ack_size);
+	dst->rdma_size = htonl(src->rdma_size);
+	dst->send_addr = htonl(src->send_addr);
+	dst->receive_addr = htonl(src->receive_addr);
+	dst->run_time = htonl(src->run_time);
+	dst->rdma_alignment = htonl(src->rdma_alignment);
+	dst->connect_retries = htonl(src->connect_retries);
+
+	/* Fields converted with htons() */
+	dst->starting_port = htons(src->starting_port);
+	dst->nr_tasks = htons(src->nr_tasks);
+
+	/* Byte sized fields need no conversion */
+	dst->summary_only = src->summary_only;
+	dst->rtprio = src->rtprio;
+	dst->tracing = src->tracing;
+	dst->verify = src->verify;
+	dst->show_params = src->show_params;
+	dst->show_perfdata = src->show_perfdata;
+	dst->use_cong_monitor = src->use_cong_monitor;
+	dst->rdma_use_once = src->rdma_use_once;
+	dst->rdma_use_get_mr = src->rdma_use_get_mr;
+	dst->rdma_use_fence = src->rdma_use_fence;
+	dst->rdma_cache_mrs = src->rdma_cache_mrs;
+	dst->rdma_key_o_meter = src->rdma_key_o_meter;
+	dst->suppress_warnings = src->suppress_warnings;
+}
+
+/*
+ * Inverse of encode_options(): convert options received from the peer
+ * (@src) back with ntohl()/ntohs() and store them in @dst.  Each field
+ * is read and written independently, so @dst and @src may be the same
+ * object.
+ */
+static void decode_options(struct options *dst, const struct options *src)
+{
+	/* Fields converted with ntohl() */
+	dst->req_depth = ntohl(src->req_depth);
+	dst->req_size = ntohl(src->req_size);
+	dst->ack_size = ntohl(src->ack_size);
+	dst->rdma_size = ntohl(src->rdma_size);
+	dst->send_addr = ntohl(src->send_addr);
+	dst->receive_addr = ntohl(src->receive_addr);
+	dst->run_time = ntohl(src->run_time);
+	dst->rdma_alignment = ntohl(src->rdma_alignment);
+	dst->connect_retries = ntohl(src->connect_retries);
+
+	/* Fields converted with ntohs() */
+	dst->starting_port = ntohs(src->starting_port);
+	dst->nr_tasks = ntohs(src->nr_tasks);
+
+	/* Byte sized fields need no conversion */
+	dst->summary_only = src->summary_only;
+	dst->rtprio = src->rtprio;
+	dst->tracing = src->tracing;
+	dst->verify = src->verify;
+	dst->show_params = src->show_params;
+	dst->show_perfdata = src->show_perfdata;
+	dst->use_cong_monitor = src->use_cong_monitor;
+	dst->rdma_use_once = src->rdma_use_once;
+	dst->rdma_use_get_mr = src->rdma_use_get_mr;
+	dst->rdma_use_fence = src->rdma_use_fence;
+	dst->rdma_cache_mrs = src->rdma_cache_mrs;
+	dst->rdma_key_o_meter = src->rdma_key_o_meter;
+	dst->suppress_warnings = src->suppress_warnings;
+}
+
+/*
+ * Self-test for encode_options()/decode_options().
+ *
+ * The decode target starts out as the bitwise complement of @opts, so
+ * any byte the encode/decode round trip fails to write differs from the
+ * original and is caught by the final memcmp().  Dies on a mismatch.
+ */
+static void verify_option_encdec(const struct options *opts)
+{
+	struct options encoded, decoded;
+	unsigned char *bytes = (unsigned char *) &decoded;
+	unsigned int pos;
+
+	memcpy(&decoded, opts, sizeof(*opts));
+	for (pos = 0; pos < sizeof(*opts); ++pos)
+		bytes[pos] ^= 0xff;
+
+	encode_options(&encoded, opts);
+	decode_options(&decoded, &encoded);
+
+	if (memcmp(&decoded, opts, sizeof(*opts)) != 0)
+		die("encode/decode check of options struct failed");
+}
+
+/*
+ * Parent side of the initiating ("active") peer.
+ *
+ * Optionally prints the test parameters, connects the TCP control
+ * socket to the peer, sends it the encoded options, starts the worker
+ * children, exchanges a one-byte start handshake with the peer and then
+ * runs the reporting loop.  Returns 0; failures end in die().
+ */
+static int active_parent(struct options *opts, struct soak_control *soak_arr)
+{
+	struct options enc_options;
+	struct child_control *ctl;
+	struct sockaddr_in sin;
+	int fd;
+	/* Handshake byte sent to the peer; give it a defined value so no
+	 * uninitialized stack byte goes over the wire. */
+	uint8_t ok = 1;
+
+	if (opts->show_params) {
+		unsigned int k;
+
+		printf("Options:\n"
+		       "  %-10s %-7u\n"
+		       "  %-10s %-7u\n"
+		       "  %-10s %-7u\n"
+		       "  %-10s %-7u\n",
+		       "Tasks", opts->nr_tasks,
+		       "Req size", opts->req_size,
+		       "ACK size", opts->ack_size,
+		       "RDMA size", opts->rdma_size);
+
+		/* k counts the RDMA options that were printed */
+		k = 0;
+		printf("  %-10s", "RDMA opts");
+		if (opts->rdma_use_once) {
+			printf(" use_once"); ++k;
+		}
+		if (opts->rdma_use_get_mr) {
+			printf(" use_get_mr"); ++k;
+		}
+		if (opts->rdma_use_fence) {
+			printf(" use_fence"); ++k;
+		}
+		if (opts->rdma_cache_mrs) {
+			printf(" cache_mrs"); ++k;
+		}
+		if (opts->rdma_alignment) {
+			printf(" align=%u", opts->rdma_alignment); ++k;
+		}
+		if (!k)
+			printf(" (defaults)");
+		printf("\n");
+		printf("\n");
+	}
+
+	/* Make sure that when we add new options, we don't forget
+	 * to add them to the encode/decode routines. */
+	verify_option_encdec(opts);
+
+	/* Bind the control socket to our receive address... */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_port = htons(opts->starting_port);
+	sin.sin_addr.s_addr = htonl(opts->receive_addr);
+
+	fd = bound_socket(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sin);
+	control_fd = fd;
+
+	/* ...and connect it to the peer's control port. */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_port = htons(opts->starting_port);
+	sin.sin_addr.s_addr = htonl(opts->send_addr);
+
+	peer_connect(fd, &sin);
+
+	if (opts->receive_addr == 0) {
+		opts->receive_addr = get_local_address(fd, &sin);
+		if (opts->rdma_size && !check_rdma_support(opts))
+			die("RDMA not supported by this kernel\n");
+	}
+
+	/* "negotiation" is overstating things a bit :-)
+	 * We just tell the peer what options to use.
+	 */
+	encode_options(&enc_options, opts);
+	peer_send(fd, &enc_options, sizeof(struct options));
+
+	printf("negotiated options, tasks will start in 2 seconds\n");
+	ctl = start_children(opts);
+
+	/* Tell the peer to start up. This is necessary when testing
+	 * with a large number of tasks, because otherwise the peer
+	 * may start sending before we have all our tasks running.
+	 */
+	peer_send(fd, &ok, sizeof(ok));
+	peer_recv(fd, &ok, sizeof(ok));
+
+	release_children_and_wait(opts, ctl, soak_arr, 1);
+
+	return 0;
+}
+
+/*
+ * Parent side of the listening ("passive") peer.
+ *
+ * Accepts a single TCP control connection on @addr:@port, receives the
+ * options chosen by the initiating peer, swaps the address roles,
+ * starts the worker children, answers the start handshake and then
+ * runs the reporting loop.  Returns 0; failures end in die().
+ */
+static int passive_parent(uint32_t addr, uint16_t port,
+		struct soak_control *soak_arr)
+{
+	struct options peer_opts;
+	struct child_control *children;
+	struct sockaddr_in sa;
+	socklen_t sa_len;
+	int listen_fd, conn_fd;
+	uint8_t go;
+
+	sa.sin_family = AF_INET;
+	sa.sin_port = htons(port);
+	sa.sin_addr.s_addr = htonl(addr);
+
+	listen_fd = bound_socket(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sa);
+	if (listen(listen_fd, 255))
+		die_errno("listen() failed");
+
+	sa_len = sizeof(sa);
+	conn_fd = accept(listen_fd, (struct sockaddr *)&sa, &sa_len);
+	if (conn_fd < 0)
+		die_errno("accept() failed");
+	control_fd = conn_fd;
+
+	/* Only one peer is served, so stop listening right away. */
+	close(listen_fd);
+
+	printf("accepted connection from %s:%u", inet_ntoa(sa.sin_addr),
+	       ntohs(sa.sin_port));
+	if (addr == 0) {
+		/* No receive address was given: use the local address the
+		 * peer connected to. */
+		addr = get_local_address(control_fd, &sa);
+		printf(" on %s:%u", inet_ntoa(sa.sin_addr), ntohs(sa.sin_port));
+	}
+	printf("\n");
+
+	/* The options arrive encoded and are decoded in place. */
+	peer_recv(conn_fd, &peer_opts, sizeof(struct options));
+	decode_options(&peer_opts, &peer_opts);
+
+	/*
+	 * The addresses are given from the sender's point of view: its
+	 * receive address is where we send to, and we receive on our own
+	 * address.
+	 */
+	peer_opts.send_addr = peer_opts.receive_addr;
+	peer_opts.receive_addr = addr;
+	opt = peer_opts;
+
+	children = start_children(&peer_opts);
+
+	/* The initiating peer sends the "GO" byte; echo it back. */
+	peer_recv(conn_fd, &go, sizeof(go));
+	peer_send(conn_fd, &go, sizeof(go));
+
+	printf("negotiated options, tasks will start in 2 seconds\n");
+	release_children_and_wait(&peer_opts, children, soak_arr, 0);
+
+	return 0;
+}
+
+/*
+ * The soaker *constantly* spins calling getpid().  It tries to execute a
+ * second's worth of calls before checking that its parent is still alive.  It
+ * uses gettimeofday() to figure out the per-second rate of the series it just
+ * executed.  It always tries to work from the highest rate it ever saw.
+ *
+ * soak->counter is incremented once per call and soak->per_sec holds the
+ * best rate seen so far.  This function never returns.
+ */
+static void run_soaker(pid_t parent_pid, struct soak_control *soak)
+{
+	uint64_t i;
+	uint64_t per_sec;
+	struct timeval start;
+	struct timeval stop;
+	uint64_t usecs;
+
+	nice(20);
+
+	soak->per_sec = 1000;
+
+	while (1) {
+		gettimeofday(&start, NULL);
+		for (i = 0; i < soak->per_sec; i++) {
+			syscall(SYS_getpid);
+			soak->counter++;
+		}
+		gettimeofday(&stop, NULL);
+
+		usecs = usec_sub(&stop, &start);
+		/* If the batch finished within one clock tick there is no
+		 * rate to compute; dividing by zero would give infinity,
+		 * and converting that to an integer is undefined. */
+		if (usecs != 0) {
+			per_sec = (double)soak->per_sec * 1000000.0 / (double)usecs;
+
+			if (per_sec > soak->per_sec)
+				soak->per_sec = per_sec;
+		}
+
+		check_parent(parent_pid);
+	}
+}
+
+/*
+ * Fork one soaker process per online CPU.
+ *
+ * The control structures live in an anonymous shared mapping with one
+ * extra all-zero entry that terminates the array.  Returns the array;
+ * dies if the CPU count cannot be determined or if the mapping or a
+ * fork fails.
+ */
+struct soak_control *start_soakers(void)
+{
+	struct soak_control *soak_arr;
+	pid_t parent = getpid();
+	pid_t pid;
+	size_t len;
+	long nr_soak = sysconf(_SC_NPROCESSORS_ONLN);
+	long i;
+
+	/* sysconf() reports failure as -1; don't size the array from it */
+	if (nr_soak < 1)
+		die("cannot determine the number of online processors");
+
+	/* an extra terminating entry which will be all 0s */
+	len = (nr_soak + 1) * sizeof(struct soak_control);
+	/* Anonymous mappings take fd -1; some systems require it. */
+	soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE,
+			MAP_ANONYMOUS|MAP_SHARED, -1, 0);
+	if (soak_arr == MAP_FAILED)
+		die("mmap of %ld soak control structs failed", nr_soak);
+
+	memset(soak_arr, 0, len);
+
+	printf("started %ld cycle soaking processes\n", nr_soak);
+
+	for (i = 0; i < nr_soak; i++) {
+		pid = fork();
+		if (pid == -1)
+			die_errno("forking soaker nr %ld failed", i);
+		if (pid == 0) {
+			run_soaker(parent, soak_arr + i);
+			exit(0);
+		}
+		soak_arr[i].pid = pid;
+	}
+
+	return soak_arr;
+}
+
+/*
+ * Send SIGTERM to every soaker in @soak_arr and wait for each one to
+ * exit.  The number of entries is taken from the current count of
+ * online CPUs.  Does nothing when @soak_arr is NULL.
+ */
+void stop_soakers(struct soak_control *soak_arr)
+{
+	unsigned int nr_soak = sysconf(_SC_NPROCESSORS_ONLN);
+	unsigned int idx = 0;
+
+	if (!soak_arr)
+		return;
+
+	while (idx < nr_soak) {
+		pid_t victim = soak_arr[idx].pid;
+
+		kill(victim, SIGTERM);
+		waitpid(victim, NULL, 0);
+		++idx;
+	}
+}
+
+/*
+ * Validate a size option.  Dies if @size still holds the all-ones
+ * "not specified" value, or if it is smaller than @max, which despite
+ * its name acts as the minimum.  @desc and @opt name the setting and
+ * its command line option in the error messages.
+ *
+ * NOTE(review): @unspec is not used; the "not specified" value is
+ * hard-coded - confirm whether callers rely on that.
+ */
+void check_size(uint32_t size, uint32_t unspec, uint32_t max, char *desc,
+		char *opt)
+{
+	const uint32_t unset = (uint32_t)~0;
+
+	if (size == unset)
+		die("specify %s with %s\n", desc, opt);
+
+	if (size < max)
+		die("%s must be at least %u bytes\n", desc, max);
+}
+
+/*
+ * Codes for command line options that only have a long form; each is
+ * used as the 'val' of an entry in long_options[].  Numbering starts at
+ * 0x100, which is above every single-character option code.
+ */
+enum {
+ OPT_RDMA_USE_ONCE = 0x100,
+ OPT_RDMA_USE_GET_MR,
+ OPT_RDMA_USE_FENCE,
+ OPT_RDMA_USE_NOTIFY,
+ OPT_RDMA_CACHE_MRS,
+ OPT_RDMA_ALIGNMENT,
+ OPT_RDMA_KEY_O_METER,
+ OPT_SHOW_PARAMS,
+ OPT_CONNECT_RETRIES,
+ OPT_USE_CONG_MONITOR,
+ OPT_PERFDATA,
+};
+
+/*
+ * Long option table handed to getopt_long() in main().  The first group
+ * maps long names onto single-character option codes; the second group
+ * uses the OPT_* codes.  The table ends with an all-zero entry.
+ */
+static struct option long_options[] = {
+{ "req-bytes", required_argument, NULL, 'q' },
+{ "ack-bytes", required_argument, NULL, 'a' },
+{ "rdma-bytes", required_argument, NULL, 'D' },
+{ "tasks", required_argument, NULL, 't' },
+{ "depth", required_argument, NULL, 'd' },
+{ "recv-addr", required_argument, NULL, 'r' },
+{ "send-addr", required_argument, NULL, 's' },
+{ "port", required_argument, NULL, 'p' },
+{ "time", required_argument, NULL, 'T' },
+{ "report-cpu", no_argument, NULL, 'c' },
+{ "report-summary", no_argument, NULL, 'z' },
+{ "rtprio", no_argument, NULL, 'R' },
+{ "verify", no_argument, NULL, 'v' },
+{ "trace", no_argument, NULL, 'V' },
+
+/* Options without a single-character form */
+{ "rdma-use-once", required_argument, NULL, OPT_RDMA_USE_ONCE },
+{ "rdma-use-get-mr", required_argument, NULL, OPT_RDMA_USE_GET_MR },
+{ "rdma-use-fence", required_argument, NULL, OPT_RDMA_USE_FENCE },
+{ "rdma-use-notify", required_argument, NULL, OPT_RDMA_USE_NOTIFY },
+{ "rdma-cache-mrs", required_argument, NULL, OPT_RDMA_CACHE_MRS },
+{ "rdma-alignment", required_argument, NULL, OPT_RDMA_ALIGNMENT },
+{ "rdma-key-o-meter", no_argument, NULL, OPT_RDMA_KEY_O_METER },
+{ "show-params", no_argument, NULL, OPT_SHOW_PARAMS },
+{ "show-perfdata", no_argument, NULL, OPT_PERFDATA },
+{ "connect-retries", required_argument, NULL, OPT_CONNECT_RETRIES },
+{ "use-cong-monitor", required_argument, NULL, OPT_USE_CONG_MONITOR },
+
+/* Terminating entry */
+{ NULL }
+};
+
+/*
+ * Program entry point: parse the command line, then run as the passive
+ * side when no send address was given, or as the active side otherwise.
+ *
+ * Returns whatever passive_parent() or active_parent() returns.
+ */
+int main(int argc, char **argv)
+{
+	struct options opts;
+	struct soak_control *soak_arr = NULL;
+
+#ifdef DYNAMIC_PF_RDS
+	/* Discover PF_RDS/SOL_RDS once, and be done with it */
+	(void) discover_pf_rds();
+	(void) discover_sol_rds();
+#endif
+
+#ifdef _SC_PAGESIZE
+	sys_page_size = sysconf(_SC_PAGESIZE);
+#else
+	sys_page_size = 4096;
+#endif
+
+	/* We really want to see output when we redirect
+	 * stdout to a pipe. */
+	setlinebuf(stdout);
+
+	/*
+	 * Fill every field with all-ones bytes first, so that options which
+	 * were never given can be recognised by that value further down.
+	 * Fields with a fixed default are then set explicitly.
+	 */
+	memset(&opts, 0xff, sizeof(opts));
+
+	opts.receive_addr = 0;
+	opts.starting_port = 4000;
+	opts.ack_size = MIN_MSG_BYTES;
+	opts.req_size = 1024;
+	opts.run_time = 0;
+	opts.summary_only = 0;
+	opts.rtprio = 0;
+	opts.tracing = 0;
+	opts.verify = 0;
+	opts.rdma_size = 0;
+	opts.use_cong_monitor = 1;
+	opts.rdma_use_fence = 1;
+	opts.rdma_cache_mrs = 0;
+	opts.rdma_alignment = 0;
+	opts.rdma_key_o_meter = 0;
+	opts.show_params = 0;
+	opts.connect_retries = 0;
+	opts.show_perfdata = 0;
+
+	while (1) {
+		int c, index;
+
+		c = getopt_long(argc, argv, "+a:cD:d:hp:q:Rr:s:t:T:vVz",
+				long_options, &index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'a':
+			opts.ack_size = parse_ull(optarg, (uint32_t)~0);
+			break;
+		case 'c':
+			/*
+			 * Only start the soakers once: a repeated -c would
+			 * fork a second set of children and overwrite the
+			 * only pointer to the first set, which could then
+			 * never be stopped.
+			 */
+			if (!soak_arr)
+				soak_arr = start_soakers();
+			break;
+		case 'D':
+			opts.rdma_size = parse_ull(optarg, (uint32_t)~0);
+			break;
+		case 'd':
+			opts.req_depth = parse_ull(optarg, (uint32_t)~0);
+			break;
+		case 'p':
+			opts.starting_port = parse_ull(optarg,
+						       (uint16_t)~0);
+			break;
+		case 'q':
+			opts.req_size = parse_ull(optarg, (uint32_t)~0);
+			break;
+		case 'R':
+			opts.rtprio = 1;
+			break;
+		case 'r':
+			opts.receive_addr = parse_addr(optarg);
+			break;
+		case 's':
+			opts.send_addr = parse_addr(optarg);
+			break;
+		case 't':
+			opts.nr_tasks = parse_ull(optarg,
+						  (uint16_t)~0);
+			break;
+		case 'T':
+			opts.run_time = parse_ull(optarg, (uint32_t)~0);
+			break;
+		case 'z':
+			opts.summary_only = 1;
+			break;
+		case 'v':
+			opts.verify = 1;
+			break;
+		case 'V':
+			opts.tracing = 1;
+			break;
+		case OPT_USE_CONG_MONITOR:
+			opts.use_cong_monitor = parse_ull(optarg, 1);
+			break;
+		case OPT_RDMA_USE_ONCE:
+			opts.rdma_use_once = parse_ull(optarg, 1);
+			break;
+		case OPT_RDMA_USE_GET_MR:
+			opts.rdma_use_get_mr = parse_ull(optarg, 1);
+			break;
+		case OPT_RDMA_USE_FENCE:
+			opts.rdma_use_fence = parse_ull(optarg, 1);
+			break;
+		case OPT_RDMA_CACHE_MRS:
+			opts.rdma_cache_mrs = parse_ull(optarg, 1);
+			break;
+		case OPT_RDMA_USE_NOTIFY:
+			/* the argument is validated but not stored */
+			(void) parse_ull(optarg, 1);
+			break;
+		case OPT_RDMA_ALIGNMENT:
+			opts.rdma_alignment = parse_ull(optarg, sys_page_size);
+			break;
+		case OPT_RDMA_KEY_O_METER:
+			opts.rdma_key_o_meter = 1;
+			break;
+		case OPT_SHOW_PARAMS:
+			opts.show_params = 1;
+			break;
+		case OPT_CONNECT_RETRIES:
+			opts.connect_retries = parse_ull(optarg, (uint32_t)~0);
+			break;
+		case OPT_PERFDATA:
+			opts.show_perfdata = 1;
+			break;
+		case 'h':
+		case '?':
+		default:
+			usage();
+			break;
+		}
+	}
+
+	/*
+	 * 0xff is what the memset() above left in these fields when the
+	 * option was not given; derive their values from --rdma-cache-mrs
+	 * in that case, otherwise reject contradictory combinations.
+	 */
+	if (opts.rdma_use_once == 0xff)
+		opts.rdma_use_once = !opts.rdma_cache_mrs;
+	else if (opts.rdma_cache_mrs && opts.rdma_use_once)
+		die("option --rdma-cache-mrs conflicts with --rdma-use-once\n");
+	if (opts.rdma_use_get_mr == 0xff)
+		opts.rdma_use_get_mr = opts.rdma_cache_mrs;
+	else if (opts.rdma_cache_mrs && !opts.rdma_use_get_mr)
+		die("option --rdma-cache-mrs conflicts with --rdma-use-get-mr=0\n");
+
+	/* the passive parent will read options off the wire */
+	if (opts.send_addr == ~0)
+		return passive_parent(opts.receive_addr, opts.starting_port,
+				      soak_arr);
+
+	/* the active parent verifies and sends its options */
+	check_size(opts.ack_size, ~0, MIN_MSG_BYTES, "ack size", "-a");
+	check_size(opts.req_size, ~0, MIN_MSG_BYTES, "req size", "-q");
+
+	/* defaults */
+	if (opts.req_depth == ~0)
+		opts.req_depth = 1;
+	if (opts.nr_tasks == (uint16_t)~0)
+		opts.nr_tasks = 1;
+
+	if (opts.rdma_size && !check_rdma_support(&opts))
+		die("RDMA not supported by this kernel\n");
+
+	/* We require RDMA to be multiples of the page size for now.
+	 * this is just to simplify debugging, but eventually we
+	 * need to support rdma sizes from 1 to 1meg byte
+	 *
+	 * The "&& 0" keeps this rounding switched off.
+	 */
+	if (opts.rdma_size && 0)
+		opts.rdma_size = (opts.rdma_size + 4095) & ~4095;
+
+	/* copy the final options into the file-level opt */
+	opt = opts;
+	return active_parent(&opts, soak_arr);
+}
+
+/*
+ * These are completely stupid. options.c should be removed.
+ *
+ * Empty stand-ins for the print_usage()/print_version() hooks that
+ * rdstool.h lists as "Provided by C files with main()".
+ */
+void print_usage(int durr) { }
+void print_version(void) { }
diff --git a/rds-tools.spec b/rds-tools.spec
new file mode 100644
index 0000000..e49a728
--- /dev/null
+++ b/rds-tools.spec
@@ -0,0 +1,38 @@
+Summary: RDS support tools
+Name: rds-tools
+Version: 1.4
+Release: 1
+License: GPL/BSD
+Group: Applications/Internet
+URL: http://oss.oracle.com/projects/rds/
+Source: rds-tools-%{version}-%{release}.tar.gz
+BuildRoot: /var/tmp/rds-tools-%{version}-%{release}
+
+%description
+rds-tools is a collection of support tools for the RDS socket API.
+
+%prep
+%setup -n rds-tools-%{version}-%{release}
+
+%build
+%configure
+make %{?_smp_mflags}
+
+%install
+rm -rf $RPM_BUILD_ROOT
+make DESTDIR=$RPM_BUILD_ROOT install
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%defattr(-,root,root)
+%{_bindir}/*
+%{_mandir}/*
+%{_includedir}/*
+
+%changelog
+* Sun Nov 25 2007 Vladimir Sokolovsky <vlad at mellanox.co.il>
+- Use DESTDIR
+* Mon Oct 27 2006 Zach Brown <zach.brown at oracle.com>
+- initial version
diff --git a/rds-tools.spec.in b/rds-tools.spec.in
new file mode 100644
index 0000000..6dd8f32
--- /dev/null
+++ b/rds-tools.spec.in
@@ -0,0 +1,38 @@
+Summary: RDS support tools
+Name: rds-tools
+Version: @VERSION@
+Release: @RELEASE@
+License: GPL/BSD
+Group: Applications/Internet
+URL: http://oss.oracle.com/projects/rds/
+Source: rds-tools-%{version}-%{release}.tar.gz
+BuildRoot: /var/tmp/rds-tools-%{version}-%{release}
+
+%description
+rds-tools is a collection of support tools for the RDS socket API.
+
+%prep
+%setup -n rds-tools-%{version}-%{release}
+
+%build
+%configure
+make %{?_smp_mflags}
+
+%install
+rm -rf $RPM_BUILD_ROOT
+make DESTDIR=$RPM_BUILD_ROOT install
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%defattr(-,root,root)
+%{_bindir}/*
+%{_mandir}/*
+%{_includedir}/*
+
+%changelog
+* Sun Nov 25 2007 Vladimir Sokolovsky <vlad at mellanox.co.il>
+- Use DESTDIR
+* Mon Oct 27 2006 Zach Brown <zach.brown at oracle.com>
+- initial version
diff --git a/rds-tools.txt b/rds-tools.txt
new file mode 100644
index 0000000..2dac8b4
--- /dev/null
+++ b/rds-tools.txt
@@ -0,0 +1,39 @@
+
+
+So, rds-get-stats is easy and I already have it done. we'd just import
+that.
+
+rds-gen would just send down a socket. I'm hoping for options like:
+
+ -s addr:port
+ to bind the source address
+ -d addr:port
+ dest to send to, maybe just round-robin between multiple to
+ start?
+ -m units
+ the size of each sent message
+ -b units
+ the size of the socket buffer
+ -5
+ include an md5sum at the tail of each message
+ -f file
+ read from a file until eof
+ -p units
+ send from a memory pool of the given length
+ -S file
+ put the -p pool in this mmaped/mlocked file, use sendfile
+ -l units
+ only send this many bytes total
+ -i timespec
+ output vmstat-like line at this interval
+
+I guess that gives us enough to chew on for now :) I want this stuff to
+be dirt simple. trivial arg parser helpers, maybe some list.h from the
+kernel, no glib complexity explosion. I guess I could send you some
+snippets of code along those lines.
+
+Oh, and I guess we'll need a little helper amongst the tools to get
+pf_rds and sol_rds from /proc/sys/net/rds/.
+
+- z
+
diff --git a/rds.7 b/rds.7
new file mode 100644
index 0000000..1bfc1a2
--- /dev/null
+++ b/rds.7
@@ -0,0 +1,445 @@
+.TH RDS 7
+.SH NAME
+RDS \- Reliable Datagram Sockets
+.SH
+SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.B #include <netinet/in.h>
+.fi
+.SH DESCRIPTION
+This is an implementation of the RDS socket API. It provides reliable,
+in-order datagram delivery between sockets over a variety of transports.
+.PP
+Currently, RDS can be transported over Infiniband, and loopback.
+RDS over TCP is disabled, but will be re-enabled in the near future.
+.PP
+RDS uses standard
+.B AF_INET
+addresses as described in
+.BR ip (7)
+to identify end points.
+.\"------------------------------------------------------------------
+.SS Socket Creation
+RDS is still in development and as such does not have a reserved protocol
+family constant. Applications must read the string representation of the
+protocol family value from the
+.B pf_rds
+sysctl parameter file described below.
+.PP
+.nf
+.B rds_socket = socket(pf_rds, SOCK_SEQPACKET, 0);
+.fi
+.PP
+.\"------------------------------------------------------------------
+.SS Socket Options
+RDS sockets support a number of socket options through the
+.BR setsockopt (2)
+and
+.BR getsockopt (2)
+calls. The following generic options (with socket level
+.BR SOL_SOCKET )
+are of specific importance:
+.TP
+.B SO_RCVBUF
+Specifies the size of the receive buffer. See section on
+"Congestion Control" below.
+.TP
+.B SO_SNDBUF
+Specifies the size of the send buffer. See "Message Transmission"
+below.
+.TP
+.B SO_SNDTIMEO
+Specifies the send timeout when trying to enqueue a message on a
+socket with a full queue in blocking mode.
+.PP
+In addition to these, RDS supports a number of protocol specific
+options (with socket level
+.BR SOL_RDS ).
+Just as with the RDS protocol family, an official value has not been
+assigned yet, so the kernel will assign a value dynamically.
+The assigned value can be retrieved from the
+.B sol_rds
+sysctl parameter file.
+.PP
+RDS specific socket options will be described in a separate section
+below.
+.\"------------------------------------------------------------------
+.SS Binding
+A new RDS socket has no local address when it is first returned from
+.BR socket (2).
+It must be bound to a local address by calling
+.BR bind (2)
+before any messages can be sent or received. This will also attach the
+socket to a specific transport, based on the type of interface the
+local address is attached to. From that point on, the socket can only
+reach destinations which are available through this transport.
+.PP
+For instance, when binding to the address of an Infiniband interface
+such as
+.BR ib0 ,
+the socket will use the Infiniband transport. If RDS is not able
+to associate a transport with the given address, it will return
+.BR EADDRNOTAVAIL .
+.PP
+An RDS socket can only be bound to one address and only one socket can
+be bound to a given address/port pair. If no port is specified in the
+binding address then an unbound port is selected at random.
+.PP
+RDS does not allow the application to bind a previously bound socket
+to another address. Binding to the wildcard address
+.B INADDR_ANY
+is not permitted either.
+.\"------------------------------------------------------------------
+.SS Connecting
+The default mode of operation for RDS is to use unconnected sockets,
+and specify a destination address as an argument to
+.BR sendmsg .
+However, RDS allows sockets to be connected to a remote end point using
+.BR connect (2).
+If a socket is connected, calling
+.BR sendmsg
+without specifying a destination address will use the previously given
+remote address.
+.\"------------------------------------------------------------------
+.SS Congestion Control
+RDS does not have explicit congestion control like common
+streaming protocols such as TCP. However, sockets have two queue limits
+associated with them; the send queue size and the receive queue size.
+Messages are accounted based on the number of bytes of payload.
+.PP
+The send queue size limits how much data local processes can queue on
+a local socket (see the following section). If that limit is exceeded,
+the kernel will not accept further messages until the queue is drained
+and messages have been delivered to and acknowledged by the remote host.
+.PP
+The receive queue size limits how much data RDS will put on the receive
+queue of a socket before marking the socket as
+.IR congested .
+When a socket becomes congested, RDS will send a
+.I congestion map update
+to the other participating hosts, who are then expected to stop sending
+more messages to this port.
+.PP
+There is a timing window during which a remote host can still continue
+to send messages to a congested port; RDS solves this by accepting
+these messages even if the socket's receive queue is already over
+the limit.
+.PP
+As the application pulls incoming messages off the receive queue using
+.BR recvmsg (2),
+the number of bytes on the receive queue will eventually
+drop below the receive queue size, at which point the port is then
+marked uncongested, and another congestion update is sent to all
+participating hosts. This tells them to allow applications to send
+additional messages to this port.
+.PP
+A given RDS socket has limited transmit and receive buffer space.
+The default values for the send and receive buffer size
+are taken from
+the system wide socket buffer sizes set in the
+.B wmem_default
+and
+.B rmem_default
+sysctls, respectively. They can be tuned by the application through the
+.B SO_SNDBUF
+and
+.B SO_RCVBUF
+socket options.
+.PP
+.\"------------------------------------------------------------------
+.SS Blocking Behavior
+The
+.BR sendmsg (2)
+and
+.BR recvmsg (2)
+calls can block in a variety of situations.
+Whether a call blocks or returns with an error depends on the non-blocking
+setting of the file descriptor and the
+.B MSG_DONTWAIT
+message flag. If the file descriptor is set to blocking mode (which is the
+default), and the
+.B MSG_DONTWAIT
+flag is
+.I not
+given, the call will block.
+.PP
+In addition, the
+.B SO_SNDTIMEO
+and
+.B SO_RCVTIMEO
+socket options can be used to specify a timeout (in seconds) after
+which the call will abort waiting, and return an error. The default
+timeout is 0, which tells RDS to block indefinitely.
+.\"------------------------------------------------------------------
+.SS Message Transmission
+Messages may be sent using
+.BR sendmsg (2)
+once the RDS socket is bound. Message length cannot exceed 4 gigabytes
+as the wire protocol uses an unsigned 32 bit integer to express the
+message length.
+.PP
+RDS does not support out of band data. Applications are allowed to
+send to unicast addresses only; broadcast or multicast are not
+supported.
+.PP
+A successful
+.BR sendmsg (2)
+call puts the message in the socket's transmit queue where it will
+remain until either the destination acknowledges that the message is
+no longer in the network or the application removes the message from
+the send queue.
+.PP
+Messages can be removed from the send queue with the
+RDS_CANCEL_SENT_TO socket option described below.
+.PP
+While a message is in the transmit queue its payload bytes are accounted for.
+If an attempt is made to send a message while there is not sufficient
+room on the transmit queue, the call will either block or return
+.BR EAGAIN .
+.PP
+When trying to send to a destination that is marked congested (see above),
+the call will either block or return
+.BR ENOBUFS .
+.PP
+A message sent with no payload bytes will not consume any space in the
+destination's send buffer but will result in a message receipt on the
+destination. The receiver will not get any payload data but will be able
+to see the sender's address.
+.PP
+Messages sent to a port to which no socket is bound will be silently
+discarded by the destination host. No error messages are reported
+to the sender.
+.\"------------------------------------------------------------------
+.SS Message Receipt
+Messages may be received with
+.BR recvmsg (2)
+on an RDS socket once it is bound to a source address. RDS will return
+messages in-order, i.e. messages from the same sender will arrive in
+the same order in which they were sent.
+.PP
+The address of the sender will be returned in the
+.B sockaddr_in
+structure pointed to by the
+.B msg_name
+field, if set.
+.PP
+If the
+.B MSG_PEEK
+flag is given, the first message on the receive queue is returned without
+removing it from the queue.
+.PP
+The memory consumed by messages waiting for delivery does not limit
+the number of messages that can be queued for receive. RDS does attempt
+to perform congestion control as described in the section above.
+.PP
+If the length of the message exceeds the size of the buffer provided to
+.BR recvmsg (2),
+then the remainder of the bytes in the message are discarded and the
+.BR MSG_TRUNC
+flag is set in the msg_flags field. In this truncating case
+.BR recvmsg (2)
+will still return the number of bytes copied, not the length of the entire message.
+If
+.BR MSG_TRUNC
+is set in the flags argument to
+.BR recvmsg (2),
+then it will return the number of bytes in the entire message. Thus one
+can examine the size of the next message in the receive queue without incurring
+a copying overhead by providing a zero length buffer and setting
+.BR MSG_PEEK " and " MSG_TRUNC
+in the flags argument.
+.PP
+The sending address of a zero-length message will still be provided in the
+.B msg_name
+field.
+.\"------------------------------------------------------------------
+.SS Control Messages
+RDS uses control messages (a.k.a. ancillary data) through the
+.B msg_control
+and
+.B msg_controllen
+fields in
+.BR sendmsg (2)
+and
+.BR recvmsg (2).
+Control messages generated by RDS have a
+.BR cmsg_level
+value of
+.BR sol_rds .
+Most control messages are related to the zerocopy interface added in
+RDS version 3, and are described in
+.BR rds-rdma (7).
+.PP
+The only exception is the
+.BR RDS_CMSG_CONG_UPDATE
+message, which is described in the following section.
+.\"------------------------------------------------------------------
+.SS Polling
+RDS supports the
+.BR poll (2)
+interface in a limited fashion.
+.B POLLIN
+is returned when there is a message (either a proper RDS message,
+or a control message) waiting in the socket's receive queue.
+.B POLLOUT
+is always returned while there is room on the socket's send queue.
+.PP
+Sending to congested ports requires special handling. When an application
+tries to send to a congested destination, the system call will return
+.BR ENOBUFS .
+However, it cannot poll for
+.BR POLLOUT ,
+as there is probably still room on the transmit queue, so the call to
+.BR poll (2)
+would return immediately, even though the destination is still congested.
+.PP
+There are two ways of dealing with this situation. The first is to
+simply poll for
+.BR POLLIN .
+By default, a process sleeping in
+.BR poll (2)
+is always woken up when the congestion map is updated,
+and thus the application can retry any previously congested
+sends.
+.PP
+The second option is explicit congestion monitoring, which
+gives the application more fine-grained control.
+.PP
+With explicit monitoring, the application polls for
+.B POLLIN
+as before, and additionally uses the
+.BR RDS_CONG_MONITOR
+socket option to install a 64bit mask value in the socket, where each
+bit corresponds to a group of ports. When a congestion update arrives,
+RDS checks the set of ports that became uncongested against the bit mask
+installed in the socket. If they overlap, a control message is
+enqueued on the socket, and the application is woken up. When it calls
+.BR recvmsg (2),
+it will be given the control message containing the bitmap.
+.PP
+The congestion monitor bitmask can be set and queried using
+.BR setsockopt (2)
+with
+.BR RDS_CONG_MONITOR ,
+and a pointer to the 64bit mask variable.
+.PP
+Congestion updates are delivered to the application via
+.B RDS_CMSG_CONG_UPDATE
+control messages. These control messages are always delivered
+by themselves (or possibly with additional control messages), but never
+along with a RDS data message. The
+.BR cmsg_data
+field of the control message is an 8 byte datum containing the
+64bit mask value.
+.PP
+Applications can use the following macros to test for and set bits
+in the bitmask:
+.PP
+.nf
+#define RDS_CONG_MONITOR_SIZE 64
+#define RDS_CONG_MONITOR_BIT(port) (((unsigned int) port) % RDS_CONG_MONITOR_SIZE)
+#define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port))
+.fi
+.PP
+.\"------------------------------------------------------------------
+.SS Canceling Messages
+An application can cancel (flush) messages from the send queue using
+the
+.BR RDS_CANCEL_SENT_TO
+socket option with
+.BR setsockopt (2).
+This call takes an optional
+.B sockaddr_in
+address structure as argument. If given, only messages to the destination
+specified by this address are discarded. If no address is given, all
+pending messages are discarded.
+.PP
+Note that this affects messages that have not yet been transmitted
+as well as messages that have been transmitted, but for which no
+acknowledgment from the remote host has been received yet.
+.\"------------------------------------------------------------------
+.SS Reliability
+If
+.BR sendmsg (2)
+succeeds, RDS guarantees that the message will be visible to
+.BR recvmsg (2)
+on a socket bound to the destination address as long as that
+destination socket remains open.
+.PP
+If there is no socket bound on the destination, the message is
+silently dropped. If the sending RDS can't be sure that there is no
+socket bound then it will try to send the message indefinitely until it
+can be sure or the sent message is canceled.
+.PP
+If a socket is closed then all pending sent messages on the socket are
+canceled and may or may not be seen by the receiver.
+.PP
+The RDS_CANCEL_SENT_TO socket option can be used to cancel all pending
+messages to a given destination.
+.PP
+If a receiving socket is closed with pending messages then the sender
+considers those messages as having left the network and will not
+retransmit them.
+.PP
+A message will only be seen by
+.BR recvmsg (2)
+once, unless
+.B MSG_PEEK
+was specified. Once the message has been delivered it is removed
+from the sending socket's transmit queue.
+.PP
+All messages sent from the same socket to the same destination will
+be delivered in the order they're sent. Messages sent from different
+sockets, or to different destinations, may be delivered in any order.
+.\"------------------------------------------------------------------
+.SH SYSCTL VALUES
+These parameters may only be accessed through their files in
+.BR /proc/sys/net/rds .
+Access through
+.BR sysctl (2)
+is not supported.
+.TP
+.B pf_rds
+This file contains the string representation of the protocol family
+constant passed to
+.BR socket (2)
+to create a new RDS socket.
+.TP
+.B sol_rds
+This file contains the string representation of the socket level parameter
+that is passed to
+.BR getsockopt (2)
+and
+.BR setsockopt (2)
+to manipulate RDS socket options.
+.TP
+.BR max_unacked_bytes " and " max_unacked_packets
+These parameters are used to tune the generation of acknowledgements. By
+default, the system receiving RDS messages does not send back explicit
+acknowledgements unless it transmits a message of its own (in which
+case the ACK is piggybacked onto the outgoing message), or when the sending
+system requests an ACK.
+.IP
+However, the sender needs to see an ACK from time to time so that it
+can purge old messages from the send queue. The unacked bytes and
+packet counters are used to keep track of how much data has been
+sent without requesting an ACK. The default is to request an acknowledgement
+every 16 packets, or every 16 MB, whichever comes first.
+.TP
+.BR reconnect_delay_min_ms " and " reconnect_delay_max_ms
+RDS uses host-to-host connections to transport RDS messages (both for the TCP
+and the Infiniband transport). If this connection breaks, RDS will try to
+re-establish the connection. Because this reconnect may be triggered by
+both hosts at the same time and fail, RDS uses a random backoff before attempting
+a reconnect. These two parameters specify the minimum and maximum delay in
+milliseconds. The default values are 1 and 1000, respectively.
+.SH SEE ALSO
+.BR rds-rdma (7),
+.BR socket (2),
+.BR bind (2),
+.BR sendmsg (2),
+.BR recvmsg (2),
+.BR getsockopt (2),
+.BR setsockopt (2).
diff --git a/rdstool.h b/rdstool.h
new file mode 100644
index 0000000..cc7988f
--- /dev/null
+++ b/rdstool.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tools header stuff
+ */
+
+#ifndef __RDS_TOOL_H
+#define __RDS_TOOL_H
+
+#include <netinet/in.h>
+
+#include "kernel-list.h"
+#include "pfhack.h"
+
+#ifndef AF_RDS
+# define AF_RDS OFFICIAL_PF_RDS
+#endif
+#ifndef PF_RDS
+# define PF_RDS AF_RDS
+#endif
+#ifndef SOL_RDS
+# define SOL_RDS OFFICIAL_SOL_RDS
+#endif
+
+#define RDS_TOOL_BASE_OPTS ":s:m:f:i:-:vqhV"
+#define RDS_SINK_OPTS
+#define RDS_GEN_OPTS "d:b:l:"
+
+#define RDS_DEFAULT_MSG_SIZE 4096
+
+/*
+ * printf-style output to stream f, emitted only when the global
+ * verbose level is at least lvl.
+ */
+#define verbosef(lvl, f, fmt, ...)				\
+	do {							\
+		if (verbose >= (lvl))				\
+			fprintf((f), fmt, ##__VA_ARGS__);	\
+	} while (0)
+
+struct rds_endpoint { /* one endpoint: a name, an IPv4 socket address and a descriptor */
+ struct list_head re_item; /* list linkage; presumably chains endpoints onto rds_context.rc_daddrs -- confirm */
+ char *re_name; /* presumably the endpoint as spelled on the command line -- confirm */
+ struct sockaddr_in re_addr; /* IPv4 socket address of this endpoint */
+ int re_fd; /* presumably the descriptor opened for this endpoint -- confirm */
+};
+
+struct rds_context { /* per-run state shared by the tools; handed to parse_options(), rds_bind() and dup_file() */
+ struct rds_endpoint *rc_saddr; /* presumably the local (source) endpoint -- confirm */
+ struct list_head rc_daddrs; /* list head; presumably of destination endpoints -- confirm */
+ const char *rc_filename; /* presumably the file given with -f -- confirm */
+ uint32_t rc_msgsize; /* presumably the message size in bytes (cf. RDS_DEFAULT_MSG_SIZE) -- confirm */
+ uint64_t rc_total; /* presumably a total byte count -- confirm */
+};
+
+/* Set by parse_options() */
+extern char *progname;
+extern unsigned int verbose;
+
+extern int parse_options(int argc, char *argv[], const char *opts,
+ struct rds_context *ctxt);
+extern int rds_bind(struct rds_context *ctxt);
+extern int dup_file(struct rds_context *ctxt, int fd, int flags);
+extern int setup_signals(void);
+extern int runningp(void);
+
+/* stats.c */
+extern int stats_init(int delay);
+extern void stats_extended(int extendedp);
+extern void stats_start(void);
+extern void stats_print(void);
+extern void stats_total(void);
+
+extern void stats_add_recv(uint64_t bytes);
+extern void stats_add_send(uint64_t bytes);
+extern uint64_t stats_get_send(void);
+extern void stats_add_read(uint64_t bytes);
+extern void stats_add_write(uint64_t bytes);
+
+
+/* Provided by C files with main() */
+extern void print_usage(int rc);
+extern void print_version(void);
+#endif /* __RDS_TOOL_H */
diff --git a/stap/README b/stap/README
new file mode 100644
index 0000000..d74f0fb
--- /dev/null
+++ b/stap/README
@@ -0,0 +1,15 @@
+SystemTap script for RDS
+
+SystemTap: http://sourceware.org/systemtap/
+SystemTap wiki: http://sourceware.org/systemtap/wiki
+
+To use SystemTap for tracing RDS, please ensure you
+have debugging symbols available for both your installed kernel
+as well as RDS module. These usually take the form of
+*-debuginfo RPMs, and may be available via your distro's
+update repository, a distro repository disabled by
+default, or via your distro's website.
+
+Please send any comments or improvement patches to
+rds-devel at oss.oracle.com.
+
diff --git a/stap/rds.stp b/stap/rds.stp
new file mode 100644
index 0000000..4baaf0e
--- /dev/null
+++ b/stap/rds.stp
@@ -0,0 +1,35 @@
+/*
+probe module("rds").function("rds_*")
+{
+ printf("RDS %s\n", pp())
+}
+*/
+
+/* Statistics aggregate fed with the return values of the probed functions. */
+global reads
+
+/* Put one sample in up front; presumably so the histogram is never empty. */
+probe begin
+{
+	reads <<< 0
+}
+
+/*
+ * Record what rds_recvmsg() returned.  $return is the probed function's
+ * return value; a bare "ret" would be an unassigned script variable.
+ */
+probe module("rds").function("rds_recvmsg").return
+{
+	reads <<< $return
+}
+
+/* Record what rds_send_pong() returned, into the same aggregate. */
+probe module("rds").function("rds_send_pong").return
+{
+	reads <<< $return
+	//println(caller())
+}
+
+/* Print a logarithmic histogram of the collected values every 5 seconds. */
+probe timer.sec(5)
+{
+	println("RDS bytes received")
+	print(@hist_log(reads))
+}
+
+probe end
+{
+	printf("end!\n")
+}
diff --git a/stats.c b/stats.c
new file mode 100644
index 0000000..daaabc8
--- /dev/null
+++ b/stats.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stats.c - Print stats at an interval
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <signal.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <time.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <arpa/inet.h>
+
+#include "kernel-list.h"
+#include "rdstool.h"
+
+static int stats_delay = 0; /* Delay in seconds */
+static int print_extended = 0; /* Print read/write stats? */
+/*
+ * Set to 1 by the SIGALRM handler and cleared by stats_arm().  An object
+ * that is written from a signal handler and read by the main program must
+ * be a volatile sig_atomic_t; without "volatile" the compiler is allowed
+ * to cache the value and never observe the handler's store.
+ */
+static volatile sig_atomic_t time_to_print = 0;
+
+/*
+ * Counters kept by the stats code.  Every quantity comes as a pair: a
+ * running total and an "_interval" twin that clear_interval() resets to
+ * zero.  Send/recv track both bytes and packets (see inc_net_stat());
+ * read/write track bytes only (see inc_io_stat()).
+ */
+struct rds_tool_stats {
+ uint64_t rs_send_bytes;
+ uint64_t rs_send_bytes_interval;
+ uint64_t rs_send_packets;
+ uint64_t rs_send_packets_interval;
+ uint64_t rs_recv_bytes;
+ uint64_t rs_recv_bytes_interval;
+ uint64_t rs_recv_packets;
+ uint64_t rs_recv_packets_interval;
+ uint64_t rs_read_bytes;
+ uint64_t rs_read_bytes_interval;
+ uint64_t rs_write_bytes;
+ uint64_t rs_write_bytes_interval;
+} tool_stats;
+
+/*
+ * inc_net_stat(type, val) - account one packet of "val" bytes.
+ *
+ * "type" is pasted into the field names (send or recv).  Both the running
+ * total and the per-interval counter are bumped.  Note that "val" is
+ * expanded twice.
+ */
+#define inc_net_stat(type, val) do {				\
+	tool_stats.rs_##type##_packets++;			\
+	tool_stats.rs_##type##_packets_interval++;		\
+	tool_stats.rs_##type##_bytes += (val);			\
+	tool_stats.rs_##type##_bytes_interval += (val);		\
+} while (0)
+
+/*
+ * inc_io_stat(type, val) - add "val" bytes to the read or write counters.
+ *
+ * "type" is pasted into the field names (read or write).  Only byte
+ * counts are kept for I/O; "val" is expanded twice.
+ */
+#define inc_io_stat(type, val) do {				\
+	tool_stats.rs_##type##_bytes_interval += (val);		\
+	tool_stats.rs_##type##_bytes += (val);			\
+} while (0)
+
+/* clear_interval() - zero every per-interval counter; totals are kept. */
+#define clear_interval() do {					\
+	tool_stats.rs_send_bytes_interval = 0;			\
+	tool_stats.rs_send_packets_interval = 0;		\
+	tool_stats.rs_recv_bytes_interval = 0;			\
+	tool_stats.rs_recv_packets_interval = 0;		\
+	tool_stats.rs_read_bytes_interval = 0;			\
+	tool_stats.rs_write_bytes_interval = 0;			\
+} while (0)
+
+/*
+ * Signal handler installed for SIGALRM by setup_alarm(): it only raises
+ * the time_to_print flag; the printing itself happens in stats_print().
+ */
+static void handler(int signum)
+{
+	(void)signum;	/* the signal number is not needed */
+
+	time_to_print = 1;
+}
+
+/*
+ * Install handler() as the SIGALRM handler, with an empty signal mask
+ * and no flags.
+ *
+ * Returns 0 on success.  On failure the error is reported through
+ * verbosef() and the negated errno value is returned.
+ */
+static int setup_alarm(void)
+{
+	struct sigaction sa;
+	int err;
+
+	sa.sa_flags = 0;
+	sa.sa_handler = handler;
+	sigemptyset(&sa.sa_mask);
+
+	if (sigaction(SIGALRM, &sa, NULL) == 0)
+		return 0;
+
+	/* Capture errno before anything else gets a chance to change it. */
+	err = errno;
+	verbosef(0, stderr,
+		 "%s: Unable to initialize timer: %s\n",
+		 progname, strerror(err));
+
+	return -err;
+}
+
+/* Add "bytes" to the total and per-interval read byte counters. */
+void stats_add_read(uint64_t bytes)
+{
+ inc_io_stat(read, bytes);
+}
+
+/* Add "bytes" to the total and per-interval write byte counters. */
+void stats_add_write(uint64_t bytes)
+{
+ inc_io_stat(write, bytes);
+}
+
+/*
+ * Account one sent packet: adds "bytes" to the send byte counters and 1
+ * to the send packet counters (total and per-interval).
+ */
+void stats_add_send(uint64_t bytes)
+{
+ inc_net_stat(send, bytes);
+}
+
+/* Return the running total of bytes sent (not the per-interval value). */
+uint64_t stats_get_send(void)
+{
+ return tool_stats.rs_send_bytes;
+}
+
+/*
+ * Account one received packet: adds "bytes" to the recv byte counters and
+ * 1 to the recv packet counters (total and per-interval).
+ */
+void stats_add_recv(uint64_t bytes)
+{
+ inc_net_stat(recv, bytes);
+}
+
+/*
+ * Clear the time_to_print flag and request a SIGALRM stats_delay seconds
+ * from now.
+ */
+static void stats_arm(void)
+{
+ /*
+  * NOTE(review): the flag is presumably cleared before alarm() so that
+  * a signal delivered right after arming is not wiped out -- confirm.
+  */
+ time_to_print = 0;
+ alarm(stats_delay);
+}
+
+/*
+ * stats_init - remember the reporting interval and prepare the timer.
+ * @delay: interval in seconds; 0 leaves stats disabled.
+ *
+ * Returns 0 when stats are disabled or the SIGALRM handler was installed,
+ * otherwise the negative errno value from setup_alarm().
+ */
+int stats_init(int delay)
+{
+	stats_delay = delay;
+
+	return stats_delay ? setup_alarm() : 0;
+}
+
+/*
+ * stats_extended - switch the read/write columns on or off.
+ * @extendedp: any non-zero value enables them; stored as exactly 0 or 1.
+ */
+void stats_extended(int extendedp)
+{
+	print_extended = extendedp ? 1 : 0;
+}
+
+/*
+ * stats_start - print the column headers and start the first interval.
+ *
+ * Does nothing when stats are disabled (stats_delay == 0).  The headers
+ * are emitted through verbosef() at level 1; afterwards the alarm is
+ * armed via stats_arm().
+ */
+void stats_start(void)
+{
+	if (!stats_delay)
+		return;
+
+	/*
+	 * No newline after the first four headers: the optional extended
+	 * headers belong on the same line, and the line is terminated
+	 * exactly once below.  This matches the row layout produced by
+	 * stats_output(); a newline here would push the extended headers
+	 * onto their own line and leave a blank line otherwise.
+	 */
+	verbosef(1, stderr,
+		 "%19s %19s %19s %19s",
+		 "Bytes sent/s", "Packets sent/s",
+		 "Bytes recv/s", "Packets recv/s");
+	if (print_extended)
+		verbosef(1, stderr, " %19s %19s",
+			 "Bytes read/s", "Bytes written/s");
+	verbosef(1, stderr, "\n");
+
+	stats_arm();
+}
+
+/*
+ * Print one row of per-second rates: each interval counter divided by
+ * the interval length.  The read/write columns are appended only when
+ * extended output is enabled.  The caller must ensure stats_delay is
+ * non-zero, since it is used as the divisor.
+ */
+static void stats_output(void)
+{
+	const struct rds_tool_stats *ts = &tool_stats;
+	/* Same conversion the mixed uint64_t / int division would apply. */
+	const uint64_t secs = stats_delay;
+
+	verbosef(0, stderr,
+		 "%19"PRIu64" %19"PRIu64" %19"PRIu64" %19"PRIu64,
+		 ts->rs_send_bytes_interval / secs,
+		 ts->rs_send_packets_interval / secs,
+		 ts->rs_recv_bytes_interval / secs,
+		 ts->rs_recv_packets_interval / secs);
+
+	if (print_extended) {
+		verbosef(0, stderr, " %19"PRIu64" %19"PRIu64,
+			 ts->rs_read_bytes_interval / secs,
+			 ts->rs_write_bytes_interval / secs);
+	}
+
+	verbosef(0, stderr, "\n");
+}
+
+/*
+ * stats_print - emit a stats row if one is due.
+ *
+ * Returns immediately unless stats are enabled and the SIGALRM handler
+ * has flagged that the interval elapsed.  Otherwise prints the rates,
+ * resets the per-interval counters and arms the next alarm.
+ */
+void stats_print(void)
+{
+	if (!stats_delay || !time_to_print)
+		return;
+
+	stats_output();
+	clear_interval();
+	stats_arm();
+}
+
+/*
+ * stats_total - print the cumulative counters under a "Total:" heading.
+ *
+ * Silent when stats are disabled.  The read/write totals are appended
+ * only when extended output is enabled.
+ */
+void stats_total(void)
+{
+	const struct rds_tool_stats *ts = &tool_stats;
+
+	if (stats_delay == 0)
+		return;
+
+	verbosef(0, stderr,
+		 "Total:\n"
+		 "%19"PRIu64" %19"PRIu64" %19"PRIu64" %19"PRIu64,
+		 ts->rs_send_bytes,
+		 ts->rs_send_packets,
+		 ts->rs_recv_bytes,
+		 ts->rs_recv_packets);
+
+	if (print_extended) {
+		verbosef(0, stderr, " %19"PRIu64" %19"PRIu64,
+			 ts->rs_read_bytes,
+			 ts->rs_write_bytes);
+	}
+
+	verbosef(0, stderr, "\n");
+}
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ofed/rds-tools.git
More information about the Pkg-ofed-commits
mailing list