[Pkg-ofed-commits] [libpsm2] 01/03: New upstream version 10.3-37

Brian Smith bsmith-guest at moszumanska.debian.org
Thu Dec 7 22:30:00 UTC 2017


This is an automated email from the git hooks/post-receive script.

bsmith-guest pushed a commit to branch master
in repository libpsm2.

commit 03e1122f5f6edcf34ebc789c45ccd51aabb72538
Author: Brian T. Smith <bsmith at systemfabricworks.com>
Date:   Thu Dec 7 10:33:37 2017 -0600

    New upstream version 10.3-37
---
 COMMIT                 |  2 +-
 Makefile               | 38 +++++++++++++--------
 buildflags.mak         | 32 +++++++++--------
 compat/Makefile        |  2 +-
 compat/buildflags.mak  |  7 ++--
 libpsm2.spec.in        |  4 ++-
 libuuid/Makefile       |  7 ++--
 makesrpm.sh            |  2 +-
 opa/Makefile           | 10 +++---
 opa/opa_sysfs.c        |  1 +
 opa/opa_time.c         | 15 +++++++-
 psm.c                  | 93 +++++++++++++++++++-------------------------------
 psm2_am.h              | 57 ++++++++++++++++++++++++++++++-
 psm2_mq.h              | 56 +++++++++++++++++++++++++++---
 psm_am.c               | 73 +++++++++++++++++++++++++++++++++------
 psm_am_internal.h      | 24 ++++++++++---
 psm_context.c          | 15 ++++----
 psm_ep.h               |  2 +-
 psm_ep_connect.c       |  2 +-
 psm_mq.c               | 32 +++++++++++++++++
 psm_mq_internal.h      |  8 ++++-
 psm_perf.c             | 80 +++++++++++++++++++++++++------------------
 psm_perf.h             | 16 ++++-----
 psm_user.h             | 32 ++++++++++-------
 psm_utils.c            | 66 +++++++++++++++++++++++------------
 psm_utils.h            |  5 +--
 ptl_am/Makefile        |  5 ++-
 ptl_am/ptl.c           | 18 ++++++++--
 ptl_ips/Makefile       |  5 ++-
 ptl_ips/ips_path_rec.c |  2 +-
 ptl_ips/ips_proto.c    | 38 +++++++++++++++++++++
 ptl_ips/ips_proto_am.c | 20 +++++++++--
 ptl_ips/ips_recvhdrq.c | 19 +++++++++++
 ptl_ips/ips_tidcache.c | 11 ++++--
 ptl_self/Makefile      |  5 ++-
 ptl_self/ptl.c         | 38 +++++++++++++++++----
 rpm_release_extension  |  2 +-
 37 files changed, 607 insertions(+), 237 deletions(-)

diff --git a/COMMIT b/COMMIT
index d81e4f3..b55e71f 100644
--- a/COMMIT
+++ b/COMMIT
@@ -1 +1 @@
-f8df7f0de7139df384ea8b94dc8567885bf76070
\ No newline at end of file
+295d2ac9ced1415c309531cbb478eab943e174e8
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 8db7c1d..dc11046 100644
--- a/Makefile
+++ b/Makefile
@@ -84,16 +84,15 @@ override OUTDIR := $(shell readlink -m $(OUTDIR))
 endif
 endif
 
-LINKER_SCRIPT_FILE := ${OUTDIR}/psm2_linker_script.map
 
 PSM2_VERNO_MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
 PSM2_VERNO_MINOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MINOR.*0x\([0-9]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
 PSM2_LIB_MAJOR   := $(shell printf "%d" ${PSM2_VERNO_MAJOR})
 PSM2_LIB_MINOR   := $(shell printf "%d" `sed -n 's/^\#define.*PSM2_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h`)
+LINKER_SCRIPT_FILE = ${OUTDIR}/psm2_linker_script.map
 SOURCES_CHKSUM_FILES = Makefile buildflags.mak $(LINKER_SCRIPT_FILE) \
 		`find . -regex '\(.*\.h\|.*\.c\)' -not -path "./test/*" -not -path "./tools/*" -not -path "_revision.c" | sort`
 SOURCES_CHKSUM_VALUE = $(shell cat ${SOURCES_CHKSUM_FILES} | sha1sum | cut -d' ' -f 1)
-
 OPA_LIB_MAJOR := 4
 OPA_LIB_MINOR := 0
 
@@ -105,8 +104,16 @@ export OPA_LIB_MAJOR
 export OPA_LIB_MINOR
 export CCARCH ?= gcc
 export FCARCH ?= gfortran
+export AR ?= ar
 
 include $(top_srcdir)/buildflags.mak
+# We need to unexport these environs as during mock testing and normal calls,
+# if they are exported then during each submake they will be evaluated again.
+# This is costly and the LINKER_SCRIPT_FILE doesn't exist until after its
+# target rule runs.
+unexport SOURCES_CHKSUM_FILES
+unexport SOURCES_CHKSUM_VALUE
+unexport LINKER_SCRIPT_FILE
 INCLUDES += -I$(top_srcdir)
 
 ifneq (x86_64,$(arch))
@@ -272,18 +279,20 @@ all: outdir symlinks
 	@if [ ! -e $(HISTORY) ] || [ -z "`grep -E '^$(OUTDIR)$$' $(HISTORY)`" ]; then \
 		echo $(OUTDIR) >> $(HISTORY); \
 	fi
+	# Our buildflags.mak exports all variables, all are propagated to submakes.
 	@for subdir in $(SUBDIRS); do \
 		mkdir -p $(OUTDIR)/$$subdir; \
-		$(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir $(OPTIONS); \
+		$(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir; \
 	done
-	$(MAKE) -j $(nthreads) OUTDIR=$(OUTDIR) $(OPTIONS) $(OUTDIR)/${TARGLIB}.so
+	$(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.so
+	$(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.a
 	@mkdir -p $(OUTDIR)/compat
-	$(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat $(OPTIONS)
+	$(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat
 
 %_clean:
 	make OUTDIR=$* clean
 
-clean: linker_script_file_clean cleanlinks
+clean: cleanlinks
 	rm -rf ${OUTDIR}
 	@if [ -e $(HISTORY) ]; then \
 		grep -v -E "^$(OUTDIR)$$" $(HISTORY) > $(HISTORY)_tmp; \
@@ -294,12 +303,11 @@ clean: linker_script_file_clean cleanlinks
 	fi
 
 mock: OUTDIR := $(MOCK_OUTDIR)
-mock: OPTIONS = PSM2_MOCK_TESTING=1
 mock:
-	$(MAKE) OUTDIR=$(OUTDIR) OPTIONS=$(OPTIONS)
+	$(MAKE) OUTDIR=$(OUTDIR) PSM2_MOCK_TESTING=1
 
 debug: OUTDIR := $(DEBUG_OUTDIR)
-debug: OPTIONS = PSM_DEBUG=1
+debug: OPTIONS := PSM_DEBUG=1
 debug:
 	$(MAKE) OUTDIR=$(OUTDIR) OPTIONS=$(OPTIONS)
 
@@ -338,6 +346,8 @@ install: all
 	(cd ${DESTDIR}${INSTALL_LIB_TARG} ; \
 		ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \
 		ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so)
+	install -D $(OUTDIR)/${TARGLIB}.a \
+		${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.a
 	install -m 0644 -D psm2.h ${DESTDIR}/usr/include/psm2.h
 	install -m 0644 -D psm2_mq.h ${DESTDIR}/usr/include/psm2_mq.h
 	install -m 0644 -D psm2_am.h ${DESTDIR}/usr/include/psm2_am.h
@@ -501,16 +511,16 @@ $(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} $(LINKER_SCRIPT_FI
 	date -u -d@$${SOURCE_DATE_EPOCH:-$$(date +%s)} +'char psmi_hfi_build_timestamp[] ="%F %T%:z";' >> ${OUTDIR}/_revision.c
 	echo "char psmi_hfi_sources_checksum[] =\"${SOURCES_CHKSUM_VALUE}\";" >> ${OUTDIR}/_revision.c
 	echo "char psmi_hfi_git_checksum[] =\"`git rev-parse HEAD`\";" >> ${OUTDIR}/_revision.c
-	$(CC) -c $(BASECFLAGS) $(INCLUDES) ${OUTDIR}/_revision.c -o $(OUTDIR)/_revision.o
+	$(CC) -c $(CFLAGS) $(BASECFLAGS) $(INCLUDES) ${OUTDIR}/_revision.c -o $(OUTDIR)/_revision.o
 	$(CC) $(LINKER_SCRIPT) $(LDFLAGS) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared \
 		${${TARGLIB}-objs} $(OUTDIR)/_revision.o -Lopa $(LDLIBS)
 
+$(OUTDIR)/${TARGLIB}.a: $(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR}
+	$(AR) rcs $(OUTDIR)/${TARGLIB}.a ${${TARGLIB}-objs} $(OUTDIR)/_revision.o
+
 ${OUTDIR}/%.o: ${top_srcdir}/%.c
-	$(CC) $(CFLAGS) $(INCLUDES) -MMD -c $< -o $@
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -MMD -c $< -o $@
 
 $(LINKER_SCRIPT_FILE): psm2_linker_script_map.in
 	sed "s/_psm2_additional_globals_;/$(PSM2_ADDITIONAL_GLOBALS)/" \
 	     psm2_linker_script_map.in > ${OUTDIR}/psm2_linker_script.map
-
-linker_script_file_clean:
-	rm -f $(LINKER_SCRIPT_FILE)
diff --git a/buildflags.mak b/buildflags.mak
index 67593f5..f59958c 100644
--- a/buildflags.mak
+++ b/buildflags.mak
@@ -86,8 +86,9 @@ ASFLAGS += $(BASE_FLAGS)
 
 ifeq ($(PSM2_MOCK_TESTING),1)
 BASECFLAGS += -DPSM2_MOCK_TESTING=1
-# we skip the linker script for testing version, we want all symbols to be
-# reachable from outside the library
+unexport LINKER_SCRIPT
+# We skip the linker script for mock testing version, we want all symbols
+# to be reachable from outside the library
 else
 LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE)
 endif
@@ -178,14 +179,10 @@ endif
 
 BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE
 
-ifeq (${CCARCH},gcc)
-  BASECFLAGS += -funwind-tables
-endif
-
 ifneq (,${PSM_VALGRIND})
-  CFLAGS += -DPSM_VALGRIND
+  BASECFLAGS += -DPSM_VALGRIND
 else
-  CFLAGS += -DNVALGRIND
+  BASECFLAGS += -DNVALGRIND
 endif
 
 ASFLAGS += -g3 -fpic
@@ -193,18 +190,25 @@ ASFLAGS += -g3 -fpic
 BASECFLAGS += ${OPA_CFLAGS}
 
 ifeq (${CCARCH},icc)
-    BASECFLAGS += -O3 -g3 -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed,
-    CFLAGS += $(BASECFLAGS)
+    BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed,
     LDFLAGS += -static-intel
 else
 	ifeq (${CCARCH},gcc)
-	    CFLAGS += $(BASECFLAGS) -Wno-strict-aliasing -Wformat-security
+	    BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security
 	else
-	    ifeq (${CCARCH},gcc4)
-		CFLAGS += $(BASECFLAGS)
-	    else
+	    ifneq (${CCARCH},gcc4)
 		$(error Unknown compiler arch "${CCARCH}")
 	    endif # gcc4
 	endif # gcc
 endif # icc
 
+# We run export here to ensure all the above setup is in the environment
+# for sub makes. However, we exclude this during clean and distclean
+# to avoid resolution of some variables that don't need to be resolved
+# and avoid unnecessary missing file warnings during cleanup.
+ifneq ($(MAKECMDGOALS), clean)
+ifneq ($(MAKECMDGOALS), distclean)
+export
+endif
+endif
+
diff --git a/compat/Makefile b/compat/Makefile
index 092775f..996b7e9 100644
--- a/compat/Makefile
+++ b/compat/Makefile
@@ -76,7 +76,7 @@ install: all
 	install -D $(OUTDIR)/${COMPATLIB}.so.${MAJOR} ${DESTDIR}${COMPAT_LIB_TARG}/${COMPATLIB}.so.${MAJOR}
 
 $(OUTDIR)/%.o: $(compat_build_dir)/%.c
-	$(CC) $(CFLAGS) $(INCLUDES) -MMD -c $< -o $@
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -MMD -c $< -o $@
 
 $(OUTDIR)/${COMPATLIB}.so.${MAJOR}: ${${COMPATLIB}-objs}
 	$(CC) $(BASECFLAGS) $(LINKER_SCRIPT) $(LDFLAGS) -Wl,-soname=${COMPATLIB}.so.${MAJOR} -shared \
diff --git a/compat/buildflags.mak b/compat/buildflags.mak
index c677989..b448e4e 100644
--- a/compat/buildflags.mak
+++ b/compat/buildflags.mak
@@ -88,15 +88,12 @@ ASFLAGS += -g3 -fpic
 
 ifeq (${CCARCH},icc)
     BASECFLAGS += -O3 -g3
-    CFLAGS += $(BASECFLAGS)
     LDFLAGS += -static-intel
 else
 	ifeq (${CCARCH},gcc)
-	    CFLAGS += $(BASECFLAGS) -Wno-strict-aliasing
+	    BASECFLAGS += -Wno-strict-aliasing
 	else
-		ifeq (${CCARCH},gcc4)
-			CFLAGS += $(BASECFLAGS)
-		else
+		ifneq (${CCARCH},gcc4)
 			$(error Unknown compiler arch "${CCARCH}")
 		endif
 	endif
diff --git a/libpsm2.spec.in b/libpsm2.spec.in
index c5ddf62..7bd7836 100644
--- a/libpsm2.spec.in
+++ b/libpsm2.spec.in
@@ -73,7 +73,8 @@ Obsoletes: hfi1-psm < 1.0.0
 %package -n @RPM_NAME@@RPM_NAME_BASEEXT@
 %endif
 Summary: Intel PSM2 Libraries
-Provides: @RPM_NAME@
+Provides: @RPM_NAME@ = %{version}-%{release}
+Provides: @RPM_NAME@%{_isa} = %{version}-%{release}
 %if 0%{?suse_version}
 BuildRequires: libnuma-devel
 Requires: libnuma1
@@ -155,6 +156,7 @@ make %{?_smp_mflags}
 
 %files -n @RPM_NAME@-devel
 %{_libdir}/@TARGLIB@.so
+%{_libdir}/@TARGLIB@.a
 %{_includedir}/psm2.h
 %{_includedir}/psm2_mq.h
 %{_includedir}/psm2_am.h
diff --git a/libuuid/Makefile b/libuuid/Makefile
index aa3f5ac..2f5babe 100644
--- a/libuuid/Makefile
+++ b/libuuid/Makefile
@@ -55,8 +55,7 @@ OUTDIR = .
 
 this_srcdir := $(shell readlink -m .)
 top_srcdir := $(this_srcdir)/..
-include $(top_srcdir)/buildflags.mak
-CFLAGS += -DPSM_UUID=1 -Wno-unused-function
+BASECFLAGS += -DPSM_UUID=1 -Wno-unused-function
 INCLUDES += -I$(top_srcdir)
 
 ${TARGLIB}-objs := psm_uuid.o parse.o pack.o unpack.o unparse.o
@@ -70,10 +69,10 @@ IGNORE_DEP_TARGETS = clean
 all .DEFAULT: ${${TARGLIB}-objs}
 
 $(OUTDIR)/%.d: $(this_srcdir)/%.c
-	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
 
 $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
-	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
 
 clean:
 	@if [ -d $(OUTDIR) ]; then \
diff --git a/makesrpm.sh b/makesrpm.sh
index e673b35..5fc4939 100755
--- a/makesrpm.sh
+++ b/makesrpm.sh
@@ -113,7 +113,7 @@ while [ "$1" != "" ]; do
                         if [ -z "$1" ]; then
                             usage
                         fi
-                        $RPM_NAME_BASEEXT="$1"
+                        RPM_NAME_BASEEXT="$1"
                         export RPM_NAME_BASEEXT="$1"
                         ;;
         -r | -rpmname)  shift
diff --git a/opa/Makefile b/opa/Makefile
index d065429..97c51bc 100644
--- a/opa/Makefile
+++ b/opa/Makefile
@@ -59,10 +59,8 @@ MINOR := $(OPA_LIB_MINOR)
 
 this_srcdir := $(shell readlink -m .)
 top_srcdir := $(this_srcdir)/..
-include $(top_srcdir)/buildflags.mak
-BASECFLAGS += -D_GNU_SOURCE
-INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips
 
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips
 ifeq (${arch},x86_64)
 	PLATFORM_OBJ=opa_dwordcpy-x86_64-fast.o
 else
@@ -86,13 +84,13 @@ install: all
 	@echo "Nothing to do for install."
 
 $(OUTDIR)/%.d:  $(this_srcdir)/%.c
-	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
 
 $(OUTDIR)/%.d:  $(this_srcdir)/%.S
-	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
 
 $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
-	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
 
 $(OUTDIR)/%.o: $(this_srcdir)/%.S | ${DEPS}
 	$(CC) $(ASFLAGS) -c $< -o $@
diff --git a/opa/opa_sysfs.c b/opa/opa_sysfs.c
index f0cec91..00cc18a 100644
--- a/opa/opa_sysfs.c
+++ b/opa/opa_sysfs.c
@@ -266,6 +266,7 @@ static int hfi_sysfs_unit_open_for_node(uint32_t unit, int flags)
 
 	snprintf(buf, sizeof(buf), "%s/hfi1_%u/device/numa_node",
 		 dirname(path_copy), unit);
+	free(path_copy);
 	fd = open(buf, flags);
 	saved_errno = errno;
 
diff --git a/opa/opa_time.c b/opa/opa_time.c
index 1b636ed..272fdb0 100644
--- a/opa/opa_time.c
+++ b/opa/opa_time.c
@@ -69,6 +69,16 @@
 
 #include "opa_user.h"
 
+#ifdef min
+#undef min
+#endif
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#ifdef max
+#undef max
+#endif
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
 /* init the cycle counter to picosecs/cycle conversion automatically */
 /* at program startup, if it's using timing functions. */
 static void init_picos_per_cycle(void) __attribute__ ((constructor));
@@ -224,6 +234,7 @@ static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle)
 {
 	/* we only validate once */
 	uint32_t new_pico_per_cycle = old_pico_per_cycle;
+	uint32_t max_bet_new_old_pico, min_bet_new_old_pico;
 
 	char hostname[80];
 	gethostname(hostname, 80);
@@ -262,8 +273,10 @@ static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle)
 	}
 #endif
 
+	max_bet_new_old_pico = max(new_pico_per_cycle, old_pico_per_cycle);
+	min_bet_new_old_pico = min(new_pico_per_cycle, old_pico_per_cycle);
 	/* If there's no change (within a small range), just return the old one */
-	if (abs(new_pico_per_cycle - old_pico_per_cycle) < 5)
+	if ((max_bet_new_old_pico - min_bet_new_old_pico) < 5)
 		return old_pico_per_cycle;
 
 	if (hfi_timebase_isvalid(new_pico_per_cycle)) {
diff --git a/psm.c b/psm.c
index 16a2ceb..cd543d8 100644
--- a/psm.c
+++ b/psm.c
@@ -154,60 +154,34 @@ int psmi_cuda_initialize()
 		goto fail;
 	}
 
-
-	psmi_cuCtxGetCurrent = dlsym(psmi_cuda_lib, "cuCtxGetCurrent");
-	psmi_cuCtxSetCurrent = dlsym(psmi_cuda_lib, "cuCtxSetCurrent");
-	psmi_cuPointerGetAttribute = dlsym(psmi_cuda_lib, "cuPointerGetAttribute");
-	psmi_cuPointerSetAttribute = dlsym(psmi_cuda_lib, "cuPointerSetAttribute");
-
-	psmi_cudaGetDeviceCount = dlsym(psmi_cudart_lib, "cudaGetDeviceCount");
-	psmi_cudaGetDeviceProperties = dlsym(psmi_cudart_lib, "cudaGetDeviceProperties");
-	psmi_cudaGetDevice = dlsym(psmi_cudart_lib, "cudaGetDevice");
-	psmi_cudaSetDevice = dlsym(psmi_cudart_lib, "cudaSetDevice");
-	psmi_cudaStreamCreate = dlsym(psmi_cudart_lib, "cudaStreamCreate");
-	psmi_cudaDeviceSynchronize = dlsym(psmi_cudart_lib, "cudaDeviceSynchronize");
-	psmi_cudaStreamSynchronize = dlsym(psmi_cudart_lib, "cudaStreamSynchronize");
-	psmi_cudaEventCreate = dlsym(psmi_cudart_lib, "cudaEventCreate");
-	psmi_cudaEventDestroy = dlsym(psmi_cudart_lib, "cudaEventDestroy");
-	psmi_cudaEventQuery = dlsym(psmi_cudart_lib, "cudaEventQuery");
-	psmi_cudaEventRecord = dlsym(psmi_cudart_lib, "cudaEventRecord");
-	psmi_cudaEventSynchronize = dlsym(psmi_cudart_lib, "cudaEventSynchronize");
-	psmi_cudaMalloc = dlsym(psmi_cudart_lib, "cudaMalloc");
-	psmi_cudaHostAlloc = dlsym(psmi_cudart_lib, "cudaHostAlloc");
-	psmi_cudaFreeHost = dlsym(psmi_cudart_lib, "cudaFreeHost");
-	psmi_cudaMemcpy = dlsym(psmi_cudart_lib, "cudaMemcpy");
-	psmi_cudaMemcpyAsync = dlsym(psmi_cudart_lib, "cudaMemcpyAsync");
-
-	psmi_cudaIpcGetMemHandle = dlsym(psmi_cudart_lib, "cudaIpcGetMemHandle");
-	psmi_cudaIpcOpenMemHandle = dlsym(psmi_cudart_lib, "cudaIpcOpenMemHandle");
-	psmi_cudaIpcCloseMemHandle = dlsym(psmi_cudart_lib, "cudaIpcCloseMemHandle");
-
-	if (!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent ||
-	    !psmi_cuPointerGetAttribute || !psmi_cuPointerSetAttribute ||
-	    !psmi_cudaGetDeviceCount || !psmi_cudaGetDeviceProperties ||
-	    !psmi_cudaGetDevice || !psmi_cudaSetDevice ||
-	    !psmi_cudaStreamCreate ||
-	    !psmi_cudaDeviceSynchronize || !psmi_cudaStreamSynchronize ||
-	    !psmi_cudaEventCreate || !psmi_cudaEventDestroy ||
-	    !psmi_cudaEventQuery || !psmi_cudaEventRecord ||
-	    !psmi_cudaEventSynchronize ||
-	    !psmi_cudaMalloc || !psmi_cudaHostAlloc || !psmi_cudaFreeHost ||
-	    !psmi_cudaMemcpy || !psmi_cudaMemcpyAsync || !psmi_cudaIpcGetMemHandle ||
-	    !psmi_cudaIpcOpenMemHandle || !psmi_cudaIpcCloseMemHandle) {
-		_HFI_ERROR
-			("Unable to resolve symbols in CUDA libraries.\n");
-		goto fail;
-	}
-
-	if (cuda_runtime_version > 7000) {
-		psmi_cudaStreamCreateWithFlags = dlsym(psmi_cudart_lib,
-						       "cudaStreamCreateWithFlags");
-		if (!psmi_cudaStreamCreateWithFlags) {
-			_HFI_ERROR
-				("Unable to resolve symbols in CUDA libraries.\n");
-			goto fail;
-		}
-	}
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetCurrent);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute);
+
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaGetDeviceCount);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaGetDeviceProperties);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaGetDevice);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaSetDevice);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaStreamCreate);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaDeviceSynchronize);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaStreamSynchronize);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaEventCreate);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaEventDestroy);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaEventQuery);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaEventRecord);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaEventSynchronize);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaMalloc);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaHostAlloc);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaFreeHost);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaMemcpy);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaMemcpyAsync);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaIpcGetMemHandle);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaIpcOpenMemHandle);
+	PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaIpcCloseMemHandle);
+
+	if (cuda_runtime_version > 7000)
+		PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaStreamCreateWithFlags);
 
 	/* Check if all devices support Unified Virtual Addressing. */
 	PSMI_CUDA_CALL(cudaGetDeviceCount, &num_devices);
@@ -243,11 +217,13 @@ psm2_error_t __psm2_init(int *major, int *minor)
 	psmi_log_initialize();
 
 	PSM2_LOG_MSG("entering");
-#ifdef RDPMC_PERF_FRAMEWORK
-	psmi_rdpmc_perf_framework_init();
-#endif /* RDPMC_PERF_FRAMEWORK */
 
+	/* When PSM_PERF is enabled, the following code causes the
+	   PMU to be programmed to measure instruction cycles of the
+	   TX/RX speedpaths of PSM. */
 	GENERIC_PERF_INIT();
+	GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX");
+	GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX");
 
 	if (psmi_isinit == PSMI_INITIALIZED)
 		goto update;
@@ -457,6 +433,9 @@ psm2_error_t __psm2_finalize(void)
 
 	PSMI_ERR_UNLESS_INITIALIZED(NULL);
 
+	/* When PSM_PERF is enabled, the following line causes the
+	   instruction cycles gathered in the current run to be dumped
+	   to stderr. */
 	GENERIC_PERF_DUMP(stderr);
 	ep = psmi_opened_endpoint;
 	while (ep != NULL) {
diff --git a/psm2_am.h b/psm2_am.h
index 1383fbb..f085ea0 100644
--- a/psm2_am.h
+++ b/psm2_am.h
@@ -161,7 +161,7 @@ struct psm2_amarg {
 
 /** @brief The AM handler function type
  *
- * psm2_am_handler_fm_t is the datatype for an AM handler. PSM2 AM will call-back
+ * psm2_am_handler_fn_t is the datatype for an AM handler. PSM2 AM will call-back
  * into an AM handler using this function prototype. The parameters and result
  * of these handler functions are described here.
  *
@@ -183,6 +183,32 @@ int (*psm2_am_handler_fn_t) (psm2_am_token_t token,
 			    psm2_amarg_t *args, int nargs,
 			    void *src, uint32_t len);
 
+/** @brief The AM handler function type with caller context
+ *
+ * psm2_am_handler_2_fn_t is the datatype for an AM handler that
+ * includes a user context. PSM2 AM will call-back into an AM handler using
+ * this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * @param[in] token This is an opaque token value passed into a handler.
+ *                  A request handler may send at most one reply back to the
+ *                  original requestor, and must pass this value as the token
+ *                  parameter to the psm2_am_reply_short() function. A reply
+ *                  handler is also passed a token value, but must not attempt
+ *                  to reply.
+ * @param[in] args A pointer to the arguments provided to this handler.
+ * @param[in] nargs The number of arguments.
+ * @param[in] src A pointer to the data payload provided to this handler.
+ * @param[in] len The length of the data payload in bytes.
+ * @param[in] hctx The user context pointer provided at handler registration.
+ *
+ * @returns 0 The handler should always return a result of 0.
+ */
+typedef
+int (*psm2_am_handler_2_fn_t) (psm2_am_token_t token,
+			    psm2_amarg_t *args, int nargs,
+			    void *src, uint32_t len, void *hctx);
+
 /** @brief Type for a completion call-back handler.
  *
  * A completion handler can be specified to give a call-back on the initiation
@@ -226,6 +252,35 @@ psm2_error_t psm2_am_register_handlers(psm2_ep_t ep,
 				     handlers, int num_handlers,
 				     int *handlers_idx);
 
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point.  The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for i in [0, num_handlers). These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ *                         handlers_idx arrays)
+ * @param[in] hctx Array of void* pointers to user contexts for identifying the
+ *                         target ep that registered these handlers.
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers_2(psm2_ep_t ep,
+				     const psm2_am_handler_2_fn_t *
+				     handlers, int num_handlers,
+				     void **hctx,
+				     int *handlers_idx);
+
 /** @brief Generate an AM request.
  *
  * This function generates an AM request causing an AM handler function to be
diff --git a/psm2_mq.h b/psm2_mq.h
index 6c23b10..b9cbf4e 100644
--- a/psm2_mq.h
+++ b/psm2_mq.h
@@ -334,9 +334,10 @@ psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
 psm2_error_t
 psm2_mq_finalize(psm2_mq_t mq);
 
-#define PSM2_MQ_TAG_ELEMENTS 3
+#define PSM2_MQ_TAG_ELEMENTS 4
 	/**< Represents the number of 32-bit tag elements in the psm2_mq_tag_t
-	 *   type. */
+	 *   type plus one extra element to keep alignment and padding
+	 *   as 16 bytes.  */
 
 /** @struct psm2_mq_tag
  ** @brief MQ Message tag
@@ -356,7 +357,11 @@ typedef
 //struct psm2_mq_tag {
 union psm2_mq_tag {
 //    union {
-		uint32_t tag[PSM2_MQ_TAG_ELEMENTS] __attribute__ ((aligned(16)));
+		uint32_t tag[PSM2_MQ_TAG_ELEMENTS]; /* No longer specifying
+						     * alignment as it makes
+						     * code break with newer
+						     * compilers. */
+
             /**< 3 x 32bit array representation of @ref psm2_mq_tag */
 		struct {
 			uint32_t tag0; /**< 1 of 3 uint32_t tag values */
@@ -403,7 +408,11 @@ struct psm2_mq_status2 {
 	/** Remote peer's epaddr */
 	psm2_epaddr_t msg_peer;
 	/** Sender's original message tag */
-	psm2_mq_tag_t msg_tag;
+	psm2_mq_tag_t msg_tag __attribute__ ((aligned(16)));/* Alignment added
+							     * to preserve the
+							     * layout as is
+							     * expected by
+							     * existent code */
 	/** Sender's original message length */
 	uint32_t msg_length;
 	/** Actual number of bytes transfered (receiver only) */
@@ -1081,6 +1090,45 @@ psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status_t *status);
 psm2_error_t
 psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status2_t *status);
 
+/** @brief Check and dequeue the first request entry from the completed queue.
+ *
+ * Function to atomically check and dequeue the first entry from the completed
+ * queue. It must be paired with function psm2_mq_req_free, which returns the
+ * request to PSM2 library.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[out] req PSM MQ Request handle, to be used for receiving the matched
+ *                  message.
+ *
+ * The following error codes are returned.
+ *
+ * @retval PSM2_OK The dequeue operation was successful and @c req is updated
+ *                 with a request ready for completion.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not successful,
+ *                            meaning that there are no further requests ready
+ *                            for completion. The contents of @c req remain
+ *                            unchanged.
+ */
+psm2_error_t
+psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *req);
+
+/** @brief Return the request to PSM2 library.
+ *
+ * Function returns the request previously obtained via psm2_mq_ipeek_dequeue
+ * to the PSM2 library.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] req PSM MQ Request handle to be returned to PSM2 library.
+ *                If @p req is NULL, no operation is performed.
+ *
+ * The following error codes are returned.
+ *
+ * @retval PSM2_OK Return of an object to PSM2 library pool was successful.
+ */
+psm2_error_t
+psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req);
+
 /** @brief Wait until a non-blocking request completes
  *
  * Function to wait on requests created from either preposted receive buffers
diff --git a/psm_am.c b/psm_am.c
index df193da..bef1a92 100644
--- a/psm_am.c
+++ b/psm_am.c
@@ -90,7 +90,7 @@ static void psmi_am_min_parameters(struct psm2_am_parameters *dest,
 psm2_error_t psmi_am_init_internal(psm2_ep_t ep)
 {
 	int i;
-	psm2_am_handler_fn_t *am_htable;
+	struct psm2_ep_am_handle_entry *am_htable;
 	struct psm2_am_parameters params;
 
 	psmi_am_parameters.max_handlers = INT_MAX;
@@ -115,15 +115,19 @@ psm2_error_t psmi_am_init_internal(psm2_ep_t ep)
 
 	ep->am_htable =
 	    psmi_malloc(ep, UNDEFINED,
-			sizeof(psm2_am_handler_fn_t) * PSMI_AM_NUM_HANDLERS);
+			sizeof(struct psm2_ep_am_handle_entry) * PSMI_AM_NUM_HANDLERS);
 	if (ep->am_htable == NULL)
 		return PSM2_NO_MEMORY;
 
-	am_htable = (psm2_am_handler_fn_t *) ep->am_htable;
-	for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++)
-		am_htable[i] = _ignore_handler;
+	am_htable = (struct psm2_ep_am_handle_entry *) ep->am_htable;
+	for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) {
+		am_htable[i].hfn = _ignore_handler;
+		am_htable[i].hctx = NULL;
+		am_htable[i].version = PSM2_AM_HANDLER_V2;
+	}
 
 	return PSM2_OK;
+
 }
 
 psm2_error_t
@@ -133,11 +137,15 @@ __psm2_am_register_handlers(psm2_ep_t ep,
 {
 	int i, j;
 
+	psmi_assert_always(ep->am_htable != NULL);
+
 	PSM2_LOG_MSG("entering");
 	/* For now just assign any free one */
-	for (i = 0, j = 0; i < PSMI_AM_NUM_HANDLERS; i++) {
-		if (ep->am_htable[i] == _ignore_handler) {
-			ep->am_htable[i] = handlers[j];
+	for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) {
+		if (ep->am_htable[i].hfn == _ignore_handler) {
+			ep->am_htable[i].hfn = handlers[j];
+			ep->am_htable[i].hctx = NULL;
+			ep->am_htable[i].version = PSM2_AM_HANDLER_V1;
 			handlers_idx[j] = i;
 			if (++j == num_handlers)	/* all registered */
 				break;
@@ -146,8 +154,11 @@ __psm2_am_register_handlers(psm2_ep_t ep,
 
 	if (j < num_handlers) {
 		/* Not enough free handlers, restore unused handlers */
-		for (i = 0; i < j; i++)
-			ep->am_htable[handlers_idx[i]] = _ignore_handler;
+		for (i = 0; i < j; i++) {
+			ep->am_htable[handlers_idx[i]].hfn = _ignore_handler;
+			ep->am_htable[handlers_idx[i]].hctx = NULL;
+			ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2;
+		}
 		PSM2_LOG_MSG("leaving");
 		return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES,
 					 "Insufficient "
@@ -162,6 +173,48 @@ __psm2_am_register_handlers(psm2_ep_t ep,
 PSMI_API_DECL(psm2_am_register_handlers)
 
 psm2_error_t
+__psm2_am_register_handlers_2(psm2_ep_t ep,
+			   const psm2_am_handler_2_fn_t *handlers,
+			   int num_handlers, void **hctx, int *handlers_idx)
+{
+	int i, j;
+
+	psmi_assert_always(ep->am_htable != NULL);
+
+	PSM2_LOG_MSG("entering");
+	/* For now just assign any free one */
+	for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) {
+		if (ep->am_htable[i].hfn == _ignore_handler) {
+			ep->am_htable[i].hfn = handlers[j];
+			ep->am_htable[i].hctx = hctx[j];
+			ep->am_htable[i].version = PSM2_AM_HANDLER_V2;
+			handlers_idx[j] = i;
+			if (++j == num_handlers)	/* all registered */
+				break;
+		}
+	}
+
+	if (j < num_handlers) {
+		/* Not enough free handlers, restore unused handlers */
+		for (i = 0; i < j; i++) {
+			ep->am_htable[handlers_idx[i]].hfn = _ignore_handler;
+			ep->am_htable[handlers_idx[i]].hctx = NULL;
+			ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2;
+		}
+		PSM2_LOG_MSG("leaving");
+		return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES,
+					 "Insufficient "
+					 "available AM handlers: registered %d of %d requested handlers",
+					 j, num_handlers);
+	}
+	else {
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK;
+	}
+}
+PSMI_API_DECL(psm2_am_register_handlers_2)
+
+psm2_error_t
 __psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
 		       psm2_amarg_t *args, int nargs, void *src, size_t len,
 		       int flags, psm2_am_completion_fn_t completion_fn,
diff --git a/psm_am_internal.h b/psm_am_internal.h
index 29edfb8..bc2c128 100644
--- a/psm_am_internal.h
+++ b/psm_am_internal.h
@@ -61,7 +61,21 @@
 
 #define PSMI_AM_ARGS_DEFAULT psm2_am_token_t token,			\
 			     psm2_amarg_t *args, int nargs,		\
-			     void *src, uint32_t len
+			     void *src, uint32_t len,			\
+			     void *hctx
+
+enum psm2_am_handler_version
+{
+	PSM2_AM_HANDLER_V1 = 0,
+	PSM2_AM_HANDLER_V2,
+};
+
+struct psm2_ep_am_handle_entry
+{
+	void *hfn;
+	void *hctx;
+	enum psm2_am_handler_version version;
+};
 
 struct psmi_am_token {
 	psm2_epaddr_t epaddr_incoming;
@@ -77,14 +91,14 @@ struct psmi_am_token {
    various assertions reference these parameters for sanity checking. */
 extern struct psm2_am_parameters psmi_am_parameters;
 
-PSMI_ALWAYS_INLINE(psm2_am_handler_fn_t
+PSMI_ALWAYS_INLINE(struct psm2_ep_am_handle_entry *
 		   psm_am_get_handler_function(psm2_ep_t ep,
 					       psm2_handler_t handler_idx))
 {
 	int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS - 1);
-	psm2_am_handler_fn_t fn = (psm2_am_handler_fn_t) ep->am_htable[hidx];
-	psmi_assert_always(fn != NULL);
-	return fn;
+	struct psm2_ep_am_handle_entry *hentry = &ep->am_htable[hidx];
+	psmi_assert_always(hentry != NULL);
+	return hentry;
 }
 
 /* PSM internal initialization */
diff --git a/psm_context.c b/psm_context.c
index e87b69e..b2181b1 100644
--- a/psm_context.c
+++ b/psm_context.c
@@ -515,8 +515,10 @@ psm2_error_t psmi_context_close(psmi_context_t *context)
 		/* only unmap the RTAIL if it was enabled in the first place */
 		if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) {
 			munmap((void*)PSMI_ALIGNDOWN(binfo->rcvhdrtail_base, __hfi_pg_sz),
-			       __hfi_pg_sz);
+				__hfi_pg_sz);
 		}
+		munmap((void*)PSMI_ALIGNDOWN(binfo->user_regbase, __hfi_pg_sz),
+			__hfi_pg_sz);
 		munmap((void*)PSMI_ALIGNDOWN(binfo->events_bufbase, __hfi_pg_sz),
 		       __hfi_pg_sz);
 		munmap((void*)PSMI_ALIGNDOWN(binfo->status_bufbase, __hfi_pg_sz),
@@ -669,12 +671,13 @@ psmi_init_userinfo_params(psm2_ep_t ep, int unit_id,
 		max_contexts = max(env_maxctxt.e_int, 1);		/* needs to be non-negative */
 		ask_contexts = min(max_contexts, avail_contexts);	/* needs to be available */
 	} else if (!psmi_getenv("PSM2_SHAREDCONTEXTS_MAX",
-			 "Maximum number of contexts for this PSM2 job",
-			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
-			 (union psmi_envvar_val)avail_contexts, &env_maxctxt)) {
+				"",  /* deprecated */
+				PSMI_ENVVAR_LEVEL_HIDDEN | PSMI_ENVVAR_LEVEL_NEVER_PRINT,
+				PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)avail_contexts, &env_maxctxt)) {
 
 		_HFI_INFO
-		    ("This env variable is deprecated. Please use PSM2_MAX_CONTEXTS_PER_JOB in future.\n");
+		    ("The PSM2_SHAREDCONTEXTS_MAX env variable is deprecated. Please use PSM2_MAX_CONTEXTS_PER_JOB in future.\n");
 
 		max_contexts = max(env_maxctxt.e_int, 1);		/* needs to be non-negative */
 		ask_contexts = min(max_contexts, avail_contexts);	/* needs to be available */
@@ -717,7 +720,7 @@ psmi_init_userinfo_params(psm2_ep_t ep, int unit_id,
 		if (contexts > ask_contexts) {
 			err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
 						"Incompatible settings for "
-						"(PSM2_SHAREDCONTEXTS_MAX / PSM2_MAX_CONTEXTS_PER_JOB) and PSM2_RANKS_PER_CONTEXT");
+						"PSM2_MAX_CONTEXTS_PER_JOB and PSM2_RANKS_PER_CONTEXT");
 			goto fail;
 		}
 		ask_contexts = contexts;
diff --git a/psm_ep.h b/psm_ep.h
index 78b12f1..4354e75 100644
--- a/psm_ep.h
+++ b/psm_ep.h
@@ -173,7 +173,7 @@ struct psm2_ep {
 	struct psm2_ep *mctxt_master;
 
 	/* Active Message handler table */
-	void **am_htable;
+	struct psm2_ep_am_handle_entry *am_htable;
 
 	uint64_t gid_hi;
 	uint64_t gid_lo;
diff --git a/psm_ep_connect.c b/psm_ep_connect.c
index 9657209..1eb836f 100644
--- a/psm_ep_connect.c
+++ b/psm_ep_connect.c
@@ -132,7 +132,7 @@ __psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epi
 			if (psmi_epid_version(array_of_epid[j]) >
 						 PSMI_EPID_VERSION) {
 					psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
-					  " Unkown version of EPID - %"PRIu64" \n"
+					  " Unknown version of EPID - %"PRIu64" \n"
 					  "Please upgrade PSM2 or set PSM2_ADDR_FMT=1 in the environment to force EPID version 1 \n",
 					  psmi_epid_version(array_of_epid[j]));
 			}
diff --git a/psm_mq.c b/psm_mq.c
index 44b602a..37290bd 100644
--- a/psm_mq.c
+++ b/psm_mq.c
@@ -1166,6 +1166,38 @@ __psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status)
 }
 PSMI_API_DECL(psm2_mq_ipeek)
 
+psm2_error_t __psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *oreq)
+{
+	psm2_mq_req_t req;
+
+	PSMI_ASSERT_INITIALIZED();
+	PSMI_LOCK(mq->progress_lock);
+	if (mq->completed_q.first == NULL)
+		psmi_poll_internal(mq->ep, 1);
+	if ((req = mq->completed_q.first) == NULL) {
+		PSMI_UNLOCK(mq->progress_lock);
+		return PSM2_MQ_NO_COMPLETIONS;
+	}
+	mq_qq_remove(&mq->completed_q, req);
+	PSMI_UNLOCK(mq->progress_lock);
+	*oreq = req;
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_mq_ipeek_dequeue)
+
+psm2_error_t __psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req)
+{
+	PSMI_ASSERT_INITIALIZED();
+	if (req == NULL)
+		return PSM2_OK;
+	PSMI_LOCK(mq->progress_lock);
+	psmi_mq_req_free(req);
+	PSMI_UNLOCK(mq->progress_lock);
+
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_mq_req_free)
+
 static
 psm2_error_t psmi_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get)
 {
diff --git a/psm_mq_internal.h b/psm_mq_internal.h
index f20bf34..0f30e5c 100644
--- a/psm_mq_internal.h
+++ b/psm_mq_internal.h
@@ -231,7 +231,11 @@ struct psm2_mq_req {
 
 	/* Tag matching vars */
 	psm2_epaddr_t peer;
-	psm2_mq_tag_t tag;
+	psm2_mq_tag_t tag __attribute__ ((aligned(16)));/* Alignment added
+							 * to preserve the
+							 * layout as is
+							 * expected by
+							 * existing code */
 	psm2_mq_tag_t tagsel;	/* used for receives */
 
 	/* Some PTLs want to get notified when there's a test/wait event */
@@ -279,6 +283,8 @@ struct psm2_mq_req {
 	uint8_t cuda_ipc_handle_attached;
 #endif
 
+	uint64_t user_reserved[4];
+
 	/* PTLs get to store their own per-request data.  MQ manages the allocation
 	 * by allocating psm2_mq_req so that ptl_req_data has enough space for all
 	 * possible PTLs.
diff --git a/psm_perf.c b/psm_perf.c
index f3d7e94..aaf3fd0 100644
--- a/psm_perf.c
+++ b/psm_perf.c
@@ -62,8 +62,14 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <asm/unistd.h>
+#include <linux/perf_event.h>
+
+/* Configuration */
 
-struct rdpmc_ctx global_rdpmc_ctx;
+#define RDPMC_PERF_DEFAULT_TYPE   (PERF_TYPE_HARDWARE)
+#define RDPMC_PERF_DEFAULT_CONFIG (PERF_COUNT_HW_CPU_CYCLES)
+
+__thread struct rdpmc_ctx global_rdpmc_ctx;
 
 u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER];
 u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER];
@@ -71,8 +77,8 @@ u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
 
 char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
 
-unsigned int global_rdpmc_type   = RDPMC_PERF_DEFAULT_TYPE;
-unsigned int global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
+__thread unsigned int global_rdpmc_type   = RDPMC_PERF_DEFAULT_TYPE;
+__thread unsigned int global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
 
 struct rdpmc_ctx {
 	int fd;
@@ -160,36 +166,7 @@ PSMI_ALWAYS_INLINE(void rdpmc_close(struct rdpmc_ctx *ctx))
 	munmap(ctx->buf, sysconf(_SC_PAGESIZE));
 }
 
-/**
- * rdpmc_read: read a ring 3 readable performance counter
- * @ctx: Pointer to initialized &rdpmc_ctx structure.
- *
- * Read the current value of a running performance counter.
- */
-unsigned long long rdpmc_read(struct rdpmc_ctx *ctx)
-{
-	u64 val;
-	unsigned seq;
-	u64 offset = 0;
-
-	typeof (ctx->buf) buf = ctx->buf;
-	do {
-		seq = buf->lock;
-		ips_rmb();
-		if (buf->index <= 0)
-			return buf->offset;
-#if defined(__ICC) || defined(__INTEL_COMPILER)
-                val = _rdpmc(buf->index - 1);
-#else /* GCC */
-                val = __builtin_ia32_rdpmc(buf->index - 1);
-#endif
-		offset = buf->offset;
-		ips_rmb();
-	} while (buf->lock != seq);
-	return val + offset;
-}
-
-void psmi_rdpmc_perf_framework_init()
+static void psmi_rdpmc_perf_framework_init()
 {
     int rdpmc_retval;
 
@@ -243,4 +220,41 @@ void psmi_rdpmc_perf_framework_init()
     }
 }
 
+/**
+ * rdpmc_read: read a ring 3 readable performance counter
+ * @ctx: Pointer to initialized &rdpmc_ctx structure.
+ *
+ * Read the current value of a running performance counter.
+ */
+unsigned long long rdpmc_read(struct rdpmc_ctx *ctx)
+{
+	static __thread int rdpmc_perf_initialized = 0;
+
+	if_pf(!rdpmc_perf_initialized)
+	{
+		psmi_rdpmc_perf_framework_init();
+		rdpmc_perf_initialized = 1;
+	}
+
+	u64 val;
+	unsigned seq;
+	u64 offset = 0;
+
+	typeof (ctx->buf) buf = ctx->buf;
+	do {
+		seq = buf->lock;
+		ips_rmb();
+		if (buf->index <= 0)
+			return buf->offset;
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+                val = _rdpmc(buf->index - 1);
+#else /* GCC */
+                val = __builtin_ia32_rdpmc(buf->index - 1);
+#endif
+		offset = buf->offset;
+		ips_rmb();
+	} while (buf->lock != seq);
+	return val + offset;
+}
+
 #endif /* RDPMC_PERF_FRAMEWORK */
diff --git a/psm_perf.h b/psm_perf.h
index 6fa06d2..b6b77f0 100644
--- a/psm_perf.h
+++ b/psm_perf.h
@@ -51,21 +51,19 @@
 
 */
 
-#ifdef RDPMC_PERF_FRAMEWORK
+#define PSM_TX_SPEEDPATH_CTR 0
+#define PSM_RX_SPEEDPATH_CTR 1
 
-#include <linux/perf_event.h>
+#ifdef RDPMC_PERF_FRAMEWORK
 
 /* Configuration */
 
-#define RDPMC_PERF_DEFAULT_TYPE   (PERF_TYPE_HARDWARE)
-#define RDPMC_PERF_DEFAULT_CONFIG (PERF_COUNT_HW_CPU_CYCLES)
-
 #define RDPMC_PERF_MAX_SLOT_NUMBER (8)
 #define RDPMC_PERF_MAX_SLOT_NAME   (256)
 
 /* RDPMC infrastructure */
 
-extern struct rdpmc_ctx global_rdpmc_ctx;
+extern __thread struct rdpmc_ctx global_rdpmc_ctx;
 
 typedef unsigned long long u64;
 
@@ -75,10 +73,8 @@ extern u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
 
 extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
 
-extern unsigned int global_rdpmc_type;
-extern unsigned int global_rdpmc_config;
-
-extern void psmi_rdpmc_perf_framework_init();
+extern __thread unsigned int global_rdpmc_type;
+extern __thread unsigned int global_rdpmc_config;
 
 extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx);
 
diff --git a/psm_user.h b/psm_user.h
index dd5384f..437c983 100644
--- a/psm_user.h
+++ b/psm_user.h
@@ -154,13 +154,13 @@ _psmi_get_epid_version()) {
 	return psmi_epid_ver;
 }
 
-#define PSMI_EPID_VERSION_SHM 			0
+#define PSMI_EPID_VERSION_SHM 				0
 #define PSMI_EPID_SHM_ONLY				1
 #define PSMI_EPID_IPS_SHM				0
 #define PSMI_EPID_VERSION 				_psmi_get_epid_version()
-#define PSMI_MAX_EPID_VERNO_SUPPORTED	2
-#define PSMI_MIN_EPID_VERNO_SUPPORTED	1
-#define PSMI_EPID_VERNO_DEFAULT			2
+#define PSMI_MAX_EPID_VERNO_SUPPORTED			2
+#define PSMI_MIN_EPID_VERNO_SUPPORTED			1
+#define PSMI_EPID_VERNO_DEFAULT				2
 #define PSMI_EPID_V1					1
 #define PSMI_EPID_V2					2
 
@@ -240,18 +240,18 @@ _psmi_mutex_unlock_inner(pthread_mutex_t *mutex,
 
 #define _PSMI_LOCK_INIT(pl)	/* static initialization */
 #define _PSMI_LOCK_TRY(pl)							\
-	    _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC,	\
+	    _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC,		\
 					&((pl).lock_owner))
 #define _PSMI_LOCK(pl)								\
-	    _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC,	\
+	    _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC,			\
                                         &((pl).lock_owner))
 #define _PSMI_UNLOCK(pl)							\
-	    _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC,	\
+	    _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC,			\
                                         &((pl).lock_owner))
 #define _PSMI_LOCK_ASSERT(pl)							\
-	    psmi_assert_always(pl.lock_owner == pthread_self());
-#define _PSMI_UNLOCK_ASSERT(pl)						\
-	    psmi_assert_always(pl.lock_owner != pthread_self());
+	psmi_assert_always((pl).lock_owner == pthread_self());
+#define _PSMI_UNLOCK_ASSERT(pl)							\
+	psmi_assert_always((pl).lock_owner != pthread_self());
 #define PSMI_LOCK_DISABLED	0
 
 #elif defined(PSMI_LOCK_IS_MUTEXLOCK)
@@ -409,7 +409,7 @@ cudaError_t (*psmi_cudaIpcCloseMemHandle)(void* devPtr);
 
 #define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do {			\
 		cudaerr = psmi_cudaEventQuery(event);			\
-		if ((cudaerr != cudaSuccess) &&			\
+		if ((cudaerr != cudaSuccess) &&			        \
 		    (cudaerr != cudaErrorNotReady)) {			\
 			_HFI_ERROR(					\
 				"CUDA failure: %s() returned %d\n",	\
@@ -420,7 +420,15 @@ cudaError_t (*psmi_cudaIpcCloseMemHandle)(void* devPtr);
 		}							\
 	} while (0)
 
-
+#define PSMI_CUDA_DLSYM(psmi_cuda_lib,func) do {                        \
+	psmi_##func = dlsym(psmi_cuda_lib, STRINGIFY(func));            \
+	if (!psmi_##func) {               				\
+		psmi_handle_error(PSMI_EP_NORETURN,                     \
+			       PSM2_INTERNAL_ERR,                       \
+			       " Unable to resolve %s symbol"		\
+			       " in CUDA libraries.\n",STRINGIFY(func));\
+	}                                                               \
+} while (0)
 
 PSMI_ALWAYS_INLINE(
 int
diff --git a/psm_utils.c b/psm_utils.c
index df45cdd..e5d4fbc 100644
--- a/psm_utils.c
+++ b/psm_utils.c
@@ -401,12 +401,15 @@ static int psmi_getenv_is_verblevel(int printlevel)
 	return (printlevel <= psmi_getenv_verblevel);
 }
 
-#define GETENV_PRINTF(_level, _fmt, ...)			\
-	do {							\
-		int nlevel = _level;				\
-		if (psmi_getenv_is_verblevel(nlevel))		\
-		nlevel = 0;					\
-		_HFI_ENVDBG(nlevel, _fmt, ##__VA_ARGS__);	\
+#define GETENV_PRINTF(_level, _fmt, ...)				\
+	do {								\
+		if ((_level & PSMI_ENVVAR_LEVEL_NEVER_PRINT) == 0)	\
+		{							\
+			int nlevel = _level;				\
+			if (psmi_getenv_is_verblevel(nlevel))		\
+				nlevel = 0;				\
+			_HFI_ENVDBG(nlevel, _fmt, ##__VA_ARGS__);	\
+		}							\
 	} while (0)
 
 int
@@ -476,10 +479,14 @@ MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
 			used_default = 1;
 		} else {
 			char *ep;
-			tval.e_int = (int)strtol(env, &ep, 0);
+			/* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */
+			tval.e_int = (int)strtol(env, &ep, 10);
 			if (ep == env) {
-				used_default = 1;
-				tval = defval;
+				tval.e_int = (int)strtol(env, &ep, 16);
+				if (ep == env) {
+					used_default = 1;
+					tval = defval;
+				}
 			}
 		}
 		_GETENV_PRINT(used_default, "%d", tval.e_int, defval.e_int);
@@ -492,10 +499,14 @@ MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
 			used_default = 1;
 		} else {
 			char *ep;
-			tval.e_int = (unsigned int)strtoul(env, &ep, 0);
+			/* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */
+			tval.e_int = (unsigned int)strtoul(env, &ep, 10);
 			if (ep == env) {
-				used_default = 1;
-				tval = defval;
+				tval.e_int = (unsigned int)strtoul(env, &ep, 16);
+				if (ep == env) {
+					used_default = 1;
+					tval = defval;
+				}
 			}
 		}
 		if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS)
@@ -512,10 +523,14 @@ MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
 			used_default = 1;
 		} else {
 			char *ep;
-			tval.e_long = strtol(env, &ep, 0);
+			/* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */
+			tval.e_long = strtol(env, &ep, 10);
 			if (ep == env) {
-				used_default = 1;
-				tval = defval;
+				tval.e_long = strtol(env, &ep, 16);
+				if (ep == env) {
+					used_default = 1;
+					tval = defval;
+				}
 			}
 		}
 		_GETENV_PRINT(used_default, "%ld", tval.e_long, defval.e_long);
@@ -526,11 +541,16 @@ MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
 			used_default = 1;
 		} else {
 			char *ep;
+		 	/* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */
 			tval.e_ulonglong =
-			    (unsigned long long)strtoull(env, &ep, 0);
+			    (unsigned long long)strtoull(env, &ep, 10);
 			if (ep == env) {
-				used_default = 1;
-				tval = defval;
+				tval.e_ulonglong =
+				    (unsigned long long)strtoull(env, &ep, 16);
+				if (ep == env) {
+					used_default = 1;
+					tval = defval;
+				}
 			}
 		}
 		_GETENV_PRINT(used_default, "%llu",
@@ -544,10 +564,14 @@ MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
 			used_default = 1;
 		} else {
 			char *ep;
-			tval.e_ulong = (unsigned long)strtoul(env, &ep, 0);
+			/* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */
+			tval.e_ulong = (unsigned long)strtoul(env, &ep, 10);
 			if (ep == env) {
-				used_default = 1;
-				tval = defval;
+				tval.e_ulong = (unsigned long)strtoul(env, &ep, 16);
+				if (ep == env) {
+					used_default = 1;
+					tval = defval;
+				}
 			}
 		}
 		if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS)
diff --git a/psm_utils.h b/psm_utils.h
index 07d198b..3358704 100644
--- a/psm_utils.h
+++ b/psm_utils.h
@@ -259,8 +259,9 @@ union psmi_envvar_val {
 	unsigned long long e_ulonglong;
 };
 
-#define PSMI_ENVVAR_LEVEL_USER	    1
-#define PSMI_ENVVAR_LEVEL_HIDDEN    2
+#define PSMI_ENVVAR_LEVEL_USER	         1
+#define PSMI_ENVVAR_LEVEL_HIDDEN         2
+#define PSMI_ENVVAR_LEVEL_NEVER_PRINT    4
 
 #define PSMI_ENVVAR_TYPE_YESNO		0
 #define PSMI_ENVVAR_TYPE_STR		1
diff --git a/ptl_am/Makefile b/ptl_am/Makefile
index 5aa5a46..1109e89 100644
--- a/ptl_am/Makefile
+++ b/ptl_am/Makefile
@@ -55,7 +55,6 @@ OUTDIR = .
 
 this_srcdir := $(shell readlink -m .)
 top_srcdir := $(this_srcdir)/..
-include $(top_srcdir)/buildflags.mak
 INCLUDES += -I$(top_srcdir)
 
 ${TARGLIB}-objs := am_reqrep.o am_reqrep_shmem.o ptl.o cmarwu.o
@@ -69,10 +68,10 @@ IGNORE_DEP_TARGETS = clean
 all .DEFAULT: ${${TARGLIB}-objs}
 
 $(OUTDIR)/%.d: $(this_srcdir)/%.c
-	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
 
 $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
-	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
 
 clean:
 	@if [ -d $(OUTDIR) ]; then \
diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c
index 1f20cdf..99479c5 100644
--- a/ptl_am/ptl.c
+++ b/ptl_am/ptl.c
@@ -350,15 +350,27 @@ void
 psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len)
 {
 	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
-	psm2_am_handler_fn_t hfn;
+	struct psm2_ep_am_handle_entry *hentry;
 
 	psmi_assert(toki != NULL);
 
-	hfn = psm_am_get_handler_function(tok->mq->ep,
+	hentry = psm_am_get_handler_function(tok->mq->ep,
 					  (psm2_handler_t) args[0].u32w0);
 
+	/* Note a guard here for hentry != NULL is not needed because at
+	 * initialization, a psmi_assert_always() assures the entry will be
+	 * non-NULL. */
+
 	/* Invoke handler function. For AM we do not support break functionality */
-	hfn(toki, args + 1, narg - 1, buf, len);
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		hfn2(toki, args + 1, narg - 1, buf, len, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		hfn1(toki, args + 1, narg - 1, buf, len);
+	}
 
 	return;
 }
diff --git a/ptl_ips/Makefile b/ptl_ips/Makefile
index d48c883..86e2055 100644
--- a/ptl_ips/Makefile
+++ b/ptl_ips/Makefile
@@ -55,7 +55,6 @@ OUTDIR = .
 
 this_srcdir = $(shell readlink -m .)
 top_srcdir := $(this_srcdir)/..
-include $(top_srcdir)/buildflags.mak
 INCLUDES += -I$(top_srcdir)
 
 ${TARGLIB}-objs := ptl.o ptl_rcvthread.o ips_proto.o ipserror.o ips_recvq.o \
@@ -74,10 +73,10 @@ IGNORE_DEP_TARGETS = clean
 all .DEFAULT: ${${TARGLIB}-objs}
 
 $(OUTDIR)/%.d: $(this_srcdir)/%.c
-	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
 
 $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
-	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
 
 clean:
 	@if [ -d $(OUTDIR) ]; then \
diff --git a/ptl_ips/ips_path_rec.c b/ptl_ips/ips_path_rec.c
index 647b111..1d52a55 100644
--- a/ptl_ips/ips_path_rec.c
+++ b/ptl_ips/ips_path_rec.c
@@ -659,7 +659,7 @@ MOCKABLE(ips_ibta_init)(struct ips_proto *proto)
 		_HFI_PRDBG("Static path selection: Base LID\n");
 
 	psmi_getenv("PSM2_DISABLE_CCA",
-		    "Disable use of Congestion Control Architecure (CCA) [enabled] ",
+		    "Disable use of Congestion Control Architecture (CCA) [enabled] ",
 		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
 		    (union psmi_envvar_val)0, &disable_cca);
 	if (disable_cca.e_uint)
diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c
index 150bda1..2c4ebd9 100644
--- a/ptl_ips/ips_proto.c
+++ b/ptl_ips/ips_proto.c
@@ -1097,6 +1097,10 @@ ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire)
 	while (ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail].msg_queue_mask) {
 		cqe = &ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail];
 
+		/* When PSM_PERF is enabled, the following line causes the
+		   PMU to start a stop watch to measure instruction cycles of the
+		   TX speedpath of PSM.  The stop watch is stopped below. */
+		GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR);
 		if (cqe->msg_scb.flow->transfer == PSM_TRANSFER_PIO) {
 			err = ips_spio_transfer_frame(proto,
 				cqe->msg_scb.flow, &cqe->msg_scb.pbc,
@@ -1112,6 +1116,10 @@ ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire)
 				cqe->msg_scb.cksum, 0,
 				have_cksum, cqe->msg_scb.cksum[0]);
 		}
+		/* When PSM_PERF is enabled, the following line causes the
+		   PMU to stop a stop watch to measure instruction cycles of the
+		   TX speedpath of PSM.  The stop watch was started above. */
+		GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR);
 
 		if (err == PSM2_OK) {
 			ips_proto_epaddr_stats_set(proto, cqe->message_type);
@@ -1197,6 +1205,10 @@ ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type,
 
 	switch (flow->transfer) {
 	case PSM_TRANSFER_PIO:
+		/* When PSM_PERF is enabled, the following line causes the
+		   PMU to start a stop watch to measure instruction cycles of the
+		   TX speedpath of PSM.  The stop watch is stopped below. */
+		GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR);
 		err = ips_spio_transfer_frame(proto, flow,
 			     &ctrlscb->pbc, payload, paylen,
 			     PSMI_TRUE, have_cksum, ctrlscb->cksum[0]
@@ -1204,11 +1216,23 @@ ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type,
 			     , 0
 #endif
 			     );
+		/* When PSM_PERF is enabled, the following line causes the
+		   PMU to stop a stop watch to measure instruction cycles of the
+		   TX speedpath of PSM.  The stop watch was started above. */
+		GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR);
 		break;
 	case PSM_TRANSFER_DMA:
+		/* When PSM_PERF is enabled, the following line causes the
+		   PMU to start a stop watch to measure instruction cycles of the
+		   TX speedpath of PSM.  The stop watch is stopped below. */
+		GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR);
 		err = ips_dma_transfer_frame(proto, flow,
 			     ctrlscb, payload, paylen,
 			     have_cksum, ctrlscb->cksum[0]);
+		/* When PSM_PERF is enabled, the following line causes the
+		   PMU to stop a stop watch to measure instruction cycles of the
+		   TX speedpath of PSM.  The stop watch was started above. */
+		GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR);
 		break;
 	default:
 		err = PSM2_INTERNAL_ERR;
@@ -1347,6 +1371,10 @@ ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed)
 		scb = SLIST_FIRST(scb_pend);
 		psmi_assert(scb->nfrag == 1);
 
+		/* When PSM_PERF is enabled, the following line causes the
+		   PMU to start a stop watch to measure instruction cycles of the
+		   TX speedpath of PSM.  The stop watch is stopped below. */
+		GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR);
 		if ((err = ips_spio_transfer_frame(proto, flow, &scb->pbc,
 						   ips_scb_buffer(scb),
 						   scb->payload_size,
@@ -1359,6 +1387,10 @@ ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed)
 						   , IS_TRANSFER_BUF_GPU_MEM(scb)
 #endif
 						)) == PSM2_OK) {
+			/* When PSM_PERF is enabled, the following line causes the
+			   PMU to stop a stop watch to measure instruction cycles of the
+			   TX speedpath of PSM.  The stop watch was started above. */
+			GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR);
 			t_cyc = get_cycles();
 			scb->flags &= ~IPS_SEND_FLAG_PENDING;
 			scb->ack_timeout = proto->epinfo.ep_timeout_ack;
@@ -1373,7 +1405,13 @@ ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed)
 #endif
 
 		} else
+		{
+			/* When PSM_PERF is enabled, the following line causes the
+			   PMU to stop a stop watch to measure instruction cycles of the
+			   TX speedpath of PSM.  The stop watch was started above. */
+			GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR);
 			break;
+		}
 	}
 
 	/* If out of flow credits re-schedule send timer */
diff --git a/ptl_ips/ips_proto_am.c b/ptl_ips/ips_proto_am.c
index 98a7460..f5eb1cf 100644
--- a/ptl_ips/ips_proto_am.c
+++ b/ptl_ips/ips_proto_am.c
@@ -410,7 +410,8 @@ ips_am_run_handler(const struct ips_message_header *p_hdr,
 {
 	struct ips_am_token token;
 	int nargs = p_hdr->amhdr_nargs;
-	psm2_am_handler_fn_t hfn;
+	int ret;
+	struct psm2_ep_am_handle_entry *hentry;
 	psm2_amarg_t *args = (psm2_amarg_t *)p_hdr->data;
 
 	token.tok.flags = p_hdr->flags;
@@ -449,10 +450,23 @@ ips_am_run_handler(const struct ips_message_header *p_hdr,
 		paylen -= p_hdr->amhdr_len;
 	}
 
-	hfn = psm_am_get_handler_function(proto_am->proto->ep,
+	hentry = psm_am_get_handler_function(proto_am->proto->ep,
 			p_hdr->amhdr_hidx);
 
-	int ret = hfn(&token, args, nargs, payload, paylen);
+	/* Note a guard here for hentry != NULL is not needed because at
+	 * initialization, a psmi_assert_always() assures the entry will be
+	 * non-NULL. */
+
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		ret = hfn2(&token, args, nargs, payload, paylen, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		ret = hfn1(&token, args, nargs, payload, paylen);
+	}
+
 	return ret;
 }
 
diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c
index 4b2617f..7c61399 100644
--- a/ptl_ips/ips_recvhdrq.c
+++ b/ptl_ips/ips_recvhdrq.c
@@ -438,6 +438,10 @@ process_pending_acks(struct ips_recvhdrq *recvq))
  */
 psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
 {
+	/* When PSM_PERF is enabled, the following line causes the
+	   PMU to start a stop watch to measure instruction cycles of the
+	   RX speedpath of PSM.  The stop watch is stopped below. */
+	GENERIC_PERF_BEGIN(PSM_RX_SPEEDPATH_CTR);
 	struct ips_recvhdrq_state *state = recvq->state;
 	const __le32 *rhf;
 	PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto =
@@ -545,6 +549,11 @@ psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
 			if (ret == IPS_RECVHDRQ_REVISIT)
 			{
 				PSM2_LOG_MSG("leaving");
+				/* When PSM_PERF is enabled, the following line causes the
+				   PMU to stop a stop watch to measure instruction cycles of
+				   the RX speedpath of PSM.  The stop watch was started
+				   above. */
+				GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR);
 				return PSM2_OK_NO_PROGRESS;
 			}
 
@@ -631,6 +640,11 @@ psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
 			if (ret == IPS_RECVHDRQ_REVISIT)
 			{
 				PSM2_LOG_MSG("leaving");
+				/* When PSM_PERF is enabled, the following line causes the
+				   PMU to stop a stop watch to measure instruction cycles of
+				   the RX speedpath of PSM.  The stop watch was started
+				   above. */
+				GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR);
 				return PSM2_OK_NO_PROGRESS;
 			}
 		}
@@ -726,6 +740,11 @@ skip_packet_no_egr_update:
 	process_pending_acks(recvq);
 
 	PSM2_LOG_MSG("leaving");
+	/* When PSM_PERF is enabled, the following line causes the
+	   PMU to stop a stop watch to measure instruction cycles of
+	   the RX speedpath of PSM.  The stop watch was started
+	   above. */
+	GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR);
 	return num_hdrq_done ? PSM2_OK : PSM2_OK_NO_PROGRESS;
 }
 
diff --git a/ptl_ips/ips_tidcache.c b/ptl_ips/ips_tidcache.c
index ecc0bba..aad1ee9 100644
--- a/ptl_ips/ips_tidcache.c
+++ b/ptl_ips/ips_tidcache.c
@@ -203,14 +203,21 @@ retry:
 			 * PSM frees tidcache enteries when the driver sends
 			 * EINVAL there by unpinning pages and freeing some
 			 * BAR1 space.*/
-		     || (PSMI_IS_CUDA_ENABLED && errno == EINVAL)
+		     || (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)start) && errno == EINVAL)
 #endif
 			) && NIDLE) {
 			uint64_t lengthEvicted = ips_tidcache_evict(tidc,length);
 
 			if (lengthEvicted >= length)
 				goto retry;
-		}
+		} else if (errno == EFAULT)
+                       psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+                                " Unhandled error in TID Update: %s\n", strerror(errno));
+#ifdef PSM_CUDA
+		else if (PSMI_IS_CUDA_ENABLED && errno == ENOTSUP)
+		       psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+                                " Nvidia driver apis mismatch: %s\n", strerror(errno));
+#endif
 
 		/* Unable to pin pages? retry later */
 		return PSM2_EP_DEVICE_FAILURE;
diff --git a/ptl_self/Makefile b/ptl_self/Makefile
index daeac5b..6af8bf7 100644
--- a/ptl_self/Makefile
+++ b/ptl_self/Makefile
@@ -55,7 +55,6 @@ OUTDIR = .
 
 this_srcdir = $(shell readlink -m .)
 top_srcdir := $(this_srcdir)/..
-include $(top_srcdir)/buildflags.mak
 INCLUDES += -I$(top_srcdir)
 
 ${TARGLIB}-objs := ptl.o
@@ -68,10 +67,10 @@ IGNORE_DEP_TARGETS = clean
 all .DEFAULT: ${${TARGLIB}-objs}
 
 $(OUTDIR)/%.d: $(this_srcdir)/%.c
-	$(CC) $(CFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
 
 $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
-	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
 
 clean:
 	@if [ -d $(OUTDIR) ]; then \
diff --git a/ptl_self/ptl.c b/ptl_self/ptl.c
index da613d9..4e42bef 100644
--- a/ptl_self/ptl.c
+++ b/ptl_self/ptl.c
@@ -223,14 +223,27 @@ self_am_short_request(psm2_epaddr_t epaddr,
 		      psm2_am_completion_fn_t completion_fn,
 		      void *completion_ctxt)
 {
-	psm2_am_handler_fn_t hfn;
+	struct psm2_ep_am_handle_entry *hentry;
 	psm2_ep_t ep = epaddr->ptlctl->ptl->ep;
 	struct psmi_am_token tok;
 
 	tok.epaddr_incoming = epaddr;
 
-	hfn = psm_am_get_handler_function(ep, handler);
-	hfn(&tok, args, nargs, src, len);
+	hentry = psm_am_get_handler_function(ep, handler);
+
+	/* Note a guard here for hentry != NULL is not needed because at
+	 * initialization, a psmi_assert_always() assures the entry will be
+	 * non-NULL. */
+
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		hfn2(&tok, args, nargs, src, len, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		hfn1(&tok, args, nargs, src, len);
+	}
 
 	if (completion_fn) {
 		completion_fn(completion_ctxt);
@@ -246,12 +259,25 @@ self_am_short_reply(psm2_am_token_t token,
 		    void *src, size_t len, int flags,
 		    psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
 {
-	psm2_am_handler_fn_t hfn;
+	struct psm2_ep_am_handle_entry *hentry;
 	struct psmi_am_token *tok = token;
 	psm2_ep_t ep = tok->epaddr_incoming->ptlctl->ptl->ep;
 
-	hfn = psm_am_get_handler_function(ep, handler);
-	hfn(token, args, nargs, src, len);
+	hentry = psm_am_get_handler_function(ep, handler);
+
+	/* Note a guard here for hentry != NULL is not needed because at
+	 * initialization, a psmi_assert_always() assures the entry will be
+	 * non-NULL. */
+
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		hfn2(token, args, nargs, src, len, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		hfn1(token, args, nargs, src, len);
+	}
 
 	if (completion_fn) {
 		completion_fn(completion_ctxt);
diff --git a/rpm_release_extension b/rpm_release_extension
index 98d9bcb..81b5c5d 100644
--- a/rpm_release_extension
+++ b/rpm_release_extension
@@ -1 +1 @@
-17
+37

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ofed/libpsm2.git



More information about the Pkg-ofed-commits mailing list