[superlu-dist] 01/01: New upstream version 5.2.2+dfsg1
Drew Parsons
dparsons at moszumanska.debian.org
Wed Nov 8 09:06:05 UTC 2017
This is an automated email from the git hooks/post-receive script.
dparsons pushed a commit to annotated tag upstream/5.2.2+dfsg1
in repository superlu-dist.
commit d99f988c13b18c2e9810e13706c9c6776ded6e9c
Author: Drew Parsons <dparsons at debian.org>
Date: Wed Nov 8 17:03:38 2017 +0800
New upstream version 5.2.2+dfsg1
---
.gitignore | 5 +-
CBLAS/Makefile | 10 +-
CMakeLists.txt | 93 +-
DOC/ug.pdf | Bin 687318 -> 0 bytes
DoxyConfig | 4 +-
EXAMPLE/CMakeLists.txt | 22 +-
EXAMPLE/README | 2 +
EXAMPLE/dcreate_matrix.c | 5 +
EXAMPLE/pddrive.c | 16 +-
EXAMPLE/pddrive1.c | 13 +-
EXAMPLE/pddrive2.c | 13 +-
EXAMPLE/pddrive3.c | 13 +-
EXAMPLE/pddrive4.c | 3 +-
EXAMPLE/pzdrive.c | 16 +-
EXAMPLE/pzdrive1.c | 13 +-
EXAMPLE/pzdrive2.c | 13 +-
EXAMPLE/pzdrive3.c | 13 +-
EXAMPLE/pzdrive4.c | 3 +-
EXAMPLE/zcreate_matrix.c | 5 +
INSTALL/superlu_timer.c | 54 -
MAKE_INC/make.cuda_gpu | 4 +-
MAKE_INC/make.mac-x | 43 +
MAKE_INC/make.mpich | 2 +-
MAKE_INC/{make.mpich => make.ssg1} | 20 +-
MAKE_INC/make.xc30 | 2 +-
MAKE_INC/make.xt5 | 3 +-
Makefile | 3 +-
README | 251 ---
README.md | 274 +++
SRC/CMakeLists.txt | 17 +-
SRC/Makefile | 22 +-
SRC/colamd.c | 3424 +++++++++++++++++++++++++++++
SRC/colamd.h | 259 +++
SRC/dSchCompUdt-2Ddynamic.c | 762 ++++---
SRC/dbinary_io.c | 40 +
SRC/dlook_ahead_update.c | 115 +-
SRC/dmemory_dist.c | 7 +-
SRC/dreadMM.c | 26 +-
SRC/dscatter.c | 87 +-
SRC/get_perm_c.c | 41 +
SRC/mc64ad_dist.c | 2654 ----------------------
SRC/memory.c | 31 +-
SRC/pdgssvx.c | 24 +-
SRC/pdgstrf.c | 442 ++--
SRC/pdgstrf2.c | 165 +-
SRC/psymbfact.h | 5 +-
SRC/pzgssvx.c | 24 +-
SRC/pzgstrf.c | 442 ++--
SRC/pzgstrf2.c | 165 +-
SRC/sp_colorder.c | 3 +-
SRC/sp_ienv.c | 5 +-
SRC/static_schedule.c | 10 +
SRC/superlu_ddefs.h | 2 +-
SRC/superlu_defs.h | 15 +-
SRC/superlu_dist_config.h | 4 +
SRC/superlu_dist_config.h.in | 9 +
SRC/superlu_dist_version.c | 30 +
SRC/superlu_enum_consts.h | 3 +
SRC/superlu_zdefs.h | 2 +-
SRC/util.c | 86 +-
SRC/zSchCompUdt-2Ddynamic.c | 762 ++++---
SRC/zbinary_io.c | 40 +
SRC/zlook_ahead_update.c | 115 +-
SRC/zmemory_dist.c | 7 +-
SRC/zreadMM.c | 32 +-
SRC/zscatter.c | 87 +-
TEST/#pztest.c# | 517 +++++
TEST/CMakeLists.txt | 79 +
TEST/Makefile | 56 +
TEST/README | 12 +
{EXAMPLE => TEST}/dcreate_matrix.c | 5 +
TEST/pdcompute_resid.c | 155 ++
TEST/pdtest.c | 519 +++++
TEST/pdtest.sh | 64 +
TEST/pzcompute_resid.c | 154 ++
TEST/pztest.c | 518 +++++
TEST/pztest.sh | 64 +
TEST/runtest.cmake | 13 +
{EXAMPLE => TEST}/zcreate_matrix.c | 5 +
compile.out | 62 +
MAKE_INC/make.mpich => make.inc | 24 +-
make.inc.in | 7 +-
run_cmake_build.csh | 2 +-
run_cmake_build.csh => run_cmake_build.sh | 55 +-
superlu_dist.pc.in | 12 +
85 files changed, 8861 insertions(+), 4344 deletions(-)
diff --git a/.gitignore b/.gitignore
index 2eb65d5..adcaf5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,12 @@
*~
-# You have to ignore this genrated file or git will complain that it is an
+# You have to ignore this generated file or git will complain that it is an
# unknown file!
/make.inc
# If the instructions are telling people to create this build dir under the
# source tree, you had better put in an ignore for this.
/build/
+
+# Ignore Testing/ folder
+Testing/
diff --git a/CBLAS/Makefile b/CBLAS/Makefile
index 5812c03..d0eca9a 100644
--- a/CBLAS/Makefile
+++ b/CBLAS/Makefile
@@ -66,28 +66,28 @@ ALLBLAS = input_error_dist.o
all: single double complex complex16
-single: $(SBLAS1) $(SBLAS2) $(SBLAS3)
+single: $(SBLAS1) $(SBLAS2) $(SBLAS3) $(ALLBLAS)
$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(SBLAS1) $(ALLBLAS) \
$(SBLAS2) $(SBLAS3)
$(RANLIB) $(BLASLIB)
-double: $(DBLAS1) $(DBLAS2) $(DBLAS3)
+double: $(DBLAS1) $(DBLAS2) $(DBLAS3) $(ALLBLAS)
$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \
$(DBLAS2) $(DBLAS3)
$(RANLIB) $(BLASLIB)
-complex: $(CBLAS1) $(CBLAS2) $(CBLAS3)
+complex: $(CBLAS1) $(CBLAS2) $(CBLAS3) $(ALLBLAS)
$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(CBLAS1) $(ALLBLAS) \
$(CBLAS2) $(CBLAS3)
$(RANLIB) $(BLASLIB)
-complex16: $(ZBLAS1) $(ZBLAS2) $(ZBLAS3)
+complex16: $(ZBLAS1) $(ZBLAS2) $(ZBLAS3) $(ALLBLAS)
$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(ZBLAS1) $(ALLBLAS) \
$(ZBLAS2) $(ZBLAS3)
$(RANLIB) $(BLASLIB)
.c.o:
- $(CC) $(CFLAGS) $(CDEFS) -I$(HEADER) -c $< $(VERBOSE)
+ $(CC) $(CFLAGS) $(CDEFS) -c $< $(VERBOSE)
clean:
rm -f *.o
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d082edc..437306d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,8 @@ cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
# Project version numbers
project(SuperLU_DIST NONE)
set(VERSION_MAJOR "5")
-set(VERSION_MINOR "1")
-set(VERSION_BugFix "3")
+set(VERSION_MINOR "2")
+set(VERSION_BugFix "2")
set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
######################################################################
@@ -20,6 +20,8 @@ set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
MESSAGE("\nProcess XSDK defaults ...")
# SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired
INCLUDE("cmake/XSDKDefaults.cmake")
+INCLUDE(CTest)
+
######################################################################
######################################################################
@@ -27,20 +29,8 @@ INCLUDE("cmake/XSDKDefaults.cmake")
# Usual initialization stuff
#
######################################################################
-# setup options
-option(enable_blaslib "Build the CBLAS library" ${enable_blaslib_DEFAULT})
-option(enable_parmetislib "Build the ParMETIS library" ON)
-option(enable_doc "Build doxygen documentation" OFF)
-option(enable_double "Enable double precision library" ON)
-option(enable_complex16 "Enable complex16 precision library" ON)
-option(enable_examples "Build examples" ON)
-option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].")
-option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].")
-
-if (NOT CMAKE_INSTALL_PREFIX)
- set(CMAKE_INSTALL_PREFIX /usr/local)
-endif()
-
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ## ????
+set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
#---- For shared library
@@ -82,13 +72,28 @@ else()
set(enable_blaslib_DEFAULT ON)
endif()
+if (NOT CMAKE_INSTALL_PREFIX)
+ set(CMAKE_INSTALL_PREFIX /usr/local)
+endif()
+
+
+# setup options
+option(enable_blaslib "Build the CBLAS library" ${enable_blaslib_DEFAULT})
+option(enable_parmetislib "Build the ParMETIS library" ON)
+option(enable_doc "Build doxygen documentation" OFF)
+option(enable_double "Enable double precision library" ON)
+option(enable_complex16 "Enable complex16 precision library" ON)
+option(enable_tests "Build tests" ON)
+option(enable_examples "Build examples" ON)
+option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].")
+option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].")
+
# setup required compiler defines and options.
## get_directory_property( DirDefs COMPILE_DEFINITIONS )
-set(CMAKE_C_FLAGS "-DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}")
+# set(CMAKE_C_FLAGS "-DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}")
if(XSDK_INDEX_SIZE EQUAL 64)
message("-- Using 64 bit integer for index size")
- set(CMAKE_C_FLAGS "-D_LONGINT ${CMAKE_C_FLAGS}")
endif()
set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "")
@@ -98,6 +103,23 @@ set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "")
#
######################################################################
#
+#--------------------- MPI ---------------------
+find_package(MPI)
+if(MPI_C_FOUND)
+ set(CMAKE_C_FLAGS "${MPI_C_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}" )
+endif()
+#--------------------- OpenMP ---------------------
+find_package(OpenMP)
+## include(FindOpenMP) # Strumpack uses this
+if(OPENMP_FOUND)
+ set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}")
+# On edison, OpenMP_EXE_LINKER_FLAGS is empty
+# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+ message("-- OpenMP_EXE_LINKER_FLAGS='${OpenMP_EXE_LINKER_FLAGS}'")
+ message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'")
+endif()
#--------------------- BLAS ---------------------
if(NOT enable_blaslib)
# set(TPL_BLAS_LIBRARIES "" CACHE FILEPATH
@@ -125,27 +147,12 @@ else()
add_subdirectory(CBLAS)
set(BLAS_LIB blas)
if (BUILD_SHARED_LIBS) # export to be referenced by downstream makefile
- set(BLAS_LIB_EXPORT ${CMAKE_SOURCE_DIR}/build/CBLAS/libblas.so)
+ set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.so)
else()
- set(BLAS_LIB_EXPORT ${CMAKE_SOURCE_DIR}/build/CBLAS/libblas.a)
+ set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.a)
endif()
endif()
-#--------------------- MPI ---------------------
-find_package(MPI)
-if(MPI_C_FOUND)
- set(CMAKE_C_FLAGS "${MPI_C_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
- set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}" )
-endif()
-#--------------------- OpenMP ---------------------
-find_package(OpenMP)
-if(OPENMP_FOUND)
- set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}")
-# On edison, OpenMP_EXE_LINKER_FLAGS is empty
-# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
- set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
-# message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'")
-endif()
#--------------------- ParMETIS ---------------------
if (enable_parmetislib) ## want to use parmetis
if (NOT TPL_PARMETIS_LIBRARIES)
@@ -197,16 +204,16 @@ include_directories(${MPI_C_INCLUDE_PATH})
add_subdirectory(SRC)
-if(enable_tests)
- enable_testing()
- add_subdirectory(TESTING)
-endif()
-
if(enable_doc)
message(FATAL_ERROR "Documentation build requested but not implemented.")
#implement doxygen
endif()
+if(enable_tests)
+ enable_testing()
+ add_subdirectory(TEST)
+endif()
+
if(enable_examples)
enable_testing()
add_subdirectory(EXAMPLE)
@@ -215,3 +222,9 @@ endif()
# file(WRITE "make.defs" "# can be exposed to users" ${CMAKE_C_COMPILER})
# configure_file(${CMAKE_SOURCE_DIR}/make.inc.in ${CMAKE_BINARY_DIR}/make.inc)
configure_file(${SuperLU_DIST_SOURCE_DIR}/make.inc.in ${SuperLU_DIST_SOURCE_DIR}/make.inc)
+configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h)
+
+# Add pkg-config support
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.pc.in ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
diff --git a/DOC/ug.pdf b/DOC/ug.pdf
deleted file mode 100644
index f854405..0000000
Binary files a/DOC/ug.pdf and /dev/null differ
diff --git a/DoxyConfig b/DoxyConfig
index 5bbc5a0..9760183 100644
--- a/DoxyConfig
+++ b/DoxyConfig
@@ -31,7 +31,7 @@ PROJECT_NAME = SuperLU Distributed
# This could be handy for archiving the generated documentation or
# if some version control system is used.
-PROJECT_NUMBER = 5.0.0
+PROJECT_NUMBER = 5.2.2
e
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.
@@ -513,7 +513,7 @@ WARN_LOGFILE =
# directories like "/usr/src/myproject". Separate the files or directories
# with spaces.
-INPUT = SRC/ EXAMPLE/ FORTRAN/
+INPUT = SRC/ EXAMPLE/ FORTRAN/ TEST/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
index 5eb7473..2ecb368 100644
--- a/EXAMPLE/CMakeLists.txt
+++ b/EXAMPLE/CMakeLists.txt
@@ -3,51 +3,52 @@ include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC)
# Libs linked to all of the examples
set(all_link_libs superlu_dist ${BLAS_LIB} m)
-function(add_superlu_dist_test target input nprow npcol)
- set(TEST_INPUT "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/${input}")
- set(TEST_OUTPUT "${SuperLU_DIST_BINARY_DIR}/EXAMPLE/${target}.out")
+function(add_superlu_dist_example target input nprow npcol)
+ set(EXAMPLE_INPUT "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/${input}")
+ set(EXAMPLE_OUTPUT "${SuperLU_DIST_BINARY_DIR}/EXAMPLE/${target}.out")
## get_target_property(TEST_LOC ${target} LOCATION)
- set(TEST_LOC ${CMAKE_CURRENT_BINARY_DIR})
+ set(EXAMPLE_LOC ${CMAKE_CURRENT_BINARY_DIR})
MATH( EXPR procs "${nprow}*${npcol}" )
# message("MPIEXEC_FLAG is ${MPIEXEC_NUMPROC_FLAG}")
# corresponding to mpiexec -n 4 pddrive -r <nprow> -c <npcol> g20.rua
add_test(${target} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${procs}
- ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r "${nprow}" -c "${npcol}" "${TEST_INPUT}")
+ ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r "${nprow}" -c "${npcol}" "${EXAMPLE_INPUT}")
+
# add_test(NAME ${target} COMMAND "${CMAKE_COMMAND}"
# -DTEST=${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${procs}
# ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r "${nprow}" -c "${npcol}" "${TEST_INPUT}"
# -DOUTPUT=${target}.out
# -P "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/runexample.cmake" )
-
# MPI variables:
# ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} PROCS
# ${MPIEXEC_PREFLAGS} EXECUTABLE ${MPIEXEC_POSTFLAGS} ARGS)
-endfunction(add_superlu_dist_test)
+endfunction(add_superlu_dist_example)
if(enable_double)
set(DEXM pddrive.c dcreate_matrix.c)
add_executable(pddrive ${DEXM})
target_link_libraries(pddrive ${all_link_libs})
- add_superlu_dist_test(pddrive big.rua 2 2)
set(DEXM1 pddrive1.c dcreate_matrix.c)
add_executable(pddrive1 ${DEXM1})
target_link_libraries(pddrive1 ${all_link_libs})
- add_superlu_dist_test(pddrive1 big.rua 2 2)
+ add_superlu_dist_example(pddrive1 big.rua 2 2)
set(DEXM2 pddrive2.c dcreate_matrix.c dcreate_matrix_perturbed.c)
add_executable(pddrive2 ${DEXM2})
target_link_libraries(pddrive2 ${all_link_libs})
+ add_superlu_dist_example(pddrive2 big.rua 2 2)
set(DEXM3 pddrive3.c dcreate_matrix.c)
add_executable(pddrive3 ${DEXM3})
target_link_libraries(pddrive3 ${all_link_libs})
+ add_superlu_dist_example(pddrive3 big.rua 2 2)
set(DEXM4 pddrive4.c dcreate_matrix.c)
add_executable(pddrive4 ${DEXM4})
@@ -84,14 +85,17 @@ if(enable_complex16)
set(ZEXM1 pzdrive1.c zcreate_matrix.c)
add_executable(pzdrive1 ${ZEXM1})
target_link_libraries(pzdrive1 ${all_link_libs})
+ add_superlu_dist_example(pzdrive1 cg20.cua 2 2)
set(ZEXM2 pzdrive2.c zcreate_matrix.c zcreate_matrix_perturbed.c)
add_executable(pzdrive2 ${ZEXM2})
target_link_libraries(pzdrive2 ${all_link_libs})
+ add_superlu_dist_example(pzdrive2 cg20.cua 2 2)
set(ZEXM3 pzdrive3.c zcreate_matrix.c)
add_executable(pzdrive3 ${ZEXM3})
target_link_libraries(pzdrive3 ${all_link_libs})
+ add_superlu_dist_example(pzdrive3 cg20.cua 2 2)
set(ZEXM4 pzdrive4.c zcreate_matrix.c)
add_executable(pzdrive4 ${ZEXM4})
diff --git a/EXAMPLE/README b/EXAMPLE/README
index f773812..7146acd 100644
--- a/EXAMPLE/README
+++ b/EXAMPLE/README
@@ -50,3 +50,5 @@ command.
4. To run the complex examples pzdrive4 and pzdrive4_ABglobal, you may type:
% mpiexec -n 10 pzdrive4 cg20.cua
+
+
diff --git a/EXAMPLE/dcreate_matrix.c b/EXAMPLE/dcreate_matrix.c
index 77292d7..a622463 100644
--- a/EXAMPLE/dcreate_matrix.c
+++ b/EXAMPLE/dcreate_matrix.c
@@ -89,9 +89,14 @@ int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs,
#endif
if ( !iam ) {
+ double t = SuperLU_timer_();
+
/* Read the matrix stored on disk in Harwell-Boeing format. */
dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+ printf("Time to read and distribute matrix %.2f\n",
+ SuperLU_timer_() - t); fflush(stdout);
+
/* Broadcast matrix A to the other PEs. */
MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm );
MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm );
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 3ebca24..750613c 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -63,7 +63,7 @@ int main(int argc, char *argv[])
int iam, info, ldb, ldx, nrhs;
char **cpp, c;
FILE *fp, *fopen();
- extern int cpp_defs();
+ int cpp_defs();
nprow = 1; /* Default process rows. */
npcol = 1; /* Default process columns. */
@@ -108,8 +108,13 @@ int main(int argc, char *argv[])
iam = grid.iam;
if ( iam >= nprow * npcol ) goto out;
if ( !iam ) {
- printf("Input matrix file: %s\n", *cpp);
- printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ int v_major, v_minor, v_bugfix;
+ superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+ printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+ printf("Input matrix file:\t%s\n", *cpp);
+ printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
}
#if ( VAMPIR>=1 )
@@ -138,7 +143,7 @@ int main(int argc, char *argv[])
options.ParSymbFact = NO;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.IterRefine = DOUBLE;
options.Trans = NOTRANS;
options.SolveInitialized = NO;
@@ -151,12 +156,13 @@ int main(int argc, char *argv[])
options.IterRefine = NOREFINE;
options.ColPerm = NATURAL;
options.Equil = NO;
- options.ReplaceTinyPivot = NO;
+ options.ReplaceTinyPivot = YES;
#endif
if (!iam) {
print_sp_ienv_dist(&options);
print_options_dist(&options);
+ fflush(stdout);
}
m = A.nrow;
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
index 9c01607..37e9ea8 100644
--- a/EXAMPLE/pddrive1.c
+++ b/EXAMPLE/pddrive1.c
@@ -60,6 +60,7 @@ int main(int argc, char *argv[])
int iam, info, ldb, ldx, nrhs;
char **cpp, c;
FILE *fp, *fopen();
+ int cpp_defs();
nprow = 1; /* Default process rows. */
npcol = 1; /* Default process columns. */
@@ -104,8 +105,13 @@ int main(int argc, char *argv[])
iam = grid.iam;
if ( iam >= nprow * npcol ) goto out;
if ( !iam ) {
- printf("Input matrix file: %s\n", *cpp);
- printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ int v_major, v_minor, v_bugfix;
+ superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+ printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+ printf("Input matrix file:\t%s\n", *cpp);
+ printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
}
#if ( VAMPIR>=1 )
@@ -137,7 +143,7 @@ int main(int argc, char *argv[])
options.Equil = YES;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.Trans = NOTRANS;
options.IterRefine = DOUBLE;
options.SolveInitialized = NO;
@@ -149,6 +155,7 @@ int main(int argc, char *argv[])
if (!iam) {
print_sp_ienv_dist(&options);
print_options_dist(&options);
+ fflush(stdout);
}
m = A.nrow;
diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c
index 0cf3191..e4d9508 100644
--- a/EXAMPLE/pddrive2.c
+++ b/EXAMPLE/pddrive2.c
@@ -64,6 +64,7 @@ int main(int argc, char *argv[])
int iam, info, ldb, ldx, nrhs;
char **cpp, c;
FILE *fp, *fopen();
+ int cpp_defs();
/* prototypes */
extern int dcreate_matrix_perturbed
@@ -113,8 +114,13 @@ int main(int argc, char *argv[])
iam = grid.iam;
if ( iam >= nprow * npcol ) goto out;
if ( !iam ) {
- printf("Input matrix file: %s\n", *cpp);
- printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+ int v_major, v_minor, v_bugfix;
+ superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+ printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+ printf("Input matrix file:\t%s\n", *cpp);
+ printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
}
#if ( DEBUGlevel>=1 )
@@ -142,7 +148,7 @@ int main(int argc, char *argv[])
options.Equil = YES;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.Trans = NOTRANS;
options.IterRefine = DOUBLE;
options.SolveInitialized = NO;
@@ -154,6 +160,7 @@ int main(int argc, char *argv[])
if (!iam) {
print_sp_ienv_dist(&options);
print_options_dist(&options);
+ fflush(stdout);
}
/* Initialize ScalePermstruct and LUstruct. */
diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c
index e591f39..2a971b1 100644
--- a/EXAMPLE/pddrive3.c
+++ b/EXAMPLE/pddrive3.c
@@ -69,6 +69,7 @@ int main(int argc, char *argv[])
int iam, info, ldb, ldx, nrhs;
char **cpp, c;
FILE *fp, *fopen();
+ int cpp_defs();
nprow = 1; /* Default process rows. */
npcol = 1; /* Default process columns. */
@@ -113,8 +114,13 @@ int main(int argc, char *argv[])
iam = grid.iam;
if ( iam >= nprow * npcol ) goto out;
if ( !iam ) {
- printf("Input matrix file: %s\n", *cpp);
- printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+ int v_major, v_minor, v_bugfix;
+ superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+ printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+ printf("Input matrix file:\t%s\n", *cpp);
+ printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
}
#if ( DEBUGlevel>=1 )
@@ -161,7 +167,7 @@ int main(int argc, char *argv[])
options.Equil = YES;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.Trans = NOTRANS;
options.IterRefine = DOUBLE;
options.SolveInitialized = NO;
@@ -173,6 +179,7 @@ int main(int argc, char *argv[])
if (!iam) {
print_sp_ienv_dist(&options);
print_options_dist(&options);
+ fflush(stdout);
}
/* Initialize ScalePermstruct and LUstruct. */
diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c
index d0192ec..1a03add 100644
--- a/EXAMPLE/pddrive4.c
+++ b/EXAMPLE/pddrive4.c
@@ -66,6 +66,7 @@ int main(int argc, char *argv[])
int nrhs = 1; /* Number of right-hand side. */
char **cpp, c;
FILE *fp, *fopen();
+ int cpp_defs();
/* ------------------------------------------------------------
@@ -153,7 +154,7 @@ int main(int argc, char *argv[])
options.Equil = YES;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.Trans = NOTRANS;
options.IterRefine = DOUBLE;
options.SolveInitialized = NO;
diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c
index 33e0a9d..b1785b8 100644
--- a/EXAMPLE/pzdrive.c
+++ b/EXAMPLE/pzdrive.c
@@ -62,7 +62,7 @@ int main(int argc, char *argv[])
int iam, info, ldb, ldx, nrhs;
char **cpp, c;
FILE *fp, *fopen();
- extern int cpp_defs();
+ int cpp_defs();
nprow = 1; /* Default process rows. */
npcol = 1; /* Default process columns. */
@@ -107,8 +107,13 @@ int main(int argc, char *argv[])
iam = grid.iam;
if ( iam >= nprow * npcol ) goto out;
if ( !iam ) {
- printf("Input matrix file: %s\n", *cpp);
- printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ int v_major, v_minor, v_bugfix;
+ superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+ printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+ printf("Input matrix file:\t%s\n", *cpp);
+ printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
}
#if ( VAMPIR>=1 )
@@ -137,7 +142,7 @@ int main(int argc, char *argv[])
options.ParSymbFact = NO;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.IterRefine = DOUBLE;
options.Trans = NOTRANS;
options.SolveInitialized = NO;
@@ -150,12 +155,13 @@ int main(int argc, char *argv[])
options.IterRefine = NOREFINE;
options.ColPerm = NATURAL;
options.Equil = NO;
- options.ReplaceTinyPivot = NO;
+ options.ReplaceTinyPivot = YES;
#endif
if (!iam) {
print_sp_ienv_dist(&options);
print_options_dist(&options);
+ fflush(stdout);
}
m = A.nrow;
diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c
index 402a133..50726ce 100644
--- a/EXAMPLE/pzdrive1.c
+++ b/EXAMPLE/pzdrive1.c
@@ -59,6 +59,7 @@ int main(int argc, char *argv[])
int iam, info, ldb, ldx, nrhs;
char **cpp, c;
FILE *fp, *fopen();
+ int cpp_defs();
nprow = 1; /* Default process rows. */
npcol = 1; /* Default process columns. */
@@ -103,8 +104,13 @@ int main(int argc, char *argv[])
iam = grid.iam;
if ( iam >= nprow * npcol ) goto out;
if ( !iam ) {
- printf("Input matrix file: %s\n", *cpp);
- printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ int v_major, v_minor, v_bugfix;
+ superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+ printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+ printf("Input matrix file:\t%s\n", *cpp);
+ printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
}
#if ( VAMPIR>=1 )
@@ -136,7 +142,7 @@ int main(int argc, char *argv[])
options.Equil = YES;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.Trans = NOTRANS;
options.IterRefine = DOUBLE;
options.SolveInitialized = NO;
@@ -148,6 +154,7 @@ int main(int argc, char *argv[])
if (!iam) {
print_sp_ienv_dist(&options);
print_options_dist(&options);
+ fflush(stdout);
}
m = A.nrow;
diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c
index b75f6ef..bf8f69c 100644
--- a/EXAMPLE/pzdrive2.c
+++ b/EXAMPLE/pzdrive2.c
@@ -63,6 +63,7 @@ int main(int argc, char *argv[])
int iam, info, ldb, ldx, nrhs;
char **cpp, c;
FILE *fp, *fopen();
+ int cpp_defs();
/* prototypes */
extern int zcreate_matrix_perturbed
@@ -112,8 +113,13 @@ int main(int argc, char *argv[])
iam = grid.iam;
if ( iam >= nprow * npcol ) goto out;
if ( !iam ) {
- printf("Input matrix file: %s\n", *cpp);
- printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+ int v_major, v_minor, v_bugfix;
+ superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+ printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+ printf("Input matrix file:\t%s\n", *cpp);
+ printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
}
#if ( DEBUGlevel>=1 )
@@ -141,7 +147,7 @@ int main(int argc, char *argv[])
options.Equil = YES;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.Trans = NOTRANS;
options.IterRefine = DOUBLE;
options.SolveInitialized = NO;
@@ -153,6 +159,7 @@ int main(int argc, char *argv[])
if (!iam) {
print_sp_ienv_dist(&options);
print_options_dist(&options);
+ fflush(stdout);
}
/* Initialize ScalePermstruct and LUstruct. */
diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c
index f251587..e086a77 100644
--- a/EXAMPLE/pzdrive3.c
+++ b/EXAMPLE/pzdrive3.c
@@ -68,6 +68,7 @@ int main(int argc, char *argv[])
int iam, info, ldb, ldx, nrhs;
char **cpp, c;
FILE *fp, *fopen();
+ int cpp_defs();
nprow = 1; /* Default process rows. */
npcol = 1; /* Default process columns. */
@@ -112,8 +113,13 @@ int main(int argc, char *argv[])
iam = grid.iam;
if ( iam >= nprow * npcol ) goto out;
if ( !iam ) {
- printf("Input matrix file: %s\n", *cpp);
- printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+ int v_major, v_minor, v_bugfix;
+ superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+ printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+ printf("Input matrix file:\t%s\n", *cpp);
+ printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
}
#if ( DEBUGlevel>=1 )
@@ -160,7 +166,7 @@ int main(int argc, char *argv[])
options.Equil = YES;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.Trans = NOTRANS;
options.IterRefine = DOUBLE;
options.SolveInitialized = NO;
@@ -172,6 +178,7 @@ int main(int argc, char *argv[])
if (!iam) {
print_sp_ienv_dist(&options);
print_options_dist(&options);
+ fflush(stdout);
}
/* Initialize ScalePermstruct and LUstruct. */
diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c
index 8a1caad..8d170d5 100644
--- a/EXAMPLE/pzdrive4.c
+++ b/EXAMPLE/pzdrive4.c
@@ -65,6 +65,7 @@ int main(int argc, char *argv[])
int nrhs = 1; /* Number of right-hand side. */
char **cpp, c;
FILE *fp, *fopen();
+ int cpp_defs();
/* ------------------------------------------------------------
@@ -152,7 +153,7 @@ int main(int argc, char *argv[])
options.Equil = YES;
options.ColPerm = METIS_AT_PLUS_A;
options.RowPerm = LargeDiag;
- options.ReplaceTinyPivot = YES;
+ options.ReplaceTinyPivot = NO;
options.Trans = NOTRANS;
options.IterRefine = DOUBLE;
options.SolveInitialized = NO;
diff --git a/EXAMPLE/zcreate_matrix.c b/EXAMPLE/zcreate_matrix.c
index 87774cf..8660143 100644
--- a/EXAMPLE/zcreate_matrix.c
+++ b/EXAMPLE/zcreate_matrix.c
@@ -88,9 +88,14 @@ int zcreate_matrix(SuperMatrix *A, int nrhs, doublecomplex **rhs,
#endif
if ( !iam ) {
+ double t = SuperLU_timer_();
+
/* Read the matrix stored on disk in Harwell-Boeing format. */
zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+ printf("Time to read and distribute matrix %.2f\n",
+ SuperLU_timer_() - t); fflush(stdout);
+
/* Broadcast matrix A to the other PEs. */
MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm );
MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm );
diff --git a/INSTALL/superlu_timer.c b/INSTALL/superlu_timer.c
deleted file mode 100644
index 3a2ffcc..0000000
--- a/INSTALL/superlu_timer.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Purpose
- * =======
- * Returns the time in seconds used by the process.
- *
- * Note: the timer function call is machine dependent. Use conditional
- * compilation to choose the appropriate function.
- *
- */
-
-
-#ifdef SUN
-/*
- * It uses the system call gethrtime(3C), which is accurate to
- * nanoseconds.
-*/
-#include <sys/time.h>
-
-double SuperLU_timer_() {
- return ( (double)gethrtime() / 1e9 );
-}
-
-#elif defined ( UNIX_TIMER )
-
-#include <sys/types.h>
-#include <sys/times.h>
-#include <time.h>
-#include <sys/time.h>
-
-#ifndef CLK_TCK
-#define CLK_TCK 60
-#endif
-
-double SuperLU_timer_()
-{
- struct tms use;
- double tmp;
- times(&use);
- tmp = use.tms_utime;
- tmp += use.tms_stime;
- return (double)(tmp) / (double) CLK_TCK;
-}
-
-#else
-
-#include <mpi.h>
-
-double SuperLU_timer_()
-{
- return MPI_Wtime();
-}
-
-#endif
-
diff --git a/MAKE_INC/make.cuda_gpu b/MAKE_INC/make.cuda_gpu
index 2e8b8a8..4ae0eac 100644
--- a/MAKE_INC/make.cuda_gpu
+++ b/MAKE_INC/make.cuda_gpu
@@ -62,9 +62,9 @@ RANLIB = ranlib
CC = mpicc
# CFLAGS should be set to be the C flags that include optimization
CFLAGS = ${CUDA_FLAGS} ${INCS} -std=c99 -O3 -Wall -w2 -openmp -mkl \
- -DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0 \
-# -D_LONGINT
+ -DDEBUGlevel=0 -DPRNTlevel=1 -DPROFlevel=0
# -Wunused-variable
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
#
# NOOPTS should be set to be the C flags that turn off any optimization
NOOPTS = -O0
diff --git a/MAKE_INC/make.mac-x b/MAKE_INC/make.mac-x
new file mode 100644
index 0000000..8beb9a3
--- /dev/null
+++ b/MAKE_INC/make.mac-x
@@ -0,0 +1,43 @@
+############################################################################
+#
+# Program: SuperLU_DIST
+#
+# Module: make.inc
+#
+# Purpose: Top-level Definitions
+#
+# Creation date: March 1, 2016 version 5.0.0
+#
+# Modified:
+#
+#
+############################################################################
+#
+# The name of the libraries to be created/linked to
+#
+SuperLUroot = /Users/xsli/Dropbox/Codes/SuperLU/superlu_dist.git/
+DSUPERLULIB = $(SuperLUroot)/lib/libsuperlu_dist.a
+
+BLASLIB = $(SuperLUroot)/CBLAS/libblas.a
+
+LIBS = $(DSUPERLULIB) ${BLASLIB} /Users/xsli/lib/parmetis-4.0.3/build/Darwin-x86_64/libparmetis/libparmetis.a /Users/xsli/lib/parmetis-4.0.3/build/Darwin-x86_64/libmetis/libmetis.a
+
+#
+# The archiver and the flag(s) to use when building archive (library)
+# If your system has no ranlib, set RANLIB = echo.
+#
+ARCH = /usr/bin/ar
+ARCHFLAGS = cr
+RANLIB = /usr/bin/ranlib
+
+CC = /Users/xsli/lib/mpich2-install/bin/mpicc
+CFLAGS = -O3 -DNDEBUG -I/Users/xsli/lib/parmetis-4.0.3/metis/include -I/Users/xsli/lib/parmetis-4.0.3/include -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -g
+#CFLAGS += -openmp
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
+# CFLAGS +=
+NOOPTS = -O0
+FORTRAN = /usr/local/bin/gfortran
+
+LOADER = $(CC)
+LOADOPTS = -openmp
+# LOADOPTS = -Wl,-rpath,/Users/xsli/Dropbox/Codes/SuperLU/superlu_dist.git/xsli-build/lib
diff --git a/MAKE_INC/make.mpich b/MAKE_INC/make.mpich
index 559a086..db3b92c 100644
--- a/MAKE_INC/make.mpich
+++ b/MAKE_INC/make.mpich
@@ -39,7 +39,7 @@ RANLIB = /usr/bin/ranlib
CC = /home/xiaoye/mpich-install/bin/mpicc
CFLAGS = -DNDEBUG -DUSE_VENDOR_BLAS -DAdd_ -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -fPIC -g ${I_PARMETIS}
-# CFLAGS += -D_LONGINT
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
# CFLAGS +=
NOOPTS = -O0
FORTRAN = /usr/bin/gfortran
diff --git a/MAKE_INC/make.mpich b/MAKE_INC/make.ssg1
similarity index 55%
copy from MAKE_INC/make.mpich
copy to MAKE_INC/make.ssg1
index 559a086..30f86b6 100644
--- a/MAKE_INC/make.mpich
+++ b/MAKE_INC/make.ssg1
@@ -15,19 +15,10 @@
#
# The name of the libraries to be created/linked to
#
-VERSION = 5.1.3
-SuperLUroot = /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}
+SuperLUroot = /home/xiaoye/Dropbox/Codes/SuperLU/superlu_dist.git
DSUPERLULIB = $(SuperLUroot)/lib/libsuperlu_dist.a
-# BLASDEF = -DUSE_VENDOR_BLAS
-
-PARMETIS_DIR := ${HOME}/lib/static/parmetis-4.0.3
-I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
-METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
-PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
-
-LIBS = $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so \
- ${PARMETISLIB} ${METISLIB}
+LIBS = $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so /home/xiaoye/lib/static/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a /home/xiaoye/lib/static/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a
#
# The archiver and the flag(s) to use when building archive (library)
@@ -38,11 +29,10 @@ ARCHFLAGS = cr
RANLIB = /usr/bin/ranlib
CC = /home/xiaoye/mpich-install/bin/mpicc
-CFLAGS = -DNDEBUG -DUSE_VENDOR_BLAS -DAdd_ -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -fPIC -g ${I_PARMETIS}
-# CFLAGS += -D_LONGINT
-# CFLAGS +=
+CFLAGS = -O3 -DNDEBUG -I/home/xiaoye/lib/static/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/parmetis-4.0.3/include -fopenmp -DUSE_VENDOR_BLAS -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -g
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
NOOPTS = -O0
FORTRAN = /usr/bin/gfortran
LOADER = $(CC)
-LOADOPTS = -Wl,-rpath=/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}/lib -g # -Wl,-Bdynamic
+LOADOPTS = -Wl,-rpath,/home/xiaoye/Dropbox/Codes/SuperLU/superlu_dist.git/lib -Wl,-rpath -Wl,/home/xiaoye/mpich-install/lib -Wl,--enable-new-dtags -fopenmp
diff --git a/MAKE_INC/make.xc30 b/MAKE_INC/make.xc30
index dba42bb..06131b8 100644
--- a/MAKE_INC/make.xc30
+++ b/MAKE_INC/make.xc30
@@ -62,7 +62,7 @@ CC = cc
CFLAGS = -fast -m64 -std=c99 -Wall -openmp \
$(I_PARMETIS) -DDEBUGlevel=0 -DPRNTlevel=0 -DPROFlevel=0 \
# uncomment the following to use 64-bit integer
-# CFLAGS += -D_LONGINT
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
# NOOPTS should be set to be the C flags that turn off any optimization
NOOPTS = -O0 -std=c99
diff --git a/MAKE_INC/make.xt5 b/MAKE_INC/make.xt5
index 926d28e..1365a6e 100644
--- a/MAKE_INC/make.xt5
+++ b/MAKE_INC/make.xt5
@@ -59,7 +59,8 @@ RANLIB = ranlib
CC = cc
INCS = $(I_PARMETIS)
# CFLAGS should be set to be the C flags that include optimization
-CFLAGS = ${INCS} -c99 -fastsse -DDEBUGlevel=0 -DPRNTlevel=1 #-D_LONGINT
+CFLAGS = ${INCS} -c99 -fastsse -DDEBUGlevel=0 -DPRNTlevel=1
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
#
# NOOPTS should be set to be the C flags that turn off any optimization
NOOPTS = -O0
diff --git a/Makefile b/Makefile
index 7717442..4ef799d 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@
include make.inc
-all: install lib example
+all: lib install example
lib: superlulib
@@ -43,3 +43,4 @@ cleantesting:
( cd INSTALL; $(MAKE) clean )
( cd EXAMPLE; $(MAKE) clean )
( cd FORTRAN; $(MAKE) clean )
+ ( cd TEST; $(MAKE) clean )
diff --git a/README b/README
deleted file mode 100644
index 2cfbba7..0000000
--- a/README
+++ /dev/null
@@ -1,251 +0,0 @@
- SuperLU_DIST (version 5.1)
- ============================
-
-SuperLU_DIST contains a set of subroutines to solve a sparse linear system
-A*X=B. It uses Gaussian elimination with static pivoting (GESP).
-Static pivoting is a technique that combines the numerical stability of
-partial pivoting with the scalability of Cholesky (no pivoting),
-to run accurately and efficiently on large numbers of processors.
-
-SuperLU_DIST is a parallel extension to the serial SuperLU library.
-It is targeted for the distributed memory parallel machines.
-SuperLU_DIST is implemented in ANSI C, and MPI for communications.
-Currently, the LU factorization and triangular solution routines,
-which are the most time-consuming part of the solution process,
-are parallelized. The other routines, such as static pivoting and
-column preordering for sparsity are performed sequentially.
-This "alpha" release contains double-precision real and double-precision
-complex data types.
-
-The distribution contains the following directory structure:
-
- SuperLU_DIST/README instructions on installation
- SuperLU_DIST/CBLAS/ needed BLAS routines in C, not necessarily fast
- SuperLU_DIST/DOC/ the Users' Guide
- SuperLU_DIST/EXAMPLE/ example programs
- SuperLU_DIST/INSTALL/ test machine dependent parameters
- SuperLU_DIST/SRC/ C source code, to be compiled into libsuperlu_dist.a
- SuperLU_DIST/lib/ contains library archive libsuperlu_dist.a
- SuperLU_DIST/Makefile top level Makefile that does installation and testing
- SuperLU_DIST/make.inc compiler, compiler flags, library definitions and C
- preprocessor definitions, included in all Makefiles.
- (You may need to edit it to suit for your system
- before compiling the whole package.)
- SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
-
-
-----------------
-| INSTALLATION |
-----------------
-
-There are two ways to install the package. One requires users to
-edit makefile manually, the other uses CMake build system.
-The procedures are described below.
-
-1. Manual installation with makefile.
- Before installing the package, please examine the three things dependent
- on your system setup:
-
- 1.1 Edit the make.inc include file.
-
- This make include file is referenced inside each of the Makefiles
- in the various subdirectories. As a result, there is no need to
- edit the Makefiles in the subdirectories. All information that is
- machine specific has been defined in this include file.
-
- Sample machine-specific make.inc are provided in the MAKE_INC/
- directory for several platforms, such as Cray XT5 and IBM SP.
- When you have selected the machine to which you wish to install
- SuperLU_DIST, copy the appropriate sample include file
- (if one is present) into make.inc.
- For example, if you wish to run SuperLU_DIST on a Cray XT5, you can do
-
- cp MAKE_INC/make.xc30 make.inc
-
- For the systems other than listed above, some porting effort is needed
- for parallel factorization routines. Please refer to the Users' Guide
- for detailed instructions on porting.
-
- The following CPP definitions can be set in CFLAGS.
- o -D_LONGINT
- use 64-bit integers for indexing sparse matrices. (default 32 bit)
-
- o -DPRNTlevel=[0,1,2,...]
- printing level to show solver's execution details. (default 0)
-
- o -DDEBUGlevel=[0,1,2,...]
- diagnostic printing level for debugging purpose. (default 0)
-
-
- 1.2. The BLAS library.
-
- The parallel routines in SuperLU_DIST uses some sequential BLAS routines
- on each process. If there is BLAS library available on your machine,
- you may define the following in the file make.inc:
- BLASDEF = -DUSE_VENDOR_BLAS
- BLASLIB = <BLAS library you wish to link with>
-
- The CBLAS/ subdirectory contains the part of the C BLAS needed by
- SuperLU_DIST package. However, these codes are intended for use
- only if there is no faster implementation of the BLAS already
- available on your machine. In this case, you should go to the
- top-level SuperLU_DIST/ directory and do the following:
-
- 1) In make.inc, undefine (comment out) BLASDEF, and define:
- BLASLIB = ../lib/libblas$(PLAT).a
-
- 2) Type: make blaslib
- to make the BLAS library from the routines in the
- CBLAS/ subdirectory.
-
-
- 1.3. External libraries: Metis and ParMetis.
-
- If you will use Metis or ParMetis ordering, you will
- need to install them yourself. Since ParMetis package already
- contains the source code for the Metis library, you can just
- download and compile ParMetis from:
- http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download
-
- After you have installed it, you should define the following in make.inc:
- METISLIB = -L<metis directory> -lmetis
- PARMETISLIB = -L<parmetis directory> -lparmetis
- I_PARMETIS = -I<parmetis directory>/include -I<parmetis directory>/metis/include
-
- 1.4. C preprocessor definition CDEFS.
-
- In the header file SRC/Cnames.h, we use macros to determine how
- C routines should be named so that they are callable by Fortran.
- (Some vendor-supplied BLAS libraries do not have C interfaces. So the
- re-naming is needed in order for the SuperLU BLAS calls (in C) to
- interface with the Fortran-style BLAS.)
- The possible options for CDEFS are:
-
- o -DAdd_: Fortran expects a C routine to have an underscore
- postfixed to the name;
- (This is set as the default)
- o -DNoChange: Fortran expects a C routine name to be identical to
- that compiled by C;
- o -DUpCase: Fortran expects a C routine name to be all uppercase.
-
- 1.5. Multicore and GPU (optional).
-
- To use OpenMP parallelism, need to compile the code with the
- following CPP definition:
-
- -D_OPENMP
-
- and set the number of threads to be used as follows:
-
- setenv OMP_NUM_THREADS <##>
-
- To enable Nvidia GPU access, need to take the following 2 step:
- 1) set the following Linux environment variable:
-
- setenv ACC GPU
-
- 2) Add the CUDA library location in make.inc:
-
- ifeq "${ACC}" "GPU"
- CFLAGS += -DGPU_ACC
- INCS += -I<CUDA directory>/include
- LIBS += -L<CUDA directory>/lib64 -lcublas -lcudart
- endif
-
- A Makefile is provided in each subdirectory. The installation can be done
- completely automatically by simply typing "make" at the top level.
-
-2. Using CMake build system.
- You will need to create a build tree from which to invoke CMake.
-
- First, in order to use parallel symbolic factorization function, you
- need to install ParMETIS parallel ordering package, and define the
- two environment variables: PARMETIS_ROOT and PARMETIS_BUILD_DIR
-
- setenv PARMETIS_ROOT <Prefix directory of the ParMETIS installation>
- setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64
-
- Then, the installation procedure is the following.
-
- From the top level directory, do:
-
- mkdir build ; cd build
- cmake .. \
- -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
- -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include"
-
- ( example:
- setenv PARMETIS_ROOT ~/lib/dynamic/parmetis-4.0.3
- setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64
- cmake .. \
- -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
- -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
- -DCMAKE_C_FLAGS="-std=c99 -g" \
- -Denable_blaslib=OFF \
- -DBUILD_SHARED_LIBS=OFF \
- -DCMAKE_C_COMPILER=mpicc \
- -DCMAKE_INSTALL_PREFIX=..
- )
-
- To actually build, type:
- make
-
- To install the libraries, type:
- make install
-
- To run the installation test, type:
- make test
- (The outputs are in file: build/Testing/Temporary/LastTest.log)
-
-
- ++++++++
- Note on the C-Fortran name mangling handled by C preprocessor definition:
- ++++++++
- In the default setting, we assume that Fortran expects a C routine
- to have an underscore postfixed to the name. Depending on the
- compiler, you may need to define one of the following flags in
- during the cmake build to overwrite default setting:
-
- cmake .. -DCMAKE_C_FLAGS="-DNoChange"
-
- cmake .. -DCMAKE_C_FLAGS="-DUpCase"
-
-
---------------
-| REFERENCES |
---------------
-
-[1] SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for
- Unsymmetric Linear Systems. Xiaoye S. Li and James W. Demmel.
- ACM Trans. on Math. Solftware, Vol. 29, No. 2, June 2003, pp. 110-140.
-[2] Parallel Symbolic Factorization for Sparse LU with Static Pivoting.
- L. Grigori, J. Demmel and X.S. Li. SIAM J. Sci. Comp., Vol. 29, Issue 3,
- 1289-1314, 2007.
-[3] A distributed CPU-GPU sparse direct solver. P. Sao, R. Vuduc and X.S. Li,
- Proc. of EuroPar-2014 Parallel Processing, August 25-29, 2014.
- Porto, Portugal.
-
-Xiaoye S. Li Lawrence Berkeley National Lab, xsli at lbl.gov
-Laura Grigori INRIA, France, Laura.Grigori at inria.fr
-Piyush Sao Georgia Institute of Technology, piyush.feynman at gmail.com
-Ichitaro Yamazaki Univ. of Tennessee, ic.yamazaki at gmail.com
-
---------------------
-| RELEASE VERSIONS |
---------------------
-
- October 15, 2003 Version 2.0
- October 1, 2007 Version 2.1
- Feburary 20, 2008 Version 2.2
- October 15, 2008 Version 2.3
- June 9, 2010 Version 2.4
- November 23, 2010 Version 2.5
- March 31, 2013 Version 3.3
- October 1, 2014 Version 4.0
- July 15, 2014 Version 4.1
- September 25, 2015 Version 4.2
- December 31, 2015 Version 4.3
- April 8, 2016 Version 5.0.0
- May 15, 2016 Version 5.1.0
- October 4, 2016 Version 5.1.1
- December 31, 2016 Version 5.1.3
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..47ca0fa
--- /dev/null
+++ b/README.md
@@ -0,0 +1,274 @@
+# SuperLU_DIST (version 5.2)
+
+[](https://travis-ci.org/xiaoyeli/superlu_dist)
+[Nightly tests](http://my.cdash.org/index.php?project=superlu_dist)
+
+SuperLU_DIST contains a set of subroutines to solve a sparse linear system
+A*X=B. It uses Gaussian elimination with static pivoting (GESP).
+Static pivoting is a technique that combines the numerical stability of
+partial pivoting with the scalability of Cholesky (no pivoting),
+to run accurately and efficiently on large numbers of processors.
+
+SuperLU_DIST is a parallel extension to the serial SuperLU library.
+It is targeted for the distributed memory parallel machines.
+SuperLU_DIST is implemented in ANSI C, and MPI for communications.
+Currently, the LU factorization and triangular solution routines,
+which are the most time-consuming part of the solution process,
+are parallelized. The other routines, such as static pivoting and
+column preordering for sparsity are performed sequentially.
+This "alpha" release contains double-precision real and double-precision
+complex data types.
+
+### The distribution contains the following directory structure:
+
+```
+SuperLU_DIST/README instructions on installation
+SuperLU_DIST/CBLAS/ needed BLAS routines in C, not necessarily fast
+ (NOTE: this version is single threaded. If you use the
+ library with multiple OpenMP threads, performance
+ relies on a good multithreaded BLAS implementation.)
+SuperLU_DIST/DOC/ the Users' Guide
+SuperLU_DIST/EXAMPLE/ example programs
+SuperLU_DIST/INSTALL/ test machine dependent parameters
+SuperLU_DIST/SRC/ C source code, to be compiled into libsuperlu_dist.a
+SuperLU_DIST/TEST/ testing code
+SuperLU_DIST/lib/ contains library archive libsuperlu_dist.a
+SuperLU_DIST/Makefile top-level Makefile that does installation and testing
+SuperLU_DIST/make.inc compiler, compiler flags, library definitions and C
+ preprocessor definitions, included in all Makefiles.
+ (You may need to edit it to suit your system
+ before compiling the whole package.)
+SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
+```
+
+## INSTALLATION
+
+There are two ways to install the package. One requires users to
+edit makefile manually, the other uses CMake build system.
+The procedures are described below.
+
+### Installation option 1: Manual installation with makefile.
+Before installing the package, please examine the three things dependent
+on your system setup:
+
+1.1 Edit the make.inc include file.
+
+This make include file is referenced inside each of the Makefiles
+in the various subdirectories. As a result, there is no need to
+edit the Makefiles in the subdirectories. All information that is
+machine specific has been defined in this include file.
+
+Sample machine-specific make.inc are provided in the MAKE_INC/
+directory for several platforms, such as Cray XT5, Linux, Mac-OS, and CUDA.
+When you have selected the machine to which you wish to install
+SuperLU_DIST, copy the appropriate sample include file
+(if one is present) into make.inc.
+
+For example, if you wish to run SuperLU_DIST on a Cray XT5, you can do
+
+`cp MAKE_INC/make.xt5 make.inc`
+
+For the systems other than listed above, some porting effort is needed
+for parallel factorization routines. Please refer to the Users' Guide
+for detailed instructions on porting.
+
+The following CPP definitions can be set in CFLAGS.
+
+```
+-DXSDK_INDEX_SIZE=64
+use 64-bit integers for indexing sparse matrices. (default 32 bit)
+
+-DPRNTlevel=[0,1,2,...]
+printing level to show solver's execution details. (default 0)
+
+-DDEBUGlevel=[0,1,2,...]
+diagnostic printing level for debugging purpose. (default 0)
+```
+
+1.2. The BLAS library.
+The parallel routines in SuperLU_DIST use some BLAS routines on each MPI
+process. Moreover, if you enable OpenMP with multiple threads, you need to
+link with a multithreaded BLAS library. Otherwise performance will be poor.
+A good public domain BLAS library is OpenBLAS (http://www.openblas.net),
+which has OpenMP support.
+
+If you have a BLAS library on your machine, you may define the following in
+the file make.inc:
+
+```
+BLASDEF = -DUSE_VENDOR_BLAS
+BLASLIB = <BLAS library you wish to link with>
+```
+
+The CBLAS/ subdirectory contains the part of the C BLAS (single threaded)
+needed by SuperLU_DIST package. However, these codes are intended for use
+only if there is no faster implementation of the BLAS already
+available on your machine. In this case, you should go to the
+top-level SuperLU_DIST/ directory and do the following:
+
+1) In make.inc, undefine (comment out) BLASDEF, and define:
+` BLASLIB = ../lib/libblas$(PLAT).a`
+
+2) Type: `make blaslib`
+to make the BLAS library from the routines in the
+` CBLAS/ subdirectory.`
+
+1.3. External libraries: Metis and ParMetis.
+
+If you will use Metis or ParMetis ordering, you will
+need to install them yourself. Since ParMetis package already
+contains the source code for the Metis library, you can just
+download and compile ParMetis from:
+[http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download](http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download)
+
+After you have installed it, you should define the following in make.inc:
+```
+METISLIB = -L<metis directory> -lmetis
+PARMETISLIB = -L<parmetis directory> -lparmetis
+I_PARMETIS = -I<parmetis directory>/include -I<parmetis directory>/metis/include
+```
+1.4. C preprocessor definition CDEFS.
+In the header file SRC/Cnames.h, we use macros to determine how
+C routines should be named so that they are callable by Fortran.
+(Some vendor-supplied BLAS libraries do not have C interfaces. So the
+re-naming is needed in order for the SuperLU BLAS calls (in C) to
+interface with the Fortran-style BLAS.)
+The possible options for CDEFS are:
+
+`-DAdd_`: Fortran expects a C routine to have an underscore
+ postfixed to the name;
+ (This is set as the default)
+`-DNoChange`: Fortran expects a C routine name to be identical to
+ that compiled by C;
+`-DUpCase`: Fortran expects a C routine name to be all uppercase.
+
+1.5. Multicore and GPU (optional).
+
+To use OpenMP parallelism, need to link with an OpenMP library, and
+set the number of threads you wish to use as follows (bash):
+`export OMP_NUM_THREADS=<##>`
+
+To enable NVIDIA GPU access, you need to take the following 2 steps:
+1) Set the following Linux environment variable:
+`export ACC=GPU`
+
+2) Add the CUDA library location in make.inc:
+```
+ifeq "${ACC}" "GPU"
+CFLAGS += -DGPU_ACC
+INCS += -I<CUDA directory>/include
+LIBS += -L<CUDA directory>/lib64 -lcublas -lcudart
+endif
+```
+A Makefile is provided in each subdirectory. The installation can be done
+completely automatically by simply typing "make" at the top level.
+
+### Installation option 2: Using CMake build system.
+You will need to create a build tree from which to invoke CMake.
+
+First, in order to use parallel symbolic factorization function, you
+need to install ParMETIS parallel ordering package and define the
+two environment variables: PARMETIS_ROOT and PARMETIS_BUILD_DIR
+
+```
+export PARMETIS_ROOT=<Prefix directory of the ParMETIS installation>
+export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
+```
+Then, the installation procedure is the following.
+
+From the top level directory, do:
+```
+mkdir build ; cd build
+cmake .. \
+-DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
+-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include"
+
+( Example cmake script: see run_cmake_build.sh
+
+export PARMETIS_ROOT=~/lib/dynamic/parmetis-4.0.3
+export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
+cmake .. \
+-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
+-DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
+-DCMAKE_C_FLAGS="-std=c99 -g" \
+-Denable_blaslib=OFF \
+-DBUILD_SHARED_LIBS=OFF \
+-DCMAKE_C_COMPILER=mpicc \
+-DCMAKE_INSTALL_PREFIX=.
+
+)
+```
+To actually build, type:
+`make`
+
+To install the libraries, type:
+`make install`
+
+To run the installation test, type:
+`ctest`
+(The outputs are in file: `build/Testing/Temporary/LastTest.log`)
+or,
+`ctest -D Experimental`
+or,
+`ctest -D Nightly`
+
+**NOTE:**
+The parallel execution in ctest is invoked by "mpiexec" command which is
+from MPICH environment. If your MPI is not MPICH/mpiexec based, the test
+execution may fail. You can always go to TEST/ directory to perform
+testing manually.
+
+**Note on the C-Fortran name mangling handled by C preprocessor definition:**
+In the default setting, we assume that Fortran expects a C routine
+to have an underscore postfixed to the name. Depending on the
+compiler, you may need to define one of the following flags in
+during the cmake build to overwrite default setting:
+
+```
+cmake .. -DCMAKE_C_FLAGS="-DNoChange"
+cmake .. -DCMAKE_C_FLAGS="-DUpCase"
+```
+
+## READING SPARSE MATRIX FILES
+
+The SRC/ directory contains the following routines to read different file
+formats; they all have a similar calling sequence.
+```
+$ ls -l dread*.c
+dreadMM.c : Matrix Market, files with suffix .mtx
+dreadhb.c : Harwell-Boeing, files with suffix .rua
+dreadrb.c : Rutherford-Boeing, files with suffix .rb
+dreadtriple.c : triplet, with header
+dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab
+```
+
+## REFERENCES
+
+**[1]** SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for Unsymmetric Linear Systems. Xiaoye S. Li and James W. Demmel. ACM Trans. on Math. Software, Vol. 29, No. 2, June 2003, pp. 110-140.
+**[2]** Parallel Symbolic Factorization for Sparse LU with Static Pivoting. L. Grigori, J. Demmel and X.S. Li. SIAM J. Sci. Comp., Vol. 29, Issue 3, 1289-1314, 2007.
+**[3]** A distributed CPU-GPU sparse direct solver. P. Sao, R. Vuduc and X.S. Li, Proc. of EuroPar-2014 Parallel Processing, August 25-29, 2014. Porto, Portugal.
+
+**Xiaoye S. Li**, Lawrence Berkeley National Lab, [xsli at lbl.gov](xsli at lbl.gov)
+**Laura Grigori**, INRIA, France, [laura.grigori at inria.fr](laura.grigori at inria.fr)
+**Piyush Sao**, Georgia Institute of Technology, [piyush.feynman at gmail.com](piyush.feynman at gmail.com)
+**Ichitaro Yamazaki**, Univ. of Tennessee, [ic.yamazaki at gmail.com](ic.yamazaki at gmail.com)
+
+## RELEASE VERSIONS
+```
+October 15, 2003 Version 2.0
+October 1, 2007 Version 2.1
+February 20, 2008 Version 2.2
+October 15, 2008 Version 2.3
+June 9, 2010 Version 2.4
+November 23, 2010 Version 2.5
+March 31, 2013 Version 3.3
+October 1, 2014 Version 4.0
+July 15, 2014 Version 4.1
+September 25, 2015 Version 4.2
+December 31, 2015 Version 4.3
+April 8, 2016 Version 5.0.0
+May 15, 2016 Version 5.1.0
+October 4, 2016 Version 5.1.1
+December 31, 2016 Version 5.1.3
+September 30, 2017 Version 5.2.0
+```
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index b8341c9..36b55d1 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -8,6 +8,8 @@ set(headers
superlu_enum_consts.h
supermatrix.h
util_dist.h
+ colamd.h
+ superlu_dist_config.h
)
# first: precision-independent files
@@ -32,6 +34,8 @@ set(sources
xerr_dist.c
smach_dist.c
dmach_dist.c
+ colamd.c
+ superlu_dist_version.c
)
set_source_files_properties(superlu_timer.c PROPERTIES COMPILE_FLAGS -O0)
@@ -123,5 +127,14 @@ set_target_properties(superlu_dist PROPERTIES
VERSION ${PROJECT_VERSION} SOVERSION ${VERSION_MAJOR}
)
-install(TARGETS superlu_dist DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
-install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
+# Define GNU standard installation directories
+include(GNUInstallDirs)
+
+install(TARGETS superlu_dist
+# DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
+install(FILES ${headers}
+# DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
diff --git a/SRC/Makefile b/SRC/Makefile
index c78083d..532274e 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -30,10 +30,11 @@ include ../make.inc
# Precision independent routines
#
ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \
- mmd.o comm.o memory.o util.o superlu_grid.o \
+ colamd.o mmd.o comm.o memory.o util.o superlu_grid.o \
pxerr_dist.o superlu_timer.o symbfact.o \
psymbfact.o psymbfact_util.o get_perm_c_parmetis.o mc64ad_dist.o \
- static_schedule.o xerr_dist.o smach_dist.o dmach_dist.o
+ static_schedule.o xerr_dist.o smach_dist.o dmach_dist.o \
+ superlu_dist_version.o
ifeq "${ACC}" "GPU"
ALLAUX += cublas_utils.o
@@ -70,16 +71,29 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \
all: double complex16
-double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX)
+config_h:
+ifeq ($(XSDK_INDEX_SIZE),64)
+ printf "#define XSDK_INDEX_SIZE 64\n" > superlu_dist_config.h
+else
+ printf "/* #define XSDK_INDEX_SIZE 64 */\n" > superlu_dist_config.h
+endif
+ printf "#if (XSDK_INDEX_SIZE == 64)\n#define _LONGINT 1\n#endif\n" >> superlu_dist_config.h
+
+double: config_h $(DSLUSRC) $(DPLUSRC) $(ALLAUX)
$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
$(DSLUSRC) $(DPLUSRC) $(ALLAUX)
$(RANLIB) $(DSUPERLULIB)
-complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX)
+complex16: config_h $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX)
$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
$(ZSLUSRC) $(ZPLUSRC) $(ALLAUX)
$(RANLIB) $(DSUPERLULIB)
+pdgstrf.o: dscatter.c dlook_ahead_update.c dSchCompUdt-2Ddynamic.c pdgstrf.c
+ $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c pdgstrf.c $(VERBOSE)
+
+pzgstrf.o: zscatter.c zlook_ahead_update.c zSchCompUdt-2Ddynamic.c pzgstrf.c
+ $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c pzgstrf.c $(VERBOSE)
.c.o:
$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE)
diff --git a/SRC/colamd.c b/SRC/colamd.c
new file mode 100644
index 0000000..5500e68
--- /dev/null
+++ b/SRC/colamd.c
@@ -0,0 +1,3424 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file colamd.c
+ *\brief A sparse matrix column ordering algorithm
+
+ <pre>
+ ==========================================================================
+ === colamd/symamd - a sparse matrix column ordering algorithm ============
+ ==========================================================================
+
+
+ colamd: an approximate minimum degree column ordering algorithm,
+ for LU factorization of symmetric or unsymmetric matrices,
+ QR factorization, least squares, interior point methods for
+ linear programming problems, and other related problems.
+
+ symamd: an approximate minimum degree ordering algorithm for Cholesky
+ factorization of symmetric matrices.
+
+ Purpose:
+
+ Colamd computes a permutation Q such that the Cholesky factorization of
+ (AQ)'(AQ) has less fill-in and requires fewer floating point operations
+ than A'A. This also provides a good ordering for sparse partial
+ pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
+ factorization, and P is computed during numerical factorization via
+ conventional partial pivoting with row interchanges. Colamd is the
+ column ordering method used in SuperLU, part of the ScaLAPACK library.
+ It is also available as built-in function in MATLAB Version 6,
+ available from MathWorks, Inc. (http://www.mathworks.com). This
+ routine can be used in place of colmmd in MATLAB.
+
+ Symamd computes a permutation P of a symmetric matrix A such that the
+ Cholesky factorization of PAP' has less fill-in and requires fewer
+ floating point operations than A. Symamd constructs a matrix M such
+ that M'M has the same nonzero pattern of A, and then orders the columns
+ of M using colamd. The column ordering of M is then returned as the
+ row and column ordering P of A.
+
+ Authors:
+
+ The authors of the code itself are Stefan I. Larimore and Timothy A.
+ Davis (davis at cise.ufl.edu), University of Florida. The algorithm was
+ developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+ Ng, Oak Ridge National Laboratory.
+
+ Date:
+
+ September 8, 2003. Version 2.3.
+
+ Acknowledgements:
+
+ This work was supported by the National Science Foundation, under
+ grants DMS-9504974 and DMS-9803599.
+
+ Copyright and License:
+
+ Copyright (c) 1998-2003 by the University of Florida.
+ All Rights Reserved.
+
+ THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
+
+ Permission is hereby granted to use, copy, modify, and/or distribute
+ this program, provided that the Copyright, this License, and the
+ Availability of the original version is retained on all copies and made
+ accessible to the end-user of any code or package that includes COLAMD
+ or any modified version of COLAMD.
+
+ Availability:
+
+ The colamd/symamd library is available at
+
+ http://www.cise.ufl.edu/research/sparse/colamd/
+
+ This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.c
+ file. It requires the colamd.h file. It is required by the colamdmex.c
+ and symamdmex.c files, for the MATLAB interface to colamd and symamd.
+
+ See the ChangeLog file for changes since Version 1.0.
+
+ ==========================================================================
+ === Description of user-callable routines ================================
+ ==========================================================================
+
+
+ ----------------------------------------------------------------------------
+ colamd_recommended:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ int colamd_recommended (int nnz, int n_row, int n_col) ;
+
+ or as a C macro
+
+ #include "colamd.h"
+ Alen = COLAMD_RECOMMENDED (int nnz, int n_row, int n_col) ;
+
+ Purpose:
+
+ Returns recommended value of Alen for use by colamd. Returns -1
+ if any input argument is negative. The use of this routine
+ or macro is optional. Note that the macro uses its arguments
+ more than once, so be careful for side effects, if you pass
+ expressions as arguments to COLAMD_RECOMMENDED. Not needed for
+ symamd, which dynamically allocates its own memory.
+
+ Arguments (all input arguments):
+
+ int nnz ; Number of nonzeros in the matrix A. This must
+ be the same value as p [n_col] in the call to
+ colamd - otherwise you will get a wrong value
+ of the recommended memory to use.
+
+ int n_row ; Number of rows in the matrix A.
+
+ int n_col ; Number of columns in the matrix A.
+
+ ----------------------------------------------------------------------------
+ colamd_set_defaults:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ colamd_set_defaults (double knobs [COLAMD_KNOBS]) ;
+
+ Purpose:
+
+ Sets the default parameters. The use of this routine is optional.
+
+ Arguments:
+
+ double knobs [COLAMD_KNOBS] ; Output only.
+
+ Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
+ entries are removed prior to ordering. Columns with more than
+ (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
+ ordering, and placed last in the output column ordering.
+
+ Symamd: uses only knobs [COLAMD_DENSE_ROW], which is knobs [0].
+ Rows and columns with more than (knobs [COLAMD_DENSE_ROW] * n)
+ entries are removed prior to ordering, and placed last in the
+ output ordering.
+
+ COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+ respectively, in colamd.h. Default values of these two knobs
+ are both 0.5. Currently, only knobs [0] and knobs [1] are
+ used, but future versions may use more knobs. If so, they will
+ be properly set to their defaults by the future version of
+ colamd_set_defaults, so that the code that calls colamd will
+ not need to change, assuming that you either use
+ colamd_set_defaults, or pass a (double *) NULL pointer as the
+ knobs array to colamd or symamd.
+
+ ----------------------------------------------------------------------------
+ colamd:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ int colamd (int n_row, int n_col, int Alen, int *A, int *p,
+ double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS]) ;
+
+ Purpose:
+
+ Computes a column ordering (Q) of A such that P(AQ)=LU or
+ (AQ)'AQ=LL' have less fill-in and require fewer floating point
+ operations than factorizing the unpermuted matrix A or A'A,
+ respectively.
+
+ Returns:
+
+ TRUE (1) if successful, FALSE (0) otherwise.
+
+ Arguments:
+
+ int n_row ; Input argument.
+
+ Number of rows in the matrix A.
+ Restriction: n_row >= 0.
+ Colamd returns FALSE if n_row is negative.
+
+ int n_col ; Input argument.
+
+ Number of columns in the matrix A.
+ Restriction: n_col >= 0.
+ Colamd returns FALSE if n_col is negative.
+
+ int Alen ; Input argument.
+
+ Restriction (see note):
+ Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col
+ Colamd returns FALSE if these conditions are not met.
+
+ Note: this restriction makes a modest assumption regarding
+ the size of the two typedef'd structures in colamd.h.
+ We do, however, guarantee that
+
+ Alen >= colamd_recommended (nnz, n_row, n_col)
+
+ or equivalently as a C preprocessor macro:
+
+ Alen >= COLAMD_RECOMMENDED (nnz, n_row, n_col)
+
+ will be sufficient.
+
+ int A [Alen] ; Input argument, undefined on output.
+
+ A is an integer array of size Alen. Alen must be at least as
+ large as the bare minimum value given above, but this is very
+ low, and can result in excessive run time. For best
+ performance, we recommend that Alen be greater than or equal to
+ colamd_recommended (nnz, n_row, n_col), which adds
+ nnz/5 to the bare minimum value given above.
+
+ On input, the row indices of the entries in column c of the
+ matrix are held in A [(p [c]) ... (p [c+1]-1)]. The row indices
+ in a given column c need not be in ascending order, and
+ duplicate row indices may be present. However, colamd will
+ work a little faster if both of these conditions are met
+ (Colamd puts the matrix into this format, if it finds that
+ the conditions are not met).
+
+ The matrix is 0-based. That is, rows are in the range 0 to
+ n_row-1, and columns are in the range 0 to n_col-1. Colamd
+ returns FALSE if any row index is out of range.
+
+ The contents of A are modified during ordering, and are
+ undefined on output.
+
+ int p [n_col+1] ; Both input and output argument.
+
+ p is an integer array of size n_col+1. On input, it holds the
+ "pointers" for the column form of the matrix A. Column c of
+ the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first
+ entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+ for all c in the range 0 to n_col-1. The value p [n_col] is
+ thus the total number of entries in the pattern of the matrix A.
+ Colamd returns FALSE if these conditions are not met.
+
+ On output, if colamd returns TRUE, the array p holds the column
+ permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
+ the first column index in the new ordering, and p [n_col-1] is
+ the last. That is, p [k] = j means that column j of A is the
+ kth pivot column, in AQ, where k is in the range 0 to n_col-1
+ (p [0] = j means that column j of A is the first column in AQ).
+
+ If colamd returns FALSE, then no permutation is returned, and
+ p is undefined on output.
+
+ double knobs [COLAMD_KNOBS] ; Input argument.
+
+ See colamd_set_defaults for a description.
+
+ int stats [COLAMD_STATS] ; Output argument.
+
+ Statistics on the ordering, and error status.
+ See colamd.h for related definitions.
+ Colamd returns FALSE if stats is not present.
+
+ stats [0]: number of dense or empty rows ignored.
+
+ stats [1]: number of dense or empty columns ignored (and
+ ordered last in the output permutation p)
+ Note that a row can become "empty" if it
+ contains only "dense" and/or "empty" columns,
+ and similarly a column can become "empty" if it
+ only contains "dense" and/or "empty" rows.
+
+ stats [2]: number of garbage collections performed.
+ This can be excessively high if Alen is close
+ to the minimum required value.
+
+ stats [3]: status code. < 0 is an error code.
+ > 1 is a warning or notice.
+
+ 0 OK. Each column of the input matrix contained
+ row indices in increasing order, with no
+ duplicates.
+
+ 1 OK, but columns of input matrix were jumbled
+ (unsorted columns or duplicate entries). Colamd
+ had to do some extra work to sort the matrix
+ first and remove duplicate entries, but it
+ still was able to return a valid permutation
+ (return value of colamd was TRUE).
+
+ stats [4]: highest numbered column that
+ is unsorted or has duplicate
+ entries.
+ stats [5]: last seen duplicate or
+ unsorted row index.
+ stats [6]: number of duplicate or
+ unsorted row indices.
+
+ -1 A is a null pointer
+
+ -2 p is a null pointer
+
+ -3 n_row is negative
+
+ stats [4]: n_row
+
+ -4 n_col is negative
+
+ stats [4]: n_col
+
+ -5 number of nonzeros in matrix is negative
+
+ stats [4]: number of nonzeros, p [n_col]
+
+ -6 p [0] is nonzero
+
+ stats [4]: p [0]
+
+ -7 A is too small
+
+ stats [4]: required size
+ stats [5]: actual size (Alen)
+
+ -8 a column has a negative number of entries
+
+ stats [4]: column with < 0 entries
+ stats [5]: number of entries in col
+
+ -9 a row index is out of bounds
+
+ stats [4]: column with bad row index
+ stats [5]: bad row index
+ stats [6]: n_row, # of rows of matrix
+
+ -10 (unused; see symamd.c)
+
+ -999 (unused; see symamd.c)
+
+ Future versions may return more statistics in the stats array.
+
+ Example:
+
+ See http://www.cise.ufl.edu/research/sparse/colamd/example.c
+ for a complete example.
+
+ To order the columns of a 5-by-4 matrix with 11 nonzero entries in
+ the following nonzero pattern
+
+ x 0 x 0
+ x 0 x x
+ 0 x x 0
+ 0 0 x x
+ x x 0 0
+
+ with default knobs and no output statistics, do the following:
+
+ #include "colamd.h"
+ #define ALEN COLAMD_RECOMMENDED (11, 5, 4)
+ int A [ALEN] = {1, 2, 5, 3, 5, 1, 2, 3, 4, 2, 4} ;
+ int p [ ] = {0, 3, 5, 9, 11} ;
+ int stats [COLAMD_STATS] ;
+ colamd (5, 4, ALEN, A, p, (double *) NULL, stats) ;
+
+ The permutation is returned in the array p, and A is destroyed.
+
+ ----------------------------------------------------------------------------
+ symamd:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ int symamd (int n, int *A, int *p, int *perm,
+ double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS],
+ void * (*allocate) (size_t, size_t), void (*release) (void *)) ;
+
+ Purpose:
+
+ The symamd routine computes an ordering P of a symmetric sparse
+ matrix A such that the Cholesky factorization PAP' = LL' remains
+ sparse. It is based on a column ordering of a matrix M constructed
+ so that the nonzero pattern of M'M is the same as A. The matrix A
+ is assumed to be symmetric; only the strictly lower triangular part
+ is accessed. You must pass your selected memory allocator (usually
+ calloc/free or mxCalloc/mxFree) to symamd, for it to allocate
+ memory for the temporary matrix M.
+
+ Returns:
+
+ TRUE (1) if successful, FALSE (0) otherwise.
+
+ Arguments:
+
+ int n ; Input argument.
+
+ Number of rows and columns in the symmetric matrix A.
+ Restriction: n >= 0.
+ Symamd returns FALSE if n is negative.
+
+ int A [nnz] ; Input argument.
+
+ A is an integer array of size nnz, where nnz = p [n].
+
+ The row indices of the entries in column c of the matrix are
+ held in A [(p [c]) ... (p [c+1]-1)]. The row indices in a
+ given column c need not be in ascending order, and duplicate
+ row indices may be present. However, symamd will run faster
+ if the columns are in sorted order with no duplicate entries.
+
+ The matrix is 0-based. That is, rows are in the range 0 to
+ n-1, and columns are in the range 0 to n-1. Symamd
+ returns FALSE if any row index is out of range.
+
+ The contents of A are not modified.
+
+ int p [n+1] ; Input argument.
+
+ p is an integer array of size n+1. On input, it holds the
+ "pointers" for the column form of the matrix A. Column c of
+ the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first
+ entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+ for all c in the range 0 to n-1. The value p [n] is
+ thus the total number of entries in the pattern of the matrix A.
+ Symamd returns FALSE if these conditions are not met.
+
+ The contents of p are not modified.
+
+ int perm [n+1] ; Output argument.
+
+ On output, if symamd returns TRUE, the array perm holds the
+ permutation P, where perm [0] is the first index in the new
+ ordering, and perm [n-1] is the last. That is, perm [k] = j
+ means that row and column j of A is the kth column in PAP',
+ where k is in the range 0 to n-1 (perm [0] = j means
+ that row and column j of A are the first row and column in
+ PAP'). The array is used as a workspace during the ordering,
+ which is why it must be of length n+1, not just n.
+
+ double knobs [COLAMD_KNOBS] ; Input argument.
+
+ See colamd_set_defaults for a description.
+
+ int stats [COLAMD_STATS] ; Output argument.
+
+ Statistics on the ordering, and error status.
+ See colamd.h for related definitions.
+ Symamd returns FALSE if stats is not present.
+
+ stats [0]: number of dense or empty row and columns ignored
+ (and ordered last in the output permutation
+ perm). Note that a row/column can become
+ "empty" if it contains only "dense" and/or
+ "empty" columns/rows.
+
+ stats [1]: (same as stats [0])
+
+ stats [2]: number of garbage collections performed.
+
+ stats [3]: status code. < 0 is an error code.
+ > 1 is a warning or notice.
+
+ 0 OK. Each column of the input matrix contained
+ row indices in increasing order, with no
+ duplicates.
+
+ 1 OK, but columns of input matrix were jumbled
+ (unsorted columns or duplicate entries). Symamd
+ had to do some extra work to sort the matrix
+ first and remove duplicate entries, but it
+ still was able to return a valid permutation
+ (return value of symamd was TRUE).
+
+ stats [4]: highest numbered column that
+ is unsorted or has duplicate
+ entries.
+ stats [5]: last seen duplicate or
+ unsorted row index.
+ stats [6]: number of duplicate or
+ unsorted row indices.
+
+ -1 A is a null pointer
+
+ -2 p is a null pointer
+
+ -3 (unused, see colamd.c)
+
+ -4 n is negative
+
+ stats [4]: n
+
+ -5 number of nonzeros in matrix is negative
+
+ stats [4]: # of nonzeros (p [n]).
+
+ -6 p [0] is nonzero
+
+ stats [4]: p [0]
+
+ -7 (unused)
+
+ -8 a column has a negative number of entries
+
+ stats [4]: column with < 0 entries
+ stats [5]: number of entries in col
+
+ -9 a row index is out of bounds
+
+ stats [4]: column with bad row index
+ stats [5]: bad row index
+ stats [6]: n_row, # of rows of matrix
+
+ -10 out of memory (unable to allocate temporary
+ workspace for M or count arrays using the
+ "allocate" routine passed into symamd).
+
+ -999 internal error. colamd failed to order the
+ matrix M, when it should have succeeded. This
+ indicates a bug. If this (and *only* this)
+ error code occurs, please contact the authors.
+ Don't contact the authors if you get any other
+ error code.
+
+ Future versions may return more statistics in the stats array.
+
+ void * (*allocate) (size_t, size_t)
+
+ A pointer to a function providing memory allocation. The
+ allocated memory must be returned initialized to zero. For a
+ C application, this argument should normally be a pointer to
+ calloc. For a MATLAB mexFunction, the routine mxCalloc is
+ passed instead.
+
+ void (*release) (void *)
+
+ A pointer to a function that frees memory allocated by the
+ memory allocation routine above. For a C application, this
+ argument should normally be a pointer to free. For a MATLAB
+ mexFunction, the routine mxFree is passed instead.
+
+
+ ----------------------------------------------------------------------------
+ colamd_report:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ colamd_report (int stats [COLAMD_STATS]) ;
+
+ Purpose:
+
+ Prints the error status and statistics recorded in the stats
+ array on the standard error output (for a standard C routine)
+ or on the MATLAB output (for a mexFunction).
+
+ Arguments:
+
+ int stats [COLAMD_STATS] ; Input only. Statistics from colamd.
+
+
+ ----------------------------------------------------------------------------
+ symamd_report:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ symamd_report (int stats [COLAMD_STATS]) ;
+
+ Purpose:
+
+ Prints the error status and statistics recorded in the stats
+ array on the standard error output (for a standard C routine)
+ or on the MATLAB output (for a mexFunction).
+
+ Arguments:
+
+ int stats [COLAMD_STATS] ; Input only. Statistics from symamd.
+
+ </pre>
+*/
+
+/* ========================================================================== */
+/* === Scaffolding code definitions ======================================== */
+/* ========================================================================== */
+
+/* Ensure that debugging is turned off: */
+#ifndef NDEBUG
+#define NDEBUG
+#endif /* NDEBUG */
+
+/*
+ Our "scaffolding code" philosophy: In our opinion, well-written library
+ code should keep its "debugging" code, and just normally have it turned off
+ by the compiler so as not to interfere with performance. This serves
+ several purposes:
+
+ (1) assertions act as comments to the reader, telling you what the code
+ expects at that point. All assertions will always be true (unless
+ there really is a bug, of course).
+
+ (2) leaving in the scaffolding code assists anyone who would like to modify
+ the code, or understand the algorithm (by reading the debugging output,
+ one can get a glimpse into what the code is doing).
+
+ (3) (gasp!) for actually finding bugs. This code has been heavily tested
+ and "should" be fully functional and bug-free ... but you never know...
+
+ To enable debugging, comment out the "#define NDEBUG" above. For a MATLAB
+ mexFunction, you will also need to modify mexopts.sh to remove the -DNDEBUG
+ definition. The code will become outrageously slow when debugging is
+ enabled. To control the level of debugging output, set an environment
+ variable D to 0 (little), 1 (some), 2, 3, or 4 (lots). When debugging,
+ you should see the following message on the standard output:
+
+ colamd: debug version, D = 1 (THIS WILL BE SLOW!)
+
+ or a similar message for symamd. If you don't, then debugging has not
+ been enabled.
+
+*/
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+#include "colamd.h"
+#include <limits.h>
+
+#ifdef MATLAB_MEX_FILE
+#include "mex.h"
+#include "matrix.h"
+#else
+#include <stdio.h>
+#include <assert.h>
+#endif /* MATLAB_MEX_FILE */
+
+/* ========================================================================== */
+/* === Definitions ========================================================== */
+/* ========================================================================== */
+
+/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */
+#define PUBLIC
+#define PRIVATE static
+
+#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+#define ONES_COMPLEMENT(r) (-(r)-1)
+
+/* -------------------------------------------------------------------------- */
+/* Change for version 2.1: define TRUE and FALSE only if not yet defined */
+/* -------------------------------------------------------------------------- */
+
+#ifndef TRUE
+#define TRUE (1)
+#endif
+
+#ifndef FALSE
+#define FALSE (0)
+#endif
+
+/* -------------------------------------------------------------------------- */
+
+#define EMPTY (-1)
+
+/* Row and column status */
+#define ALIVE (0)
+#define DEAD (-1)
+
+/* Column status */
+#define DEAD_PRINCIPAL (-1)
+#define DEAD_NON_PRINCIPAL (-2)
+
+/* Macros for row and column status update and checking. */
+#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
+#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE)
+#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE)
+#define COL_IS_DEAD(c) (Col [c].start < ALIVE)
+#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE)
+#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL)
+#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; }
+#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; }
+#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; }
+
+/* ========================================================================== */
+/* === Colamd reporting mechanism =========================================== */
+/* ========================================================================== */
+
+#ifdef MATLAB_MEX_FILE
+
+/* use mexPrintf in a MATLAB mexFunction, for debugging and statistics output */
+#define PRINTF mexPrintf
+
+/* In MATLAB, matrices are 1-based to the user, but 0-based internally */
+#define INDEX(i) ((i)+1)
+
+#else
+
+/* Use printf in standard C environment, for debugging and statistics output. */
+/* Output is generated only if debugging is enabled at compile time, or if */
+/* the caller explicitly calls colamd_report or symamd_report. */
+#define PRINTF printf
+
+/* In C, matrices are 0-based and indices are reported as such in *_report */
+#define INDEX(i) (i)
+
+#endif /* MATLAB_MEX_FILE */
+
+/* ========================================================================== */
+/* === Prototypes of PRIVATE routines ======================================= */
+/* ========================================================================== */
+
+PRIVATE int init_rows_cols
+(
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int A [],
+ int p [],
+ int stats [COLAMD_STATS]
+) ;
+
+PRIVATE void init_scoring
+(
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int A [],
+ int head [],
+ double knobs [COLAMD_KNOBS],
+ int *p_n_row2,
+ int *p_n_col2,
+ int *p_max_deg
+) ;
+
+PRIVATE int find_ordering
+(
+ int n_row,
+ int n_col,
+ int Alen,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int A [],
+ int head [],
+ int n_col2,
+ int max_deg,
+ int pfree
+) ;
+
+PRIVATE void order_children
+(
+ int n_col,
+ Colamd_Col Col [],
+ int p []
+) ;
+
+PRIVATE void detect_super_cols
+(
+
+#ifndef NDEBUG
+ int n_col,
+ Colamd_Row Row [],
+#endif /* NDEBUG */
+
+ Colamd_Col Col [],
+ int A [],
+ int head [],
+ int row_start,
+ int row_length
+) ;
+
+PRIVATE int garbage_collection
+(
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int A [],
+ int *pfree
+) ;
+
+PRIVATE int clear_mark
+(
+ int n_row,
+ Colamd_Row Row []
+) ;
+
+PRIVATE void print_report
+(
+ char *method,
+ int stats [COLAMD_STATS]
+) ;
+
+/* ========================================================================== */
+/* === Debugging prototypes and definitions ================================= */
+/* ========================================================================== */
+
+#ifndef NDEBUG
+
+/* colamd_debug is the *ONLY* global variable, and is only */
+/* present when debugging */
+
+PRIVATE int colamd_debug ; /* debug print level */
+
+#define DEBUG0(params) { (void) PRINTF params ; }
+#define DEBUG1(params) { if (colamd_debug >= 1) (void) PRINTF params ; }
+#define DEBUG2(params) { if (colamd_debug >= 2) (void) PRINTF params ; }
+#define DEBUG3(params) { if (colamd_debug >= 3) (void) PRINTF params ; }
+#define DEBUG4(params) { if (colamd_debug >= 4) (void) PRINTF params ; }
+
+#ifdef MATLAB_MEX_FILE
+#define ASSERT(expression) (mxAssert ((expression), ""))
+#else
+#define ASSERT(expression) (assert (expression))
+#endif /* MATLAB_MEX_FILE */
+
+PRIVATE void colamd_get_debug /* gets the debug print level from getenv */
+(
+ char *method
+) ;
+
+PRIVATE void debug_deg_lists
+(
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int head [],
+ int min_score,
+ int should,
+ int max_deg
+) ;
+
+PRIVATE void debug_mark
+(
+ int n_row,
+ Colamd_Row Row [],
+ int tag_mark,
+ int max_mark
+) ;
+
+PRIVATE void debug_matrix
+(
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int A []
+) ;
+
+PRIVATE void debug_structures
+(
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int A [],
+ int n_col2
+) ;
+
+#else /* NDEBUG */
+
+/* === No debugging ========================================================= */
+
+#define DEBUG0(params) ;
+#define DEBUG1(params) ;
+#define DEBUG2(params) ;
+#define DEBUG3(params) ;
+#define DEBUG4(params) ;
+
+#define ASSERT(expression) ((void) 0)
+
+#endif /* NDEBUG */
+
+/* ========================================================================== */
+
+
+
+/* ========================================================================== */
+/* === USER-CALLABLE ROUTINES: ============================================== */
+/* ========================================================================== */
+
+
+/* ========================================================================== */
+/* === colamd_recommended =================================================== */
+/* ========================================================================== */
+
+/*
+ The colamd_recommended routine returns the suggested size for Alen. This
+ value has been determined to provide good balance between the number of
+ garbage collections and the memory requirements for colamd. If any
+ argument is negative, a -1 is returned as an error condition. This
+ function is also available as a macro defined in colamd.h, so that you
+ can use it for a statically-allocated array size.
+*/
+
+PUBLIC int colamd_recommended /* returns recommended value of Alen. */
+(
+ /* === Parameters ======================================================= */
+
+ int nnz, /* number of nonzeros in A */
+ int n_row, /* number of rows in A */
+ int n_col /* number of columns in A */
+)
+{
+ return (COLAMD_RECOMMENDED (nnz, n_row, n_col)) ;
+}
+
+
+/* ========================================================================== */
+/* === colamd_set_defaults ================================================== */
+/* ========================================================================== */
+
+/*
+ The colamd_set_defaults routine sets the default values of the user-
+ controllable parameters for colamd:
+
+ knobs [0] rows with knobs[0]*n_col entries or more are removed
+ prior to ordering in colamd. Rows and columns with
+ knobs[0]*n_col entries or more are removed prior to
+ ordering in symamd and placed last in the output
+ ordering.
+
+ knobs [1] columns with knobs[1]*n_row entries or more are removed
+ prior to ordering in colamd, and placed last in the
+ column permutation. Symamd ignores this knob.
+
+ knobs [2..19] unused, but future versions might use this
+*/
+
+PUBLIC void colamd_set_defaults
+(
+ /* === Parameters ======================================================= */
+
+ double knobs [COLAMD_KNOBS] /* knob array */
+)
+{
+ /* === Local variables ================================================== */
+
+ int i ;
+
+ if (!knobs)
+ {
+ return ; /* no knobs to initialize */
+ }
+ for (i = 0 ; i < COLAMD_KNOBS ; i++)
+ {
+ knobs [i] = 0 ;
+ }
+ knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */
+ knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */
+}
+
+
+/* ========================================================================== */
+/* === symamd =============================================================== */
+/* ========================================================================== */
+
+PUBLIC int symamd /* return TRUE if OK, FALSE otherwise */
+(
+ /* === Parameters ======================================================= */
+
+ int n, /* number of rows and columns of A */
+ int A [], /* row indices of A */
+ int p [], /* column pointers of A */
+ int perm [], /* output permutation, size n+1 */
+ double knobs [COLAMD_KNOBS], /* parameters (uses defaults if NULL) */
+ int stats [COLAMD_STATS], /* output statistics and error codes */
+ void * (*allocate) (size_t, size_t),
+ /* pointer to calloc (ANSI C) or */
+ /* mxCalloc (for MATLAB mexFunction) */
+ void (*release) (void *)
+ /* pointer to free (ANSI C) or */
+ /* mxFree (for MATLAB mexFunction) */
+)
+{
+ /* === Local variables ================================================== */
+
+ int *count ; /* length of each column of M, and col pointer*/
+ int *mark ; /* mark array for finding duplicate entries */
+ int *M ; /* row indices of matrix M */
+ int Mlen ; /* length of M */
+ int n_row ; /* number of rows in M */
+ int nnz ; /* number of entries in A */
+ int i ; /* row index of A */
+ int j ; /* column index of A */
+ int k ; /* row index of M */
+ int mnz ; /* number of nonzeros in M */
+ int pp ; /* index into a column of A */
+ int last_row ; /* last row seen in the current column */
+ int length ; /* number of nonzeros in a column */
+
+ double cknobs [COLAMD_KNOBS] ; /* knobs for colamd */
+ double default_knobs [COLAMD_KNOBS] ; /* default knobs for colamd */
+ int cstats [COLAMD_STATS] ; /* colamd stats */
+
+#ifndef NDEBUG
+ colamd_get_debug ("symamd") ;
+#endif /* NDEBUG */
+
+ /* === Check the input arguments ======================================== */
+
+ if (!stats)
+ {
+ DEBUG0 (("symamd: stats not present\n")) ;
+ return (FALSE) ;
+ }
+ for (i = 0 ; i < COLAMD_STATS ; i++)
+ {
+ stats [i] = 0 ;
+ }
+ stats [COLAMD_STATUS] = COLAMD_OK ;
+ stats [COLAMD_INFO1] = -1 ;
+ stats [COLAMD_INFO2] = -1 ;
+
+ if (!A)
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+ DEBUG0 (("symamd: A not present\n")) ;
+ return (FALSE) ;
+ }
+
+ if (!p) /* p is not present */
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+ DEBUG0 (("symamd: p not present\n")) ;
+ return (FALSE) ;
+ }
+
+ if (n < 0) /* n must be >= 0 */
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
+ stats [COLAMD_INFO1] = n ;
+ DEBUG0 (("symamd: n negative %d\n", n)) ;
+ return (FALSE) ;
+ }
+
+ nnz = p [n] ;
+ if (nnz < 0) /* nnz must be >= 0 */
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
+ stats [COLAMD_INFO1] = nnz ;
+ DEBUG0 (("symamd: number of entries negative %d\n", nnz)) ;
+ return (FALSE) ;
+ }
+
+ if (p [0] != 0)
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
+ stats [COLAMD_INFO1] = p [0] ;
+ DEBUG0 (("symamd: p[0] not zero %d\n", p [0])) ;
+ return (FALSE) ;
+ }
+
+ /* === If no knobs, set default knobs =================================== */
+
+ if (!knobs)
+ {
+ colamd_set_defaults (default_knobs) ;
+ knobs = default_knobs ;
+ }
+
+ /* === Allocate count and mark ========================================== */
+
+ count = (int *) ((*allocate) (n+1, sizeof (int))) ;
+ if (!count)
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+ DEBUG0 (("symamd: allocate count (size %d) failed\n", n+1)) ;
+ return (FALSE) ;
+ }
+
+ mark = (int *) ((*allocate) (n+1, sizeof (int))) ;
+ if (!mark)
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+ (*release) ((void *) count) ;
+ DEBUG0 (("symamd: allocate mark (size %d) failed\n", n+1)) ;
+ return (FALSE) ;
+ }
+
+ /* === Compute column counts of M, check if A is valid ================== */
+
+ stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/
+
+ for (i = 0 ; i < n ; i++)
+ {
+ mark [i] = -1 ;
+ }
+
+ for (j = 0 ; j < n ; j++)
+ {
+ last_row = -1 ;
+
+ length = p [j+1] - p [j] ;
+ if (length < 0)
+ {
+ /* column pointers must be non-decreasing */
+ stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
+ stats [COLAMD_INFO1] = j ;
+ stats [COLAMD_INFO2] = length ;
+ (*release) ((void *) count) ;
+ (*release) ((void *) mark) ;
+ DEBUG0 (("symamd: col %d negative length %d\n", j, length)) ;
+ return (FALSE) ;
+ }
+
+ for (pp = p [j] ; pp < p [j+1] ; pp++)
+ {
+ i = A [pp] ;
+ if (i < 0 || i >= n)
+ {
+ /* row index i, in column j, is out of bounds */
+ stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
+ stats [COLAMD_INFO1] = j ;
+ stats [COLAMD_INFO2] = i ;
+ stats [COLAMD_INFO3] = n ;
+ (*release) ((void *) count) ;
+ (*release) ((void *) mark) ;
+ DEBUG0 (("symamd: row %d col %d out of bounds\n", i, j)) ;
+ return (FALSE) ;
+ }
+
+ if (i <= last_row || mark [i] == j)
+ {
+ /* row index is unsorted or repeated (or both), thus col */
+ /* is jumbled. This is a notice, not an error condition. */
+ stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
+ stats [COLAMD_INFO1] = j ;
+ stats [COLAMD_INFO2] = i ;
+ (stats [COLAMD_INFO3]) ++ ;
+ DEBUG1 (("symamd: row %d col %d unsorted/duplicate\n", i, j)) ;
+ }
+
+ if (i > j && mark [i] != j)
+ {
+ /* row k of M will contain column indices i and j */
+ count [i]++ ;
+ count [j]++ ;
+ }
+
+ /* mark the row as having been seen in this column */
+ mark [i] = j ;
+
+ last_row = i ;
+ }
+ }
+
+ if (stats [COLAMD_STATUS] == COLAMD_OK)
+ {
+ /* if there are no duplicate entries, then mark is no longer needed */
+ (*release) ((void *) mark) ;
+ }
+
+ /* === Compute column pointers of M ===================================== */
+
+ /* use output permutation, perm, for column pointers of M */
+ perm [0] = 0 ;
+ for (j = 1 ; j <= n ; j++)
+ {
+ perm [j] = perm [j-1] + count [j-1] ;
+ }
+ for (j = 0 ; j < n ; j++)
+ {
+ count [j] = perm [j] ;
+ }
+
+ /* === Construct M ====================================================== */
+
+ mnz = perm [n] ;
+ n_row = mnz / 2 ;
+ Mlen = colamd_recommended (mnz, n_row, n) ;
+ M = (int *) ((*allocate) (Mlen, sizeof (int))) ;
+ DEBUG0 (("symamd: M is %d-by-%d with %d entries, Mlen = %d\n",
+ n_row, n, mnz, Mlen)) ;
+
+ if (!M)
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+ (*release) ((void *) count) ;
+ (*release) ((void *) mark) ;
+ DEBUG0 (("symamd: allocate M (size %d) failed\n", Mlen)) ;
+ return (FALSE) ;
+ }
+
+ k = 0 ;
+
+ if (stats [COLAMD_STATUS] == COLAMD_OK)
+ {
+ /* Matrix is OK */
+ for (j = 0 ; j < n ; j++)
+ {
+ ASSERT (p [j+1] - p [j] >= 0) ;
+ for (pp = p [j] ; pp < p [j+1] ; pp++)
+ {
+ i = A [pp] ;
+ ASSERT (i >= 0 && i < n) ;
+ if (i > j)
+ {
+ /* row k of M contains column indices i and j */
+ M [count [i]++] = k ;
+ M [count [j]++] = k ;
+ k++ ;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* Matrix is jumbled. Do not add duplicates to M. Unsorted cols OK. */
+ DEBUG0 (("symamd: Duplicates in A.\n")) ;
+ for (i = 0 ; i < n ; i++)
+ {
+ mark [i] = -1 ;
+ }
+ for (j = 0 ; j < n ; j++)
+ {
+ ASSERT (p [j+1] - p [j] >= 0) ;
+ for (pp = p [j] ; pp < p [j+1] ; pp++)
+ {
+ i = A [pp] ;
+ ASSERT (i >= 0 && i < n) ;
+ if (i > j && mark [i] != j)
+ {
+ /* row k of M contains column indices i and j */
+ M [count [i]++] = k ;
+ M [count [j]++] = k ;
+ k++ ;
+ mark [i] = j ;
+ }
+ }
+ }
+ (*release) ((void *) mark) ;
+ }
+
+ /* count and mark no longer needed */
+ (*release) ((void *) count) ;
+ ASSERT (k == n_row) ;
+
+ /* === Adjust the knobs for M =========================================== */
+
+ for (i = 0 ; i < COLAMD_KNOBS ; i++)
+ {
+ cknobs [i] = knobs [i] ;
+ }
+
+ /* there are no dense rows in M */
+ cknobs [COLAMD_DENSE_ROW] = 1.0 ;
+
+ if (n_row != 0 && n < n_row)
+ {
+ /* On input, the knob is a fraction of 1..n, the number of rows of A. */
+ /* Convert it to a fraction of 1..n_row, of the number of rows of M. */
+ cknobs [COLAMD_DENSE_COL] = (knobs [COLAMD_DENSE_ROW] * n) / n_row ;
+ }
+ else
+ {
+ /* no dense columns in M */
+ cknobs [COLAMD_DENSE_COL] = 1.0 ;
+ }
+
+ DEBUG0 (("symamd: dense col knob for M: %g\n", cknobs [COLAMD_DENSE_COL])) ;
+
+ /* === Order the columns of M =========================================== */
+
+ if (!colamd (n_row, n, Mlen, M, perm, cknobs, cstats))
+ {
+ /* This "cannot" happen, unless there is a bug in the code. */
+ stats [COLAMD_STATUS] = COLAMD_ERROR_internal_error ;
+ (*release) ((void *) M) ;
+ DEBUG0 (("symamd: internal error!\n")) ;
+ return (FALSE) ;
+ }
+
+ /* Note that the output permutation is now in perm */
+
+ /* === get the statistics for symamd from colamd ======================== */
+
+ /* note that a dense column in colamd means a dense row and col in symamd */
+ stats [COLAMD_DENSE_ROW] = cstats [COLAMD_DENSE_COL] ;
+ stats [COLAMD_DENSE_COL] = cstats [COLAMD_DENSE_COL] ;
+ stats [COLAMD_DEFRAG_COUNT] = cstats [COLAMD_DEFRAG_COUNT] ;
+
+ /* === Free M =========================================================== */
+
+ (*release) ((void *) M) ;
+ DEBUG0 (("symamd: done.\n")) ;
+ return (TRUE) ;
+
+}
+
+/* ========================================================================== */
+/* === colamd =============================================================== */
+/* ========================================================================== */
+
+/*
+ The colamd routine computes a column ordering Q of a sparse matrix
+ A such that the LU factorization P(AQ) = LU remains sparse, where P is
+ selected via partial pivoting. The routine can also be viewed as
+ providing a permutation Q such that the Cholesky factorization
+ (AQ)'(AQ) = LL' remains sparse.
+*/
+
+PUBLIC int colamd /* returns TRUE if successful, FALSE otherwise*/
+(
+ /* === Parameters ======================================================= */
+
+ int n_row, /* number of rows in A */
+ int n_col, /* number of columns in A */
+ int Alen, /* length of A */
+ int A [], /* row indices of A */
+ int p [], /* pointers to columns in A */
+ double knobs [COLAMD_KNOBS],/* parameters (uses defaults if NULL) */
+ int stats [COLAMD_STATS] /* output statistics and error codes */
+)
+{
+ /* === Local variables ================================================== */
+
+ int i ; /* loop index */
+ int nnz ; /* nonzeros in A */
+ int Row_size ; /* size of Row [], in integers */
+ int Col_size ; /* size of Col [], in integers */
+ int need ; /* minimum required length of A */
+ Colamd_Row *Row ; /* pointer into A of Row [0..n_row] array */
+ Colamd_Col *Col ; /* pointer into A of Col [0..n_col] array */
+ int n_col2 ; /* number of non-dense, non-empty columns */
+ int n_row2 ; /* number of non-dense, non-empty rows */
+ int ngarbage ; /* number of garbage collections performed */
+ int max_deg ; /* maximum row degree */
+ double default_knobs [COLAMD_KNOBS] ; /* default knobs array */
+
+#ifndef NDEBUG
+ colamd_get_debug ("colamd") ;
+#endif /* NDEBUG */
+
+ /* === Check the input arguments ======================================== */
+
+ if (!stats)
+ {
+ DEBUG0 (("colamd: stats not present\n")) ;
+ return (FALSE) ;
+ }
+ for (i = 0 ; i < COLAMD_STATS ; i++)
+ {
+ stats [i] = 0 ;
+ }
+ stats [COLAMD_STATUS] = COLAMD_OK ;
+ stats [COLAMD_INFO1] = -1 ;
+ stats [COLAMD_INFO2] = -1 ;
+
+ if (!A) /* A is not present */
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+ DEBUG0 (("colamd: A not present\n")) ;
+ return (FALSE) ;
+ }
+
+ if (!p) /* p is not present */
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+ DEBUG0 (("colamd: p not present\n")) ;
+ return (FALSE) ;
+ }
+
+ if (n_row < 0) /* n_row must be >= 0 */
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
+ stats [COLAMD_INFO1] = n_row ;
+ DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
+ return (FALSE) ;
+ }
+
+ if (n_col < 0) /* n_col must be >= 0 */
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
+ stats [COLAMD_INFO1] = n_col ;
+ DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
+ return (FALSE) ;
+ }
+
+ nnz = p [n_col] ;
+ if (nnz < 0) /* nnz must be >= 0 */
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
+ stats [COLAMD_INFO1] = nnz ;
+ DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
+ return (FALSE) ;
+ }
+
+ if (p [0] != 0)
+ {
+ stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
+ stats [COLAMD_INFO1] = p [0] ;
+ DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
+ return (FALSE) ;
+ }
+
+ /* === If no knobs, set default knobs =================================== */
+
+ if (!knobs)
+ {
+ colamd_set_defaults (default_knobs) ;
+ knobs = default_knobs ;
+ }
+
+ /* === Allocate the Row and Col arrays from array A ===================== */
+
+ Col_size = COLAMD_C (n_col) ;
+ Row_size = COLAMD_R (n_row) ;
+ need = 2*nnz + n_col + Col_size + Row_size ;
+
+ if (need > Alen)
+ {
+ /* not enough space in array A to perform the ordering */
+ stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
+ stats [COLAMD_INFO1] = need ;
+ stats [COLAMD_INFO2] = Alen ;
+ DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
+ return (FALSE) ;
+ }
+
+ Alen -= Col_size + Row_size ;
+ Col = (Colamd_Col *) &A [Alen] ;
+ Row = (Colamd_Row *) &A [Alen + Col_size] ;
+
+ /* === Construct the row and column data structures ===================== */
+
+ if (!init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
+ {
+ /* input matrix is invalid */
+ DEBUG0 (("colamd: Matrix invalid\n")) ;
+ return (FALSE) ;
+ }
+
+ /* === Initialize scores, kill dense rows/columns ======================= */
+
+ init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+ &n_row2, &n_col2, &max_deg) ;
+
+ /* === Order the supercolumns =========================================== */
+
+ ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+ n_col2, max_deg, 2*nnz) ;
+
+ /* === Order the non-principal columns ================================== */
+
+ order_children (n_col, Col, p) ;
+
+ /* === Return statistics in stats ======================================= */
+
+ stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
+ stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
+ stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
+ DEBUG0 (("colamd: done.\n")) ;
+ return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === colamd_report ======================================================== */
+/* ========================================================================== */
+
+PUBLIC void colamd_report
+(
+ int stats [COLAMD_STATS]
+)
+{
+ print_report ("colamd", stats) ;
+}
+
+
+/* ========================================================================== */
+/* === symamd_report ======================================================== */
+/* ========================================================================== */
+
+PUBLIC void symamd_report
+(
+ int stats [COLAMD_STATS]
+)
+{
+ print_report ("symamd", stats) ;
+}
+
+
+
+/* ========================================================================== */
+/* === NON-USER-CALLABLE ROUTINES: ========================================== */
+/* ========================================================================== */
+
+/* There are no user-callable routines beyond this point in the file */
+
+
/* ========================================================================== */
/* === init_rows_cols ======================================================= */
/* ========================================================================== */

/*
    Takes the column form of the matrix in A and creates the row form of the
    matrix.  Also, row and column attributes are stored in the Col and Row
    structs.  If the columns are un-sorted or contain duplicate row indices,
    this routine will also sort and remove duplicate row indices from the
    column form of the matrix.  Returns FALSE if the matrix is invalid,
    TRUE otherwise.  Not user-callable.

    The row form is written into A directly after the column form, so A
    must have room for at least 2*nnz entries.  On a jumbled input the
    column form is rebuilt in place with duplicates pruned, and
    stats [COLAMD_STATUS] is left as COLAMD_OK_BUT_JUMBLED.
*/

PRIVATE int init_rows_cols	/* returns TRUE if OK, or FALSE otherwise */
(
    /* === Parameters ======================================================= */

    int n_row,			/* number of rows of A */
    int n_col,			/* number of columns of A */
    Colamd_Row Row [],		/* of size n_row+1 */
    Colamd_Col Col [],		/* of size n_col+1 */
    int A [],			/* row indices of A, of size Alen */
    int p [],			/* pointers to columns in A, of size n_col+1 */
    int stats [COLAMD_STATS]	/* colamd statistics */
)
{
    /* === Local variables ================================================== */

    int col ;			/* a column index */
    int row ;			/* a row index */
    int *cp ;			/* a column pointer */
    int *cp_end ;		/* a pointer to the end of a column */
    int *rp ;			/* a row pointer */
    int *rp_end ;		/* a pointer to the end of a row */
    int last_row ;		/* previous row */

    /* === Initialize columns, and check column pointers ==================== */

    for (col = 0 ; col < n_col ; col++)
    {
	Col [col].start = p [col] ;
	Col [col].length = p [col+1] - p [col] ;

	if (Col [col].length < 0)
	{
	    /* column pointers must be non-decreasing */
	    stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
	    stats [COLAMD_INFO1] = col ;
	    stats [COLAMD_INFO2] = Col [col].length ;
	    DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
	    return (FALSE) ;
	}

	/* each column starts as its own supercolumn, unscored, unlisted */
	Col [col].shared1.thickness = 1 ;
	Col [col].shared2.score = 0 ;
	Col [col].shared3.prev = EMPTY ;
	Col [col].shared4.degree_next = EMPTY ;
    }

    /* p [0..n_col] no longer needed, used as "head" in subsequent routines */

    /* === Scan columns, compute row degrees, and check row indices ========= */

    stats [COLAMD_INFO3] = 0 ;	/* number of duplicate or unsorted row indices*/

    for (row = 0 ; row < n_row ; row++)
    {
	Row [row].length = 0 ;
	/* shared2.mark records the last column in which this row was seen, */
	/* to detect duplicate entries within a column */
	Row [row].shared2.mark = -1 ;
    }

    for (col = 0 ; col < n_col ; col++)
    {
	last_row = -1 ;

	cp = &A [p [col]] ;
	cp_end = &A [p [col+1]] ;

	while (cp < cp_end)
	{
	    row = *cp++ ;

	    /* make sure row indices within range */
	    if (row < 0 || row >= n_row)
	    {
		stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
		stats [COLAMD_INFO1] = col ;
		stats [COLAMD_INFO2] = row ;
		stats [COLAMD_INFO3] = n_row ;
		DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
		return (FALSE) ;
	    }

	    if (row <= last_row || Row [row].shared2.mark == col)
	    {
		/* row indices are unsorted or repeated (or both), thus col */
		/* is jumbled.  This is a notice, not an error condition. */
		stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
		stats [COLAMD_INFO1] = col ;
		stats [COLAMD_INFO2] = row ;
		(stats [COLAMD_INFO3]) ++ ;
		DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
	    }

	    if (Row [row].shared2.mark != col)
	    {
		Row [row].length++ ;
	    }
	    else
	    {
		/* this is a repeated entry in the column, */
		/* it will be removed */
		Col [col].length-- ;
	    }

	    /* mark the row as having been seen in this column */
	    Row [row].shared2.mark = col ;

	    last_row = row ;
	}
    }

    /* === Compute row pointers ============================================= */

    /* row form of the matrix starts directly after the column */
    /* form of matrix in A */
    Row [0].start = p [n_col] ;
    /* shared1.p is the insertion cursor for filling each row, below */
    Row [0].shared1.p = Row [0].start ;
    Row [0].shared2.mark = -1 ;
    for (row = 1 ; row < n_row ; row++)
    {
	Row [row].start = Row [row-1].start + Row [row-1].length ;
	Row [row].shared1.p = Row [row].start ;
	Row [row].shared2.mark = -1 ;
    }

    /* === Create row form ================================================== */

    if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
    {
	/* if cols jumbled, watch for repeated row indices */
	for (col = 0 ; col < n_col ; col++)
	{
	    cp = &A [p [col]] ;
	    cp_end = &A [p [col+1]] ;
	    while (cp < cp_end)
	    {
		row = *cp++ ;
		/* append col to this row once only, skipping duplicates */
		if (Row [row].shared2.mark != col)
		{
		    A [(Row [row].shared1.p)++] = col ;
		    Row [row].shared2.mark = col ;
		}
	    }
	}
    }
    else
    {
	/* if cols not jumbled, we don't need the mark (this is faster) */
	for (col = 0 ; col < n_col ; col++)
	{
	    cp = &A [p [col]] ;
	    cp_end = &A [p [col+1]] ;
	    while (cp < cp_end)
	    {
		A [(Row [*cp++].shared1.p)++] = col ;
	    }
	}
    }

    /* === Clear the row marks and set row degrees ========================== */

    for (row = 0 ; row < n_row ; row++)
    {
	Row [row].shared2.mark = 0 ;
	/* initial degree is the row length (duplicates already pruned) */
	Row [row].shared1.degree = Row [row].length ;
    }

    /* === See if we need to re-create columns ============================== */

    if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
    {
	DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;

#ifndef NDEBUG
	/* make sure column lengths are correct: decrement each column's */
	/* count once per entry in the row form; all must reach zero */
	for (col = 0 ; col < n_col ; col++)
	{
	    p [col] = Col [col].length ;
	}
	for (row = 0 ; row < n_row ; row++)
	{
	    rp = &A [Row [row].start] ;
	    rp_end = rp + Row [row].length ;
	    while (rp < rp_end)
	    {
		p [*rp++]-- ;
	    }
	}
	for (col = 0 ; col < n_col ; col++)
	{
	    ASSERT (p [col] == 0) ;
	}
	/* now p is all zero (different than when debugging is turned off) */
#endif /* NDEBUG */

	/* === Compute col pointers ========================================= */

	/* col form of the matrix starts at A [0]. */
	/* Note, we may have a gap between the col form and the row */
	/* form if there were duplicate entries, if so, it will be */
	/* removed upon the first garbage collection */
	Col [0].start = 0 ;
	p [0] = Col [0].start ;
	for (col = 1 ; col < n_col ; col++)
	{
	    /* note that the lengths here are for pruned columns, i.e. */
	    /* no duplicate row indices will exist for these columns */
	    Col [col].start = Col [col-1].start + Col [col-1].length ;
	    p [col] = Col [col].start ;
	}

	/* === Re-create col form =========================================== */

	/* scatter the (duplicate-free, sorted) row form back into columns */
	for (row = 0 ; row < n_row ; row++)
	{
	    rp = &A [Row [row].start] ;
	    rp_end = rp + Row [row].length ;
	    while (rp < rp_end)
	    {
		A [(p [*rp++])++] = row ;
	    }
	}
    }

    /* === Done.  Matrix is not (or no longer) jumbled ====================== */

    return (TRUE) ;
}
+
+
/* ========================================================================== */
/* === init_scoring ========================================================= */
/* ========================================================================== */

/*
    Kills dense or empty columns and rows, calculates an initial score for
    each column, and places all columns in the degree lists.  Not user-callable.

    Outputs (via pointer parameters): the number of remaining live rows
    and columns, and the maximum degree among live rows.  Killed columns
    are assigned their final position in the ordering (at the end, in
    natural order).
*/

PRIVATE void init_scoring
(
    /* === Parameters ======================================================= */

    int n_row,			/* number of rows of A */
    int n_col,			/* number of columns of A */
    Colamd_Row Row [],		/* of size n_row+1 */
    Colamd_Col Col [],		/* of size n_col+1 */
    int A [],			/* column form and row form of A */
    int head [],		/* of size n_col+1 */
    double knobs [COLAMD_KNOBS],/* parameters */
    int *p_n_row2,		/* number of non-dense, non-empty rows */
    int *p_n_col2,		/* number of non-dense, non-empty columns */
    int *p_max_deg		/* maximum row degree */
)
{
    /* === Local variables ================================================== */

    int c ;			/* a column index */
    int r, row ;		/* a row index */
    int *cp ;			/* a column pointer */
    int deg ;			/* degree of a row or column */
    int *cp_end ;		/* a pointer to the end of a column */
    int *new_cp ;		/* new column pointer */
    int col_length ;		/* length of pruned column */
    int score ;			/* current column score */
    int n_col2 ;		/* number of non-dense, non-empty columns */
    int n_row2 ;		/* number of non-dense, non-empty rows */
    int dense_row_count ;	/* remove rows with more entries than this */
    int dense_col_count ;	/* remove cols with more entries than this */
    int min_score ;		/* smallest column score */
    int max_deg ;		/* maximum row degree */
    int next_col ;		/* Used to add to degree list.*/

#ifndef NDEBUG
    int debug_count ;		/* debug only. */
#endif /* NDEBUG */

    /* === Extract knobs ==================================================== */

    /* knobs are fractions: a row is dense if it has more than */
    /* knobs [COLAMD_DENSE_ROW] * n_col entries, and similarly for columns */
    dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
    dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
    DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
    max_deg = 0 ;
    n_col2 = n_col ;
    n_row2 = n_row ;

    /* === Kill empty columns =============================================== */

    /* Put the empty columns at the end in their natural order, so that LU */
    /* factorization can proceed as far as possible. */
    for (c = n_col-1 ; c >= 0 ; c--)
    {
	deg = Col [c].length ;
	if (deg == 0)
	{
	    /* this is a empty column, kill and order it last */
	    Col [c].shared2.order = --n_col2 ;
	    KILL_PRINCIPAL_COL (c) ;
	}
    }
    DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;

    /* === Kill dense columns =============================================== */

    /* Put the dense columns at the end, in their natural order */
    for (c = n_col-1 ; c >= 0 ; c--)
    {
	/* skip any dead columns */
	if (COL_IS_DEAD (c))
	{
	    continue ;
	}
	deg = Col [c].length ;
	if (deg > dense_col_count)
	{
	    /* this is a dense column, kill and order it last */
	    Col [c].shared2.order = --n_col2 ;
	    /* decrement the row degrees, since this column is removed */
	    cp = &A [Col [c].start] ;
	    cp_end = cp + Col [c].length ;
	    while (cp < cp_end)
	    {
		Row [*cp++].shared1.degree-- ;
	    }
	    KILL_PRINCIPAL_COL (c) ;
	}
    }
    DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;

    /* === Kill dense and empty rows ======================================== */

    for (r = 0 ; r < n_row ; r++)
    {
	deg = Row [r].shared1.degree ;
	ASSERT (deg >= 0 && deg <= n_col) ;
	if (deg > dense_row_count || deg == 0)
	{
	    /* kill a dense or empty row */
	    KILL_ROW (r) ;
	    --n_row2 ;
	}
	else
	{
	    /* keep track of max degree of remaining rows */
	    max_deg = MAX (max_deg, deg) ;
	}
    }
    DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;

    /* === Compute initial column scores ==================================== */

    /* At this point the row degrees are accurate.  They reflect the number */
    /* of "live" (non-dense) columns in each row.  No empty rows exist. */
    /* Some "live" columns may contain only dead rows, however.  These are */
    /* pruned in the code below. */

    /* now find the initial matlab score for each column */
    for (c = n_col-1 ; c >= 0 ; c--)
    {
	/* skip dead column */
	if (COL_IS_DEAD (c))
	{
	    continue ;
	}
	score = 0 ;
	cp = &A [Col [c].start] ;
	new_cp = cp ;
	cp_end = cp + Col [c].length ;
	while (cp < cp_end)
	{
	    /* get a row */
	    row = *cp++ ;
	    /* skip if dead */
	    if (ROW_IS_DEAD (row))
	    {
		continue ;
	    }
	    /* compact the column */
	    *new_cp++ = row ;
	    /* add row's external degree */
	    score += Row [row].shared1.degree - 1 ;
	    /* guard against integer overflow */
	    score = MIN (score, n_col) ;
	}
	/* determine pruned column length */
	col_length = (int) (new_cp - &A [Col [c].start]) ;
	if (col_length == 0)
	{
	    /* a newly-made null column (all rows in this col are "dense" */
	    /* and have already been killed) */
	    DEBUG2 (("Newly null killed: %d\n", c)) ;
	    Col [c].shared2.order = --n_col2 ;
	    KILL_PRINCIPAL_COL (c) ;
	}
	else
	{
	    /* set column length and set score */
	    ASSERT (score >= 0) ;
	    ASSERT (score <= n_col) ;
	    Col [c].length = col_length ;
	    Col [c].shared2.score = score ;
	}
    }
    DEBUG1 (("colamd: Dense, null, and newly-null columns killed: %d\n",
    	n_col-n_col2)) ;

    /* At this point, all empty rows and columns are dead.  All live columns */
    /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
    /* yet).  Rows may contain dead columns, but all live rows contain at */
    /* least one live column. */

#ifndef NDEBUG
    debug_structures (n_row, n_col, Row, Col, A, n_col2) ;
#endif /* NDEBUG */

    /* === Initialize degree lists ========================================== */

#ifndef NDEBUG
    debug_count = 0 ;
#endif /* NDEBUG */

    /* clear the hash buckets */
    for (c = 0 ; c <= n_col ; c++)
    {
	head [c] = EMPTY ;
    }
    min_score = n_col ;
    /* place in reverse order, so low column indices are at the front */
    /* of the lists.  This is to encourage natural tie-breaking */
    for (c = n_col-1 ; c >= 0 ; c--)
    {
	/* only add principal columns to degree lists */
	if (COL_IS_ALIVE (c))
	{
	    DEBUG4 (("place %d score %d minscore %d ncol %d\n",
		c, Col [c].shared2.score, min_score, n_col)) ;

	    /* === Add columns score to DList =============================== */

	    score = Col [c].shared2.score ;

	    ASSERT (min_score >= 0) ;
	    ASSERT (min_score <= n_col) ;
	    ASSERT (score >= 0) ;
	    ASSERT (score <= n_col) ;
	    ASSERT (head [score] >= EMPTY) ;

	    /* now add this column to dList at proper score location */
	    /* (push at the head of the doubly-linked list head [score]) */
	    next_col = head [score] ;
	    Col [c].shared3.prev = EMPTY ;
	    Col [c].shared4.degree_next = next_col ;

	    /* if there already was a column with the same score, set its */
	    /* previous pointer to this new column */
	    if (next_col != EMPTY)
	    {
		Col [next_col].shared3.prev = c ;
	    }
	    head [score] = c ;

	    /* see if this score is less than current min */
	    min_score = MIN (min_score, score) ;

#ifndef NDEBUG
	    debug_count++ ;
#endif /* NDEBUG */

	}
    }

#ifndef NDEBUG
    DEBUG1 (("colamd: Live cols %d out of %d, non-princ: %d\n",
	debug_count, n_col, n_col-debug_count)) ;
    ASSERT (debug_count == n_col2) ;
    debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ;
#endif /* NDEBUG */

    /* === Return number of remaining columns, and max row degree =========== */

    *p_n_col2 = n_col2 ;
    *p_n_row2 = n_row2 ;
    *p_max_deg = max_deg ;
}
+
+
+/* ========================================================================== */
+/* === find_ordering ======================================================== */
+/* ========================================================================== */
+
+/*
+ Order the principal columns of the supercolumn form of the matrix
+ (no supercolumns on input). Uses a minimum approximate column minimum
+ degree ordering method. Not user-callable.
+*/
+
+PRIVATE int find_ordering /* return the number of garbage collections */
+(
+ /* === Parameters ======================================================= */
+
+ int n_row, /* number of rows of A */
+ int n_col, /* number of columns of A */
+ int Alen, /* size of A, 2*nnz + n_col or larger */
+ Colamd_Row Row [], /* of size n_row+1 */
+ Colamd_Col Col [], /* of size n_col+1 */
+ int A [], /* column form and row form of A */
+ int head [], /* of size n_col+1 */
+ int n_col2, /* Remaining columns to order */
+ int max_deg, /* Maximum row degree */
+ int pfree /* index of first free slot (2*nnz on entry) */
+)
+{
+ /* === Local variables ================================================== */
+
+ int k ; /* current pivot ordering step */
+ int pivot_col ; /* current pivot column */
+ int *cp ; /* a column pointer */
+ int *rp ; /* a row pointer */
+ int pivot_row ; /* current pivot row */
+ int *new_cp ; /* modified column pointer */
+ int *new_rp ; /* modified row pointer */
+ int pivot_row_start ; /* pointer to start of pivot row */
+ int pivot_row_degree ; /* number of columns in pivot row */
+ int pivot_row_length ; /* number of supercolumns in pivot row */
+ int pivot_col_score ; /* score of pivot column */
+ int needed_memory ; /* free space needed for pivot row */
+ int *cp_end ; /* pointer to the end of a column */
+ int *rp_end ; /* pointer to the end of a row */
+ int row ; /* a row index */
+ int col ; /* a column index */
+ int max_score ; /* maximum possible score */
+ int cur_score ; /* score of current column */
+ unsigned int hash ; /* hash value for supernode detection */
+ int head_column ; /* head of hash bucket */
+ int first_col ; /* first column in hash bucket */
+ int tag_mark ; /* marker value for mark array */
+ int row_mark ; /* Row [row].shared2.mark */
+ int set_difference ; /* set difference size of row with pivot row */
+ int min_score ; /* smallest column score */
+ int col_thickness ; /* "thickness" (no. of columns in a supercol) */
+ int max_mark ; /* maximum value of tag_mark */
+ int pivot_col_thickness ; /* number of columns represented by pivot col */
+ int prev_col ; /* Used by Dlist operations. */
+ int next_col ; /* Used by Dlist operations. */
+ int ngarbage ; /* number of garbage collections performed */
+
+#ifndef NDEBUG
+ int debug_d ; /* debug loop counter */
+ int debug_step = 0 ; /* debug loop counter */
+#endif /* NDEBUG */
+
+ /* === Initialization and clear mark ==================================== */
+
+ max_mark = INT_MAX - n_col ; /* INT_MAX defined in <limits.h> */
+ tag_mark = clear_mark (n_row, Row) ;
+ min_score = 0 ;
+ ngarbage = 0 ;
+ DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
+
+ /* === Order the columns ================================================ */
+
+ for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
+ {
+
+#ifndef NDEBUG
+ if (debug_step % 100 == 0)
+ {
+ DEBUG2 (("\n... Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+ }
+ else
+ {
+ DEBUG3 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+ }
+ debug_step++ ;
+ debug_deg_lists (n_row, n_col, Row, Col, head,
+ min_score, n_col2-k, max_deg) ;
+ debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif /* NDEBUG */
+
+ /* === Select pivot column, and order it ============================ */
+
+ /* make sure degree list isn't empty */
+ ASSERT (min_score >= 0) ;
+ ASSERT (min_score <= n_col) ;
+ ASSERT (head [min_score] >= EMPTY) ;
+
+#ifndef NDEBUG
+ for (debug_d = 0 ; debug_d < min_score ; debug_d++)
+ {
+ ASSERT (head [debug_d] == EMPTY) ;
+ }
+#endif /* NDEBUG */
+
+ /* get pivot column from head of minimum degree list */
+ while (head [min_score] == EMPTY && min_score < n_col)
+ {
+ min_score++ ;
+ }
+ pivot_col = head [min_score] ;
+ ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
+ next_col = Col [pivot_col].shared4.degree_next ;
+ head [min_score] = next_col ;
+ if (next_col != EMPTY)
+ {
+ Col [next_col].shared3.prev = EMPTY ;
+ }
+
+ ASSERT (COL_IS_ALIVE (pivot_col)) ;
+ DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
+
+ /* remember score for defrag check */
+ pivot_col_score = Col [pivot_col].shared2.score ;
+
+ /* the pivot column is the kth column in the pivot order */
+ Col [pivot_col].shared2.order = k ;
+
+ /* increment order count by column thickness */
+ pivot_col_thickness = Col [pivot_col].shared1.thickness ;
+ k += pivot_col_thickness ;
+ ASSERT (pivot_col_thickness > 0) ;
+
+ /* === Garbage_collection, if necessary ============================= */
+
+ needed_memory = MIN (pivot_col_score, n_col - k) ;
+ if (pfree + needed_memory >= Alen)
+ {
+ pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+ ngarbage++ ;
+ /* after garbage collection we will have enough */
+ ASSERT (pfree + needed_memory < Alen) ;
+ /* garbage collection has wiped out the Row[].shared2.mark array */
+ tag_mark = clear_mark (n_row, Row) ;
+
+#ifndef NDEBUG
+ debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif /* NDEBUG */
+ }
+
+ /* === Compute pivot row pattern ==================================== */
+
+ /* get starting location for this new merged row */
+ pivot_row_start = pfree ;
+
+ /* initialize new row counts to zero */
+ pivot_row_degree = 0 ;
+
+ /* tag pivot column as having been visited so it isn't included */
+ /* in merged pivot row */
+ Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
+
+ /* pivot row is the union of all rows in the pivot column pattern */
+ cp = &A [Col [pivot_col].start] ;
+ cp_end = cp + Col [pivot_col].length ;
+ while (cp < cp_end)
+ {
+ /* get a row */
+ row = *cp++ ;
+ DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+ /* skip if row is dead */
+ if (ROW_IS_DEAD (row))
+ {
+ continue ;
+ }
+ rp = &A [Row [row].start] ;
+ rp_end = rp + Row [row].length ;
+ while (rp < rp_end)
+ {
+ /* get a column */
+ col = *rp++ ;
+ /* add the column, if alive and untagged */
+ col_thickness = Col [col].shared1.thickness ;
+ if (col_thickness > 0 && COL_IS_ALIVE (col))
+ {
+ /* tag column in pivot row */
+ Col [col].shared1.thickness = -col_thickness ;
+ ASSERT (pfree < Alen) ;
+ /* place column in pivot row */
+ A [pfree++] = col ;
+ pivot_row_degree += col_thickness ;
+ }
+ }
+ }
+
+ /* clear tag on pivot column */
+ Col [pivot_col].shared1.thickness = pivot_col_thickness ;
+ max_deg = MAX (max_deg, pivot_row_degree) ;
+
+#ifndef NDEBUG
+ DEBUG3 (("check2\n")) ;
+ debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif /* NDEBUG */
+
+ /* === Kill all rows used to construct pivot row ==================== */
+
+ /* also kill pivot row, temporarily */
+ cp = &A [Col [pivot_col].start] ;
+ cp_end = cp + Col [pivot_col].length ;
+ while (cp < cp_end)
+ {
+ /* may be killing an already dead row */
+ row = *cp++ ;
+ DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
+ KILL_ROW (row) ;
+ }
+
+ /* === Select a row index to use as the new pivot row =============== */
+
+ pivot_row_length = pfree - pivot_row_start ;
+ if (pivot_row_length > 0)
+ {
+ /* pick the "pivot" row arbitrarily (first row in col) */
+ pivot_row = A [Col [pivot_col].start] ;
+ DEBUG3 (("Pivotal row is %d\n", pivot_row)) ;
+ }
+ else
+ {
+ /* there is no pivot row, since it is of zero length */
+ pivot_row = EMPTY ;
+ ASSERT (pivot_row_length == 0) ;
+ }
+ ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
+
+ /* === Approximate degree computation =============================== */
+
+ /* Here begins the computation of the approximate degree. The column */
+ /* score is the sum of the pivot row "length", plus the size of the */
+ /* set differences of each row in the column minus the pattern of the */
+ /* pivot row itself. The column ("thickness") itself is also */
+ /* excluded from the column score (we thus use an approximate */
+ /* external degree). */
+
+ /* The time taken by the following code (compute set differences, and */
+ /* add them up) is proportional to the size of the data structure */
+ /* being scanned - that is, the sum of the sizes of each column in */
+ /* the pivot row. Thus, the amortized time to compute a column score */
+ /* is proportional to the size of that column (where size, in this */
+ /* context, is the column "length", or the number of row indices */
+ /* in that column). The number of row indices in a column is */
+ /* monotonically non-decreasing, from the length of the original */
+ /* column on input to colamd. */
+
+ /* === Compute set differences ====================================== */
+
+ DEBUG3 (("** Computing set differences phase. **\n")) ;
+
+ /* pivot row is currently dead - it will be revived later. */
+
+ DEBUG3 (("Pivot row: ")) ;
+ /* for each column in pivot row */
+ rp = &A [pivot_row_start] ;
+ rp_end = rp + pivot_row_length ;
+ while (rp < rp_end)
+ {
+ col = *rp++ ;
+ ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+ DEBUG3 (("Col: %d\n", col)) ;
+
+ /* clear tags used to construct pivot row pattern */
+ col_thickness = -Col [col].shared1.thickness ;
+ ASSERT (col_thickness > 0) ;
+ Col [col].shared1.thickness = col_thickness ;
+
+ /* === Remove column from degree list =========================== */
+
+ cur_score = Col [col].shared2.score ;
+ prev_col = Col [col].shared3.prev ;
+ next_col = Col [col].shared4.degree_next ;
+ ASSERT (cur_score >= 0) ;
+ ASSERT (cur_score <= n_col) ;
+ ASSERT (cur_score >= EMPTY) ;
+ if (prev_col == EMPTY)
+ {
+ head [cur_score] = next_col ;
+ }
+ else
+ {
+ Col [prev_col].shared4.degree_next = next_col ;
+ }
+ if (next_col != EMPTY)
+ {
+ Col [next_col].shared3.prev = prev_col ;
+ }
+
+ /* === Scan the column ========================================== */
+
+ cp = &A [Col [col].start] ;
+ cp_end = cp + Col [col].length ;
+ while (cp < cp_end)
+ {
+ /* get a row */
+ row = *cp++ ;
+ row_mark = Row [row].shared2.mark ;
+ /* skip if dead */
+ if (ROW_IS_MARKED_DEAD (row_mark))
+ {
+ continue ;
+ }
+ ASSERT (row != pivot_row) ;
+ set_difference = row_mark - tag_mark ;
+ /* check if the row has been seen yet */
+ if (set_difference < 0)
+ {
+ ASSERT (Row [row].shared1.degree <= max_deg) ;
+ set_difference = Row [row].shared1.degree ;
+ }
+ /* subtract column thickness from this row's set difference */
+ set_difference -= col_thickness ;
+ ASSERT (set_difference >= 0) ;
+ /* absorb this row if the set difference becomes zero */
+ if (set_difference == 0)
+ {
+ DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
+ KILL_ROW (row) ;
+ }
+ else
+ {
+ /* save the new mark */
+ Row [row].shared2.mark = set_difference + tag_mark ;
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ debug_deg_lists (n_row, n_col, Row, Col, head,
+ min_score, n_col2-k-pivot_row_degree, max_deg) ;
+#endif /* NDEBUG */
+
+ /* === Add up set differences for each column ======================= */
+
+ DEBUG3 (("** Adding set differences phase. **\n")) ;
+
+ /* for each column in pivot row */
+ rp = &A [pivot_row_start] ;
+ rp_end = rp + pivot_row_length ;
+ while (rp < rp_end)
+ {
+ /* get a column */
+ col = *rp++ ;
+ ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+ hash = 0 ;
+ cur_score = 0 ;
+ cp = &A [Col [col].start] ;
+ /* compact the column */
+ new_cp = cp ;
+ cp_end = cp + Col [col].length ;
+
+ DEBUG4 (("Adding set diffs for Col: %d.\n", col)) ;
+
+ while (cp < cp_end)
+ {
+ /* get a row */
+ row = *cp++ ;
+ ASSERT(row >= 0 && row < n_row) ;
+ row_mark = Row [row].shared2.mark ;
+ /* skip if dead */
+ if (ROW_IS_MARKED_DEAD (row_mark))
+ {
+ continue ;
+ }
+ ASSERT (row_mark > tag_mark) ;
+ /* compact the column */
+ *new_cp++ = row ;
+ /* compute hash function */
+ hash += row ;
+ /* add set difference */
+ cur_score += row_mark - tag_mark ;
+ /* integer overflow... */
+ cur_score = MIN (cur_score, n_col) ;
+ }
+
+ /* recompute the column's length */
+ Col [col].length = (int) (new_cp - &A [Col [col].start]) ;
+
+ /* === Further mass elimination ================================= */
+
+ if (Col [col].length == 0)
+ {
+ DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
+ /* nothing left but the pivot row in this column */
+ KILL_PRINCIPAL_COL (col) ;
+ pivot_row_degree -= Col [col].shared1.thickness ;
+ ASSERT (pivot_row_degree >= 0) ;
+ /* order it */
+ Col [col].shared2.order = k ;
+ /* increment order count by column thickness */
+ k += Col [col].shared1.thickness ;
+ }
+ else
+ {
+ /* === Prepare for supercolumn detection ==================== */
+
+ DEBUG4 (("Preparing supercol detection for Col: %d.\n", col)) ;
+
+ /* save score so far */
+ Col [col].shared2.score = cur_score ;
+
+ /* add column to hash table, for supercolumn detection */
+ hash %= n_col + 1 ;
+
+ DEBUG4 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
+ ASSERT (hash <= n_col) ;
+
+ head_column = head [hash] ;
+ if (head_column > EMPTY)
+ {
+ /* degree list "hash" is non-empty, use prev (shared3) of */
+ /* first column in degree list as head of hash bucket */
+ first_col = Col [head_column].shared3.headhash ;
+ Col [head_column].shared3.headhash = col ;
+ }
+ else
+ {
+ /* degree list "hash" is empty, use head as hash bucket */
+ first_col = - (head_column + 2) ;
+ head [hash] = - (col + 2) ;
+ }
+ Col [col].shared4.hash_next = first_col ;
+
+ /* save hash function in Col [col].shared3.hash */
+ Col [col].shared3.hash = (int) hash ;
+ ASSERT (COL_IS_ALIVE (col)) ;
+ }
+ }
+
+ /* The approximate external column degree is now computed. */
+
+ /* === Supercolumn detection ======================================== */
+
+ DEBUG3 (("** Supercolumn detection phase. **\n")) ;
+
+ detect_super_cols (
+
+#ifndef NDEBUG
+ n_col, Row,
+#endif /* NDEBUG */
+
+ Col, A, head, pivot_row_start, pivot_row_length) ;
+
+ /* === Kill the pivotal column ====================================== */
+
+ KILL_PRINCIPAL_COL (pivot_col) ;
+
+ /* === Clear mark =================================================== */
+
+ tag_mark += (max_deg + 1) ;
+ if (tag_mark >= max_mark)
+ {
+ DEBUG2 (("clearing tag_mark\n")) ;
+ tag_mark = clear_mark (n_row, Row) ;
+ }
+
+#ifndef NDEBUG
+ DEBUG3 (("check3\n")) ;
+ debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif /* NDEBUG */
+
+ /* === Finalize the new pivot row, and column scores ================ */
+
+ DEBUG3 (("** Finalize scores phase. **\n")) ;
+
+ /* for each column in pivot row */
+ rp = &A [pivot_row_start] ;
+ /* compact the pivot row */
+ new_rp = rp ;
+ rp_end = rp + pivot_row_length ;
+ while (rp < rp_end)
+ {
+ col = *rp++ ;
+ /* skip dead columns */
+ if (COL_IS_DEAD (col))
+ {
+ continue ;
+ }
+ *new_rp++ = col ;
+ /* add new pivot row to column */
+ A [Col [col].start + (Col [col].length++)] = pivot_row ;
+
+ /* retrieve score so far and add on pivot row's degree. */
+ /* (we wait until here for this in case the pivot */
+ /* row's degree was reduced due to mass elimination). */
+ cur_score = Col [col].shared2.score + pivot_row_degree ;
+
+ /* calculate the max possible score as the number of */
+ /* external columns minus the 'k' value minus the */
+ /* columns thickness */
+ max_score = n_col - k - Col [col].shared1.thickness ;
+
+ /* make the score the external degree of the union-of-rows */
+ cur_score -= Col [col].shared1.thickness ;
+
+ /* make sure score is less or equal than the max score */
+ cur_score = MIN (cur_score, max_score) ;
+ ASSERT (cur_score >= 0) ;
+
+ /* store updated score */
+ Col [col].shared2.score = cur_score ;
+
+ /* === Place column back in degree list ========================= */
+
+ ASSERT (min_score >= 0) ;
+ ASSERT (min_score <= n_col) ;
+ ASSERT (cur_score >= 0) ;
+ ASSERT (cur_score <= n_col) ;
+ ASSERT (head [cur_score] >= EMPTY) ;
+ next_col = head [cur_score] ;
+ Col [col].shared4.degree_next = next_col ;
+ Col [col].shared3.prev = EMPTY ;
+ if (next_col != EMPTY)
+ {
+ Col [next_col].shared3.prev = col ;
+ }
+ head [cur_score] = col ;
+
+ /* see if this score is less than current min */
+ min_score = MIN (min_score, cur_score) ;
+
+ }
+
+#ifndef NDEBUG
+ debug_deg_lists (n_row, n_col, Row, Col, head,
+ min_score, n_col2-k, max_deg) ;
+#endif /* NDEBUG */
+
+ /* === Resurrect the new pivot row ================================== */
+
+ if (pivot_row_degree > 0)
+ {
+ /* update pivot row length to reflect any cols that were killed */
+ /* during super-col detection and mass elimination */
+ Row [pivot_row].start = pivot_row_start ;
+ Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ;
+ Row [pivot_row].shared1.degree = pivot_row_degree ;
+ Row [pivot_row].shared2.mark = 0 ;
+ /* pivot row is no longer dead */
+ }
+ }
+
+ /* === All principal columns have now been ordered ====================== */
+
+ return (ngarbage) ;
+}
+
+
+/* ========================================================================== */
+/* === order_children ======================================================= */
+/* ========================================================================== */
+
+/*
+ The find_ordering routine has ordered all of the principal columns (the
+ representatives of the supercolumns). The non-principal columns have not
+ yet been ordered. This routine orders those columns by walking up the
+ parent tree (a column is a child of the column which absorbed it). The
+ final permutation vector is then placed in p [0 ... n_col-1], with p [0]
+ being the first column, and p [n_col-1] being the last. Although it may
+ not look like it at first glance, be assured that this routine takes time
+ linear in the number of columns, that is, O (n_col). Not user-callable.
+*/
+
+PRIVATE void order_children
+(
+ /* === Parameters ======================================================= */
+
+ int n_col, /* number of columns of A */
+ Colamd_Col Col [], /* of size n_col+1 */
+ int p [] /* p [0 ... n_col-1] is the column permutation*/
+)
+{
+ /* === Local variables ================================================== */
+
+ int i ; /* loop counter for all columns */
+ int c ; /* column index */
+ int parent ; /* index of column's parent */
+ int order ; /* column's order */
+
+ /* === Order each non-principal column ================================== */
+
+ for (i = 0 ; i < n_col ; i++)
+ {
+ /* find an un-ordered non-principal column */
+ ASSERT (COL_IS_DEAD (i)) ;
+ if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY)
+ {
+ parent = i ;
+ /* once found, find its principal parent */
+ do
+ {
+ parent = Col [parent].shared1.parent ;
+ } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+
+ /* now, order all un-ordered non-principal columns along path */
+ /* to this parent. collapse tree at the same time */
+ c = i ;
+ /* get order of parent */
+ order = Col [parent].shared2.order ;
+
+ do
+ {
+ ASSERT (Col [c].shared2.order == EMPTY) ;
+
+ /* order this column */
+ Col [c].shared2.order = order++ ;
+ /* collapse the tree: point this column straight at the */
+ /* principal parent (path compression) */
+ Col [c].shared1.parent = parent ;
+
+ /* get immediate parent of this column */
+ c = Col [c].shared1.parent ;
+
+ /* continue until we hit an ordered column. There are */
+ /* guaranteed not to be any more unordered columns */
+ /* above an ordered column */
+ } while (Col [c].shared2.order == EMPTY) ;
+
+ /* re-order the super_col parent to largest order for this group */
+ Col [parent].shared2.order = order ;
+ }
+ }
+
+ /* === Generate the permutation ========================================= */
+
+ for (c = 0 ; c < n_col ; c++)
+ {
+ p [Col [c].shared2.order] = c ;
+ }
+}
+
+
+/* ========================================================================== */
+/* === detect_super_cols ==================================================== */
+/* ========================================================================== */
+
+/*
+ Detects supercolumns by finding matches between columns in the hash buckets.
+ Check amongst columns in the set A [row_start ... row_start + row_length-1].
+ The columns under consideration are currently *not* in the degree lists,
+ and have already been placed in the hash buckets.
+
+ The hash bucket for columns whose hash function is equal to h is stored
+ as follows:
+
+ if head [h] is >= 0, then head [h] contains a degree list, so:
+
+ head [h] is the first column in degree bucket h.
+ Col [head [h]].headhash gives the first column in hash bucket h.
+
+ otherwise, the degree list is empty, and:
+
+ -(head [h] + 2) is the first column in hash bucket h.
+
+ For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
+ column" pointer. Col [c].shared3.hash is used instead as the hash number
+ for that column. The value of Col [c].shared4.hash_next is the next column
+ in the same hash bucket.
+
+ Assuming no, or "few" hash collisions, the time taken by this routine is
+ linear in the sum of the sizes (lengths) of each column whose score has
+ just been computed in the approximate degree computation.
+ Not user-callable.
+*/
+
+PRIVATE void detect_super_cols
+(
+ /* === Parameters ======================================================= */
+
+#ifndef NDEBUG
+ /* these two parameters are only needed when debugging is enabled: */
+ int n_col, /* number of columns of A */
+ Colamd_Row Row [], /* of size n_row+1 */
+#endif /* NDEBUG */
+
+ Colamd_Col Col [], /* of size n_col+1 */
+ int A [], /* row indices of A */
+ int head [], /* head of degree lists and hash buckets */
+ int row_start, /* pointer to set of columns to check */
+ int row_length /* number of columns to check */
+)
+{
+ /* === Local variables ================================================== */
+
+ int hash ; /* hash value for a column */
+ int *rp ; /* pointer to a row */
+ int c ; /* a column index */
+ int super_c ; /* column index of the column to absorb into */
+ int *cp1 ; /* column pointer for column super_c */
+ int *cp2 ; /* column pointer for column c */
+ int length ; /* length of column super_c */
+ int prev_c ; /* column preceding c in hash bucket */
+ int i ; /* loop counter */
+ int *rp_end ; /* pointer to the end of the row */
+ int col ; /* a column index in the row to check */
+ int head_column ; /* first column in hash bucket or degree list */
+ int first_col ; /* first column in hash bucket */
+
+ /* === Consider each column in the row ================================== */
+
+ rp = &A [row_start] ;
+ rp_end = rp + row_length ;
+ while (rp < rp_end)
+ {
+ col = *rp++ ;
+ if (COL_IS_DEAD (col))
+ {
+ continue ;
+ }
+
+ /* get hash number for this column */
+ hash = Col [col].shared3.hash ;
+ ASSERT (hash <= n_col) ;
+
+ /* === Get the first column in this hash bucket ===================== */
+
+ head_column = head [hash] ;
+ if (head_column > EMPTY)
+ {
+ /* bucket shares slot with a non-empty degree list (see header) */
+ first_col = Col [head_column].shared3.headhash ;
+ }
+ else
+ {
+ /* bucket stored directly in head [hash], offset-encoded */
+ first_col = - (head_column + 2) ;
+ }
+
+ /* === Consider each column in the hash bucket ====================== */
+
+ for (super_c = first_col ; super_c != EMPTY ;
+ super_c = Col [super_c].shared4.hash_next)
+ {
+ ASSERT (COL_IS_ALIVE (super_c)) ;
+ ASSERT (Col [super_c].shared3.hash == hash) ;
+ length = Col [super_c].length ;
+
+ /* prev_c is the column preceding column c in the hash bucket */
+ prev_c = super_c ;
+
+ /* === Compare super_c with all columns after it ================ */
+
+ for (c = Col [super_c].shared4.hash_next ;
+ c != EMPTY ; c = Col [c].shared4.hash_next)
+ {
+ ASSERT (c != super_c) ;
+ ASSERT (COL_IS_ALIVE (c)) ;
+ ASSERT (Col [c].shared3.hash == hash) ;
+
+ /* cheap filter first: not identical if lengths or scores differ */
+ if (Col [c].length != length ||
+ Col [c].shared2.score != Col [super_c].shared2.score)
+ {
+ prev_c = c ;
+ continue ;
+ }
+
+ /* compare the two columns entry by entry */
+ cp1 = &A [Col [super_c].start] ;
+ cp2 = &A [Col [c].start] ;
+
+ for (i = 0 ; i < length ; i++)
+ {
+ /* the columns are "clean" (no dead rows) */
+ ASSERT (ROW_IS_ALIVE (*cp1)) ;
+ ASSERT (ROW_IS_ALIVE (*cp2)) ;
+ /* row indices appear in the same order in both supercols, */
+ /* so no gather/scatter is necessary */
+ if (*cp1++ != *cp2++)
+ {
+ break ;
+ }
+ }
+
+ /* the two columns are different if the for-loop "broke" */
+ if (i != length)
+ {
+ prev_c = c ;
+ continue ;
+ }
+
+ /* === Got it! two columns are identical =================== */
+
+ ASSERT (Col [c].shared2.score == Col [super_c].shared2.score) ;
+
+ /* absorb c into super_c: merge thickness, record parent */
+ Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
+ Col [c].shared1.parent = super_c ;
+ KILL_NON_PRINCIPAL_COL (c) ;
+ /* order c later, in order_children() */
+ Col [c].shared2.order = EMPTY ;
+ /* remove c from hash bucket */
+ Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
+ }
+ }
+
+ /* === Empty this hash bucket ======================================= */
+
+ if (head_column > EMPTY)
+ {
+ /* corresponding degree list "hash" is not empty */
+ Col [head_column].shared3.headhash = EMPTY ;
+ }
+ else
+ {
+ /* corresponding degree list "hash" is empty */
+ head [hash] = EMPTY ;
+ }
+ }
+}
+
+
+/* ========================================================================== */
+/* === garbage_collection =================================================== */
+/* ========================================================================== */
+
+/*
+ Defragments and compacts columns and rows in the workspace A. Used when
+ all available memory has been used while performing row merging. Returns
+ the index of the first free position in A, after garbage collection. The
+ time taken by this routine is linear in the size of the array A, which is
+ itself linear in the number of nonzeros in the input matrix.
+ Not user-callable.
+*/
+
+PRIVATE int garbage_collection /* returns the new value of pfree */
+(
+ /* === Parameters ======================================================= */
+
+ int n_row, /* number of rows */
+ int n_col, /* number of columns */
+ Colamd_Row Row [], /* row info */
+ Colamd_Col Col [], /* column info */
+ int A [], /* A [0 ... Alen-1] holds the matrix */
+ int *pfree /* &A [0] ... pfree is in use */
+)
+{
+ /* === Local variables ================================================== */
+
+ int *psrc ; /* source pointer */
+ int *pdest ; /* destination pointer */
+ int j ; /* counter */
+ int r ; /* a row index */
+ int c ; /* a column index */
+ int length ; /* length of a row or column */
+
+#ifndef NDEBUG
+ int debug_rows ;
+ DEBUG2 (("Defrag..\n")) ;
+ for (psrc = &A[0] ; psrc < pfree ; psrc++) ASSERT (*psrc >= 0) ;
+ debug_rows = 0 ;
+#endif /* NDEBUG */
+
+ /* === Defragment the columns =========================================== */
+
+ pdest = &A[0] ;
+ for (c = 0 ; c < n_col ; c++)
+ {
+ if (COL_IS_ALIVE (c))
+ {
+ psrc = &A [Col [c].start] ;
+
+ /* move and compact the column, dropping dead row indices */
+ ASSERT (pdest <= psrc) ;
+ Col [c].start = (int) (pdest - &A [0]) ;
+ length = Col [c].length ;
+ for (j = 0 ; j < length ; j++)
+ {
+ r = *psrc++ ;
+ if (ROW_IS_ALIVE (r))
+ {
+ *pdest++ = r ;
+ }
+ }
+ Col [c].length = (int) (pdest - &A [Col [c].start]) ;
+ }
+ }
+
+ /* === Prepare to defragment the rows =================================== */
+
+ for (r = 0 ; r < n_row ; r++)
+ {
+ if (ROW_IS_ALIVE (r))
+ {
+ if (Row [r].length == 0)
+ {
+ /* this row is of zero length. cannot compact it, so kill it */
+ DEBUG3 (("Defrag row kill\n")) ;
+ KILL_ROW (r) ;
+ }
+ else
+ {
+ /* save first column index in Row [r].shared2.first_column */
+ psrc = &A [Row [r].start] ;
+ Row [r].shared2.first_column = *psrc ;
+ ASSERT (ROW_IS_ALIVE (r)) ;
+ /* flag the start of the row with the one's complement of row */
+ /* (a negative value, so the scan below can find row starts) */
+ *psrc = ONES_COMPLEMENT (r) ;
+
+#ifndef NDEBUG
+ debug_rows++ ;
+#endif /* NDEBUG */
+
+ }
+ }
+ }
+
+ /* === Defragment the rows ============================================== */
+
+ psrc = pdest ;
+ while (psrc < pfree)
+ {
+ /* find a negative number ... the start of a row */
+ if (*psrc++ < 0)
+ {
+ psrc-- ;
+ /* get the row index */
+ r = ONES_COMPLEMENT (*psrc) ;
+ ASSERT (r >= 0 && r < n_row) ;
+ /* restore first column index */
+ *psrc = Row [r].shared2.first_column ;
+ ASSERT (ROW_IS_ALIVE (r)) ;
+
+ /* move and compact the row, dropping dead column indices */
+ ASSERT (pdest <= psrc) ;
+ Row [r].start = (int) (pdest - &A [0]) ;
+ length = Row [r].length ;
+ for (j = 0 ; j < length ; j++)
+ {
+ c = *psrc++ ;
+ if (COL_IS_ALIVE (c))
+ {
+ *pdest++ = c ;
+ }
+ }
+ Row [r].length = (int) (pdest - &A [Row [r].start]) ;
+
+#ifndef NDEBUG
+ debug_rows-- ;
+#endif /* NDEBUG */
+
+ }
+ }
+ /* ensure we found all the rows */
+ /* NOTE(review): debug_rows is declared only under #ifndef NDEBUG; this */
+ /* statement presumably relies on ASSERT expanding to a no-op when */
+ /* NDEBUG is defined -- confirm against the ASSERT macro definition */
+ ASSERT (debug_rows == 0) ;
+
+ /* === Return the new value of pfree ==================================== */
+
+ return ((int) (pdest - &A [0])) ;
+}
+
+
+/* ========================================================================== */
+/* === clear_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+ Clears the Row [].shared2.mark array, and returns the new tag_mark.
+ Return value is the new tag_mark. Not user-callable.
+*/
+
+PRIVATE int clear_mark /* return the new value for tag_mark */
+(
+ /* === Parameters ======================================================= */
+
+ int n_row, /* number of rows in A */
+ Colamd_Row Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
+)
+{
+ /* === Local variables ================================================== */
+
+ int r ;
+
+ for (r = 0 ; r < n_row ; r++)
+ {
+ if (ROW_IS_ALIVE (r))
+ {
+ /* only live rows are cleared; dead rows keep their mark */
+ Row [r].shared2.mark = 0 ;
+ }
+ }
+ /* every live mark is now zero, so 1 is a valid new tag_mark */
+ return (1) ;
+}
+
+
+/* ========================================================================== */
+/* === print_report ========================================================= */
+/* ========================================================================== */
+
+PRIVATE void print_report
+(
+ char *method, /* name to prefix each message with ("colamd"/"symamd") */
+ int stats [COLAMD_STATS]
+)
+{
+ /* Print a human-readable summary of a colamd/symamd stats array: */
+ /* overall status, then either work counts (on success) or the */
+ /* details of the specific error encountered. */
+
+ int i1, i2, i3 ;
+
+ if (!stats)
+ {
+ PRINTF ("%s: No statistics available.\n", method) ;
+ return ;
+ }
+
+ i1 = stats [COLAMD_INFO1] ;
+ i2 = stats [COLAMD_INFO2] ;
+ i3 = stats [COLAMD_INFO3] ;
+
+ if (stats [COLAMD_STATUS] >= 0)
+ {
+ PRINTF ("%s: OK. ", method) ;
+ }
+ else
+ {
+ PRINTF ("%s: ERROR. ", method) ;
+ }
+
+ switch (stats [COLAMD_STATUS])
+ {
+
+ case COLAMD_OK_BUT_JUMBLED:
+
+ PRINTF ("Matrix has unsorted or duplicate row indices.\n") ;
+
+ PRINTF ("%s: number of duplicate or out-of-order row indices: %d\n",
+ method, i3) ;
+
+ PRINTF ("%s: last seen duplicate or out-of-order row index: %d\n",
+ method, INDEX (i2)) ;
+
+ PRINTF ("%s: last seen in column: %d",
+ method, INDEX (i1)) ;
+
+ /* no break - fall through to next case instead */
+
+ case COLAMD_OK:
+
+ PRINTF ("\n") ;
+
+ PRINTF ("%s: number of dense or empty rows ignored: %d\n",
+ method, stats [COLAMD_DENSE_ROW]) ;
+
+ PRINTF ("%s: number of dense or empty columns ignored: %d\n",
+ method, stats [COLAMD_DENSE_COL]) ;
+
+ PRINTF ("%s: number of garbage collections performed: %d\n",
+ method, stats [COLAMD_DEFRAG_COUNT]) ;
+ break ;
+
+ case COLAMD_ERROR_A_not_present:
+
+ PRINTF ("Array A (row indices of matrix) not present.\n") ;
+ break ;
+
+ case COLAMD_ERROR_p_not_present:
+
+ PRINTF ("Array p (column pointers for matrix) not present.\n") ;
+ break ;
+
+ case COLAMD_ERROR_nrow_negative:
+
+ PRINTF ("Invalid number of rows (%d).\n", i1) ;
+ break ;
+
+ case COLAMD_ERROR_ncol_negative:
+
+ PRINTF ("Invalid number of columns (%d).\n", i1) ;
+ break ;
+
+ case COLAMD_ERROR_nnz_negative:
+
+ PRINTF ("Invalid number of nonzero entries (%d).\n", i1) ;
+ break ;
+
+ case COLAMD_ERROR_p0_nonzero:
+
+ PRINTF ("Invalid column pointer, p [0] = %d, must be zero.\n", i1) ;
+ break ;
+
+ case COLAMD_ERROR_A_too_small:
+
+ PRINTF ("Array A too small.\n") ;
+ PRINTF (" Need Alen >= %d, but given only Alen = %d.\n",
+ i1, i2) ;
+ break ;
+
+ case COLAMD_ERROR_col_length_negative:
+
+ PRINTF
+ ("Column %d has a negative number of nonzero entries (%d).\n",
+ INDEX (i1), i2) ;
+ break ;
+
+ case COLAMD_ERROR_row_index_out_of_bounds:
+
+ PRINTF
+ ("Row index (row %d) out of bounds (%d to %d) in column %d.\n",
+ INDEX (i2), INDEX (0), INDEX (i3-1), INDEX (i1)) ;
+ break ;
+
+ case COLAMD_ERROR_out_of_memory:
+
+ PRINTF ("Out of memory.\n") ;
+ break ;
+
+ case COLAMD_ERROR_internal_error:
+
+ /* if this happens, there is a bug in the code */
+ PRINTF
+ ("Internal error! Please contact authors (davis at cise.ufl.edu).\n") ;
+ break ;
+ }
+}
+
+
+
+
+/* ========================================================================== */
+/* === colamd debugging routines ============================================ */
+/* ========================================================================== */
+
+/* When debugging is disabled, the remainder of this file is ignored. */
+
+#ifndef NDEBUG
+
+
+/* ========================================================================== */
+/* === debug_structures ===================================================== */
+/* ========================================================================== */
+
+/*
+ At this point, all empty rows and columns are dead. All live columns
+ are "clean" (containing no dead rows) and simplicial (no supercolumns
+ yet). Rows may contain dead columns, but all live rows contain at
+ least one live column.
+*/
+
+PRIVATE void debug_structures
+(
+ /* === Parameters ======================================================= */
+
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int A [],
+ int n_col2
+)
+{
+ /* === Local variables ================================================== */
+
+ int i ;
+ int c ;
+ int *cp ;
+ int *cp_end ;
+ int len ;
+ int score ;
+ int r ;
+ int *rp ;
+ int *rp_end ;
+ int deg ;
+
+ /* === Check A, Row, and Col ============================================ */
+
+ for (c = 0 ; c < n_col ; c++)
+ {
+ if (COL_IS_ALIVE (c))
+ {
+ /* live columns must be non-empty, simplicial, and clean */
+ len = Col [c].length ;
+ score = Col [c].shared2.score ;
+ DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ;
+ ASSERT (len > 0) ;
+ ASSERT (score >= 0) ;
+ ASSERT (Col [c].shared1.thickness == 1) ;
+ cp = &A [Col [c].start] ;
+ cp_end = cp + len ;
+ while (cp < cp_end)
+ {
+ r = *cp++ ;
+ ASSERT (ROW_IS_ALIVE (r)) ;
+ }
+ }
+ else
+ {
+ /* dead columns must already be ordered, in [n_col2, n_col) */
+ i = Col [c].shared2.order ;
+ ASSERT (i >= n_col2 && i < n_col) ;
+ }
+ }
+
+ for (r = 0 ; r < n_row ; r++)
+ {
+ if (ROW_IS_ALIVE (r))
+ {
+ /* i counts the live columns seen in row r */
+ i = 0 ;
+ len = Row [r].length ;
+ deg = Row [r].shared1.degree ;
+ ASSERT (len > 0) ;
+ ASSERT (deg > 0) ;
+ rp = &A [Row [r].start] ;
+ rp_end = rp + len ;
+ while (rp < rp_end)
+ {
+ c = *rp++ ;
+ if (COL_IS_ALIVE (c))
+ {
+ i++ ;
+ }
+ }
+ /* every live row must contain at least one live column */
+ ASSERT (i > 0) ;
+ }
+ }
+}
+
+
+/* ========================================================================== */
+/* === debug_deg_lists ====================================================== */
+/* ========================================================================== */
+
+/*
+ Prints the contents of the degree lists. Counts the number of columns
+ in the degree list and compares it to the total it should have. Also
+ checks the row degrees.
+*/
+
+PRIVATE void debug_deg_lists
+(
+ /* === Parameters ======================================================= */
+
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int head [],
+ int min_score,
+ int should, /* expected total thickness over all degree lists */
+ int max_deg
+)
+{
+ /* === Local variables ================================================== */
+
+ int deg ;
+ int col ;
+ int have ; /* total thickness actually found in the lists */
+ int row ;
+
+ /* === Check the degree lists =========================================== */
+
+ /* skip this (expensive) check for large problems unless the debug */
+ /* printing level has been raised */
+ if (n_col > 10000 && colamd_debug <= 0)
+ {
+ return ;
+ }
+ have = 0 ;
+ DEBUG4 (("Degree lists: %d\n", min_score)) ;
+ for (deg = 0 ; deg <= n_col ; deg++)
+ {
+ col = head [deg] ;
+ if (col == EMPTY)
+ {
+ continue ;
+ }
+ DEBUG4 (("%d:", deg)) ;
+ while (col != EMPTY)
+ {
+ DEBUG4 ((" %d", col)) ;
+ have += Col [col].shared1.thickness ;
+ ASSERT (COL_IS_ALIVE (col)) ;
+ col = Col [col].shared4.degree_next ;
+ }
+ DEBUG4 (("\n")) ;
+ }
+ DEBUG4 (("should %d have %d\n", should, have)) ;
+ ASSERT (should == have) ;
+
+ /* === Check the row degrees ============================================ */
+
+ if (n_row > 10000 && colamd_debug <= 0)
+ {
+ return ;
+ }
+ for (row = 0 ; row < n_row ; row++)
+ {
+ if (ROW_IS_ALIVE (row))
+ {
+ ASSERT (Row [row].shared1.degree <= max_deg) ;
+ }
+ }
+}
+
+
+/* ========================================================================== */
+/* === debug_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+ Ensures that the tag_mark is less than the maximum and also ensures that
+ each entry in the mark array is less than the tag mark.
+*/
+
+PRIVATE void debug_mark
+(
+ /* === Parameters ======================================================= */
+
+ int n_row,
+ Colamd_Row Row [],
+ int tag_mark,
+ int max_mark
+)
+{
+ /* === Local variables ================================================== */
+
+ int r ;
+
+ /* === Check the Row marks ============================================== */
+
+ ASSERT (tag_mark > 0 && tag_mark <= max_mark) ;
+ /* skip the per-row scan for large problems unless debugging is raised */
+ if (n_row > 10000 && colamd_debug <= 0)
+ {
+ return ;
+ }
+ /* note: every row is checked, whether alive or dead */
+ for (r = 0 ; r < n_row ; r++)
+ {
+ ASSERT (Row [r].shared2.mark < tag_mark) ;
+ }
+}
+
+
+/* ========================================================================== */
+/* === debug_matrix ========================================================= */
+/* ========================================================================== */
+
+/*
+ Prints out the contents of the columns and the rows.
+*/
+
+PRIVATE void debug_matrix
+(
+ /* === Parameters ======================================================= */
+
+ int n_row,
+ int n_col,
+ Colamd_Row Row [],
+ Colamd_Col Col [],
+ int A []
+)
+{
+ /* === Local variables ================================================== */
+
+ int r ;
+ int c ;
+ int *rp ;
+ int *rp_end ;
+ int *cp ;
+ int *cp_end ;
+
+ /* === Dump the rows and columns of the matrix ========================== */
+
+ /* only dump at debug level 3 or higher */
+ if (colamd_debug < 3)
+ {
+ return ;
+ }
+ DEBUG3 (("DUMP MATRIX:\n")) ;
+ for (r = 0 ; r < n_row ; r++)
+ {
+ DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ;
+ if (ROW_IS_DEAD (r))
+ {
+ continue ;
+ }
+ DEBUG3 (("start %d length %d degree %d\n",
+ Row [r].start, Row [r].length, Row [r].shared1.degree)) ;
+ rp = &A [Row [r].start] ;
+ rp_end = rp + Row [r].length ;
+ while (rp < rp_end)
+ {
+ c = *rp++ ;
+ DEBUG4 ((" %d col %d\n", COL_IS_ALIVE (c), c)) ;
+ }
+ }
+
+ for (c = 0 ; c < n_col ; c++)
+ {
+ DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ;
+ if (COL_IS_DEAD (c))
+ {
+ continue ;
+ }
+ DEBUG3 (("start %d length %d shared1 %d shared2 %d\n",
+ Col [c].start, Col [c].length,
+ Col [c].shared1.thickness, Col [c].shared2.score)) ;
+ cp = &A [Col [c].start] ;
+ cp_end = cp + Col [c].length ;
+ while (cp < cp_end)
+ {
+ r = *cp++ ;
+ DEBUG4 ((" %d row %d\n", ROW_IS_ALIVE (r), r)) ;
+ }
+ }
+}
+
+PRIVATE void colamd_get_debug
+(
+ char *method /* name used in the banner message */
+)
+{
+ /* Initialize colamd_debug from the "D" environment variable. */
+ colamd_debug = 0 ; /* no debug printing */
+
+ /* get "D" environment variable, which gives the debug printing level */
+ if (getenv ("D"))
+ {
+ colamd_debug = atoi (getenv ("D")) ;
+ }
+
+ DEBUG0 (("%s: debug version, D = %d (THIS WILL BE SLOW!)\n",
+ method, colamd_debug)) ;
+}
+
+#endif /* NDEBUG */
+
diff --git a/SRC/colamd.h b/SRC/colamd.h
new file mode 100644
index 0000000..03fc3bd
--- /dev/null
+++ b/SRC/colamd.h
@@ -0,0 +1,259 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file colamd.h
+ \brief Colamd prototypes and definitions
+
+ <pre>
+ ==========================================================================
+ === colamd/symamd prototypes and definitions =============================
+ ==========================================================================
+
+ You must include this file (colamd.h) in any routine that uses colamd,
+ symamd, or the related macros and definitions.
+
+ Authors:
+
+ The authors of the code itself are Stefan I. Larimore and Timothy A.
+ Davis (davis@cise.ufl.edu), University of Florida. The algorithm was
+ developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+ Ng, Oak Ridge National Laboratory.
+
+ Date:
+
+ September 8, 2003. Version 2.3.
+
+ Acknowledgements:
+
+ This work was supported by the National Science Foundation, under
+ grants DMS-9504974 and DMS-9803599.
+
+ Notice:
+
+ Copyright (c) 1998-2003 by the University of Florida.
+ All Rights Reserved.
+
+ THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
+
+ Permission is hereby granted to use, copy, modify, and/or distribute
+ this program, provided that the Copyright, this License, and the
+ Availability of the original version is retained on all copies and made
+ accessible to the end-user of any code or package that includes COLAMD
+ or any modified version of COLAMD.
+
+ Availability:
+
+ The colamd/symamd library is available at
+
+ http://www.cise.ufl.edu/research/sparse/colamd/
+
+ This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
+ file. It is required by the colamd.c, colamdmex.c, and symamdmex.c
+ files, and by any C code that calls the routines whose prototypes are
+ listed below, or that uses the colamd/symamd definitions listed below.
+ </pre>
+*/
+
+#ifndef COLAMD_H
+#define COLAMD_H
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+#include <stdlib.h>
+
+/* ========================================================================== */
+/* === Knob and statistics definitions ====================================== */
+/* ========================================================================== */
+
+/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */
+#define COLAMD_KNOBS 20
+
+/* number of output statistics. Only stats [0..6] are currently used. */
+#define COLAMD_STATS 20
+
+/* knobs [0] and stats [0]: dense row knob and output statistic. */
+#define COLAMD_DENSE_ROW 0
+
+/* knobs [1] and stats [1]: dense column knob and output statistic. */
+#define COLAMD_DENSE_COL 1
+
+/* stats [2]: memory defragmentation count output statistic */
+#define COLAMD_DEFRAG_COUNT 2
+
+/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */
+#define COLAMD_STATUS 3
+
+/* stats [4..6]: error info, or info on jumbled columns */
+#define COLAMD_INFO1 4
+#define COLAMD_INFO2 5
+#define COLAMD_INFO3 6
+
+/* error codes returned in stats [3]: */
+#define COLAMD_OK (0)
+#define COLAMD_OK_BUT_JUMBLED (1)
+#define COLAMD_ERROR_A_not_present (-1)
+#define COLAMD_ERROR_p_not_present (-2)
+#define COLAMD_ERROR_nrow_negative (-3)
+#define COLAMD_ERROR_ncol_negative (-4)
+#define COLAMD_ERROR_nnz_negative (-5)
+#define COLAMD_ERROR_p0_nonzero (-6)
+#define COLAMD_ERROR_A_too_small (-7)
+#define COLAMD_ERROR_col_length_negative (-8)
+#define COLAMD_ERROR_row_index_out_of_bounds (-9)
+#define COLAMD_ERROR_out_of_memory (-10)
+#define COLAMD_ERROR_internal_error (-999)
+
+/* ========================================================================== */
+/* === Row and Column structures ============================================ */
+/* ========================================================================== */
+
+/* User code that makes use of the colamd/symamd routines need not directly */
+/* reference these structures. They are used only for the COLAMD_RECOMMENDED */
+/* macro. */
+
+typedef struct Colamd_Col_struct
+{
+ int start ; /* index for A of first row in this column, or DEAD */
+ /* if column is dead */
+ int length ; /* number of rows in this column */
+ union
+ {
+ int thickness ; /* number of original columns represented by this */
+ /* col, if the column is alive */
+ int parent ; /* parent in parent tree super-column structure, if */
+ /* the column is dead */
+ } shared1 ;
+ union
+ {
+ int score ; /* the score used to maintain heap, if col is alive */
+ int order ; /* pivot ordering of this column, if col is dead */
+ } shared2 ;
+ union
+ {
+ int headhash ; /* head of a hash bucket, if col is at the head of */
+ /* a degree list */
+ int hash ; /* hash value, if col is not in a degree list */
+ int prev ; /* previous column in degree list, if col is in a */
+ /* degree list (but not at the head of a degree list) */
+ } shared3 ;
+ union
+ {
+ int degree_next ; /* next column, if col is in a degree list */
+ int hash_next ; /* next column, if col is in a hash list */
+ } shared4 ;
+
+} Colamd_Col ;
+
+typedef struct Colamd_Row_struct
+{
+ int start ; /* index for A of first col in this row */
+ int length ; /* number of principal columns in this row */
+ union
+ {
+ int degree ; /* number of principal & non-principal columns in row */
+ int p ; /* used as a row pointer in init_rows_cols () */
+ } shared1 ;
+ union
+ {
+ int mark ; /* for computing set differences and marking dead rows*/
+ int first_column ;/* first column in row (used in garbage collection) */
+ } shared2 ;
+
+} Colamd_Row ;
+
+/* ========================================================================== */
+/* === Colamd recommended memory size ======================================= */
+/* ========================================================================== */
+
+/*
+ The recommended length Alen of the array A passed to colamd is given by
+ the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro. It returns -1 if any
+ argument is negative. 2*nnz space is required for the row and column
+ indices of the matrix. COLAMD_C (n_col) + COLAMD_R (n_row) space is
+ required for the Col and Row arrays, respectively, which are internal to
+ colamd. An additional n_col space is the minimal amount of "elbow room",
+ and nnz/5 more space is recommended for run time efficiency.
+
+ This macro is not needed when using symamd.
+
+ Explicit typecast to int added Sept. 23, 2002, COLAMD version 2.2, to avoid
+ gcc -pedantic warning messages.
+*/
+
+#define COLAMD_C(n_col) ((int) (((n_col) + 1) * sizeof (Colamd_Col) / sizeof (int)))
+#define COLAMD_R(n_row) ((int) (((n_row) + 1) * sizeof (Colamd_Row) / sizeof (int)))
+
+#define COLAMD_RECOMMENDED(nnz, n_row, n_col) \
+( \
+((nnz) < 0 || (n_row) < 0 || (n_col) < 0) \
+? \
+ (-1) \
+: \
+ (2 * (nnz) + COLAMD_C (n_col) + COLAMD_R (n_row) + (n_col) + ((nnz) / 5)) \
+)
+
+/* ========================================================================== */
+/* === Prototypes of user-callable routines ================================= */
+/* ========================================================================== */
+
+int colamd_recommended /* returns recommended value of Alen, */
+ /* or (-1) if input arguments are erroneous */
+(
+ int nnz, /* nonzeros in A */
+ int n_row, /* number of rows in A */
+ int n_col /* number of columns in A */
+) ;
+
+void colamd_set_defaults /* sets default parameters */
+( /* knobs argument is modified on output */
+ double knobs [COLAMD_KNOBS] /* parameter settings for colamd */
+) ;
+
+int colamd /* returns (1) if successful, (0) otherwise*/
+( /* A and p arguments are modified on output */
+ int n_row, /* number of rows in A */
+ int n_col, /* number of columns in A */
+ int Alen, /* size of the array A */
+ int A [], /* row indices of A, of size Alen */
+ int p [], /* column pointers of A, of size n_col+1 */
+ double knobs [COLAMD_KNOBS],/* parameter settings for colamd */
+ int stats [COLAMD_STATS] /* colamd output statistics and error codes */
+) ;
+
+int symamd /* return (1) if OK, (0) otherwise */
+(
+ int n, /* number of rows and columns of A */
+ int A [], /* row indices of A */
+ int p [], /* column pointers of A */
+ int perm [], /* output permutation, size n_col+1 */
+ double knobs [COLAMD_KNOBS], /* parameters (uses defaults if NULL) */
+ int stats [COLAMD_STATS], /* output statistics and error codes */
+ void * (*allocate) (size_t, size_t),
+ /* pointer to calloc (ANSI C) or */
+ /* mxCalloc (for MATLAB mexFunction) */
+ void (*release) (void *)
+ /* pointer to free (ANSI C) or */
+ /* mxFree (for MATLAB mexFunction) */
+) ;
+
+void colamd_report
+(
+ int stats [COLAMD_STATS]
+) ;
+
+void symamd_report
+(
+ int stats [COLAMD_STATS]
+) ;
+
+#endif /* COLAMD_H */
diff --git a/SRC/dSchCompUdt-2Ddynamic.c b/SRC/dSchCompUdt-2Ddynamic.c
index 360861f..38b8f11 100644
--- a/SRC/dSchCompUdt-2Ddynamic.c
+++ b/SRC/dSchCompUdt-2Ddynamic.c
@@ -16,29 +16,46 @@ at the top-level directory.
* Uses 2D partitioning for the scatter phase.
*
* <pre>
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 1, 2014
*
+ * Modified: September 14, 2017
+ * - First gather U-panel, then depending on "ldu" (excluding leading zeros),
+ * gather only trailing columns of the L-panel corresponding to the nonzero
+ * of U-rows.
+ * - Padding zeros for nice dimensions of GEMM.
+ *
*/
#define SCHEDULE_STRATEGY guided
-double tt_start;
-double tt_end;
+
+/*
+ * Buffers:
+ * [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
+ * (A matrix in C := A*B )
+ * bigU : stores the U-panel (B matrix in C := A*B)
+ * bigV : stores the block GEMM result (C matrix in C := A*B)
+ */
if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
int temp_nbrow; /* nonzero rows in current block L(i,k) */
lptr = lptr0;
luptr = luptr0;
- /**
+ int Lnbrow, Rnbrow; /* number of nonzero rows in look-ahead window,
+ and remaining part. */
+
+ /*******************************************************************
* Seperating L blocks into the top part within look-ahead window
* and the remaining ones.
- */
+ *******************************************************************/
+
int lookAheadBlk=0, RemainBlk=0;
tt_start = SuperLU_timer_();
+ /* Sherry -- can this loop be threaded?? */
/* Loop through all blocks in L(:,k) to set up pointers to the start
* of each block in the data arrays.
* - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
@@ -47,36 +64,36 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
* - (ditto Remain_Info[i])
*/
for (int i = 0; i < nlb; ++i) {
- ib = lsub[lptr]; /* block number of L(i,k). */
+ ib = lsub[lptr]; /* Block number of L(i,k). */
temp_nbrow = lsub[lptr+1]; /* Number of full rows. */
int look_up_flag = 1; /* assume ib is outside look-up window */
- for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers ); ++j)
- {
- if(ib == perm_c_supno[j]) {
- look_up_flag=0; /* flag ib is within look-up window */
- break; /* Sherry -- can exit the loop?? */
+ for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
+ ++j) {
+ if ( ib == perm_c_supno[j] ) {
+ look_up_flag = 0; /* flag ib within look-up window */
+ break; /* Sherry -- can exit the loop?? */
}
- }
+ }
- if( look_up_flag == 0 ) { /* ib is within look up window */
+ if ( look_up_flag == 0 ) { /* ib is within look-up window */
if (lookAheadBlk==0) {
lookAheadFullRow[lookAheadBlk] = temp_nbrow;
} else {
- lookAheadFullRow[lookAheadBlk] = temp_nbrow+lookAheadFullRow[lookAheadBlk-1];
+ lookAheadFullRow[lookAheadBlk] =
+ temp_nbrow + lookAheadFullRow[lookAheadBlk-1];
}
lookAheadStRow[lookAheadBlk] = cum_nrow;
lookAhead_lptr[lookAheadBlk] = lptr;
lookAhead_ib[lookAheadBlk] = ib;
lookAheadBlk++;
- } else { /* ib is not in look up window */
-
- if (RemainBlk==0) {
+ } else { /* ib is not in look-up window */
+ if ( RemainBlk==0 ) {
Remain_info[RemainBlk].FullRow = temp_nbrow;
} else {
- Remain_info[RemainBlk].FullRow = temp_nbrow+Remain_info[RemainBlk-1].FullRow;
+ Remain_info[RemainBlk].FullRow =
+ temp_nbrow + Remain_info[RemainBlk-1].FullRow;
}
-
RemainStRow[RemainBlk] = cum_nrow;
// Remain_lptr[RemainBlk] = lptr;
Remain_info[RemainBlk].lptr = lptr;
@@ -85,139 +102,105 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
RemainBlk++;
}
- cum_nrow +=temp_nbrow;
+ cum_nrow += temp_nbrow;
lptr += LB_DESCRIPTOR; /* Skip descriptor. */
lptr += temp_nbrow; /* Move to next block */
luptr += temp_nbrow;
- } /* for i ... all blocks in L(:,k) */
+ } /* for i ... set up pointers for all blocks in L(:,k) */
lptr = lptr0;
luptr = luptr0;
- /* leading dimension of L buffer */
-#if 0
- int LDlookAhead_LBuff = lookAheadFullRow[lookAheadBlk-1]; /* may go negative.*/
-#else /* Piyush fix */
- int LDlookAhead_LBuff = lookAheadBlk==0? 0 :lookAheadFullRow[lookAheadBlk-1];
-#endif
-
- /* Loop through the look-ahead blocks to copy Lval into the buffer */
-#ifdef __OPENMP
- /* #pragma omp parallel for -- why not?? Sherry */
-#endif
- for (int i = 0; i < lookAheadBlk; ++i) {
- int StRowDest = 0;
- int temp_nbrow;
- if (i==0) {
- temp_nbrow = lookAheadFullRow[0];
- } else {
- StRowDest = lookAheadFullRow[i-1];
- temp_nbrow = lookAheadFullRow[i]-lookAheadFullRow[i-1];
- }
-
- int StRowSource=lookAheadStRow[i];
-
- /* Now copying the matrix*/
- // #pragma omp parallel for (gives slow down)
- for (int j = 0; j < knsupc; ++j) {
- memcpy(&lookAhead_L_buff[StRowDest+j*LDlookAhead_LBuff],
- &lusup[luptr+j*nsupr+StRowSource],
- temp_nbrow * sizeof(double) );
- }
- }
-
- int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-
- /* Loop through the remaining blocks to copy Lval into the buffer */
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
- for (int i = 0; i < RemainBlk; ++i) {
- int StRowDest = 0;
- int temp_nbrow;
- if (i==0) {
- temp_nbrow = Remain_info[0].FullRow;
- } else {
- StRowDest = Remain_info[i-1].FullRow;
- temp_nbrow = Remain_info[i].FullRow-Remain_info[i-1].FullRow;
- }
-
- int StRowSource=RemainStRow[i];
-
- /* Now copying the matrix*/
- // #pragma omp parallel for (gives slow down)
- for (int j = 0; j < knsupc; ++j) {
- // printf("StRowDest %d LDRemain_LBuff %d StRowSource %d \n", StRowDest ,LDRemain_LBuff ,StRowSource );
- memcpy(&Remain_L_buff[StRowDest+j*LDRemain_LBuff],
- &lusup[luptr+j*nsupr+StRowSource],
- temp_nbrow * sizeof(double) );
- }
- } /* parallel for i ... */
-
-#if ( PRNTlevel>=1 )
- tt_end = SuperLU_timer_();
- GatherLTimer += tt_end - tt_start;
-#endif
-#if 0
- LookAheadRowSepMOP += 2*knsupc*(lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow );
-#else
- int_t lnbrow, rnbrow; /* number of nonzero rows in look-ahead window
- or remaining part. */
- lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
- rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
- nbrow = lnbrow + rnbrow; /* total number of rows in L */
+ /* leading dimension of L look-ahead buffer, same as Lnbrow */
+ //int LDlookAhead_LBuff = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+ Lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
+ /* leading dimension of L remaining buffer, same as Rnbrow */
+ //int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+ Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+ /* assert( cum_nrow == (LDlookAhead_LBuff + LDRemain_LBuff) );*/
+ /* Piyush fix */
+ //int LDlookAhead_LBuff = lookAheadBlk==0? 0 : lookAheadFullRow[lookAheadBlk-1];
+
+ nbrow = Lnbrow + Rnbrow; /* total number of rows in L */
LookAheadRowSepMOP += 2*knsupc*(nbrow);
-#endif
-
- /**********************
- * Gather U blocks *
- **********************/
+ /***********************************************
+ * Gather U blocks (AFTER LOOK-AHEAD WINDOW) *
+ ***********************************************/
tt_start = SuperLU_timer_();
-#if 0
- nbrow = lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow;
-#endif
if ( nbrow > 0 ) { /* L(:,k) is not empty */
/*
* Counting U blocks
*/
- ncols = 0; /* total number of nonzero columns in U(k,:) */
- ldu = 0;
- full = 1; /* flag the U block is indeed 'full', containing segments
- of same length. No need padding 0 */
- int temp_ncols=0;
+ ldu = 0; /* Calculate ldu for U(k,:) after look-ahead window. */
+ ncols = 0; /* Total number of nonzero columns in U(k,:) */
+ int temp_ncols = 0;
- /* Loop through all blocks in U(k,:) to set up pointers to the start
+#if 0
+ /* jj0 contains the look-ahead window that was updated in
+ dlook_ahead_update.c. Now the search can continue from that point,
+ not to start from block 0. */
+ iukp = iukp0; /* point to the first block in index[] */
+ rukp = rukp0; /* point to the start of nzval[] */
+#else
+ /* Save pointers at location right after look-ahead window
+ for later restart. */
+ iukp0 = iukp;
+ rukp0 = rukp;
+#endif
+
+ /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
+
+ /*
+ * Loop through all blocks in U(k,:) to set up pointers to the start
* of each block in the data arrays, store them in Ublock_info[j]
* for block U(k,j).
*/
- for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+ for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
temp_ncols = 0;
+#if 0
+ /* Sherry - can remove following call, since perm_u == Identity */
arrive_at_ublock(
j, &iukp, &rukp, &jb, &ljb, &nsupc,
iukp0, rukp0, usub, perm_u, xsup, grid
);
+#else
+ jb = usub[iukp];
+ /* ljb = LBj (jb, grid); Local block number of U(k,j). */
+ nsupc = SuperSize(jb);
+ iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
Ublock_info[j].iukp = iukp;
Ublock_info[j].rukp = rukp;
Ublock_info[j].jb = jb;
-
+
+ /* if ( iam==0 )
+ printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
+ "Ublock_info[j].jb %d, nsupc %d\n",
+ j, Ublock_info[j].iukp, Ublock_info[j].rukp,
+ Ublock_info[j].jb, nsupc); */
+
/* Prepare to call GEMM. */
jj = iukp;
-
for (; jj < iukp+nsupc; ++jj) {
segsize = klst - usub[jj];
if ( segsize ) {
++temp_ncols;
- if ( segsize != ldu ) full = 0; /* need padding 0 */
if ( segsize > ldu ) ldu = segsize;
}
}
Ublock_info[j].full_u_cols = temp_ncols;
ncols += temp_ncols;
- }
+#if 1
+ /* Jump number of nonzeros in block U(k,jj);
+ Move to block U(k,j+1) in nzval[] array. */
+ rukp += usub[iukp - 1];
+ iukp += nsupc;
+#endif
+ } /* end for j ... compute ldu & ncols */
/* Now doing prefix sum on full_u_cols.
* After this, full_u_cols is the number of nonzero columns
@@ -227,101 +210,239 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
}
+ /* Padding zeros to make {m,n,k} multiple of vector length. */
+ jj = 8; //n;
+ if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
+ gemm_m_pad = Rnbrow + (Rnbrow % GEMM_PADLEN);
+ gemm_n_pad = ncols + (ncols % GEMM_PADLEN);
+ //gemm_n_pad = ncols;
+ //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
+ gemm_k_pad = ldu;
+
+ for (i = Rnbrow; i < gemm_m_pad; ++i) // padding A matrix
+ for (j = 0; j < gemm_k_pad; ++j)
+ Remain_L_buff[i + j*gemm_m_pad] = zero;
+ for (i = 0; i < Rnbrow; ++i)
+ for (j = ldu; j < gemm_k_pad; ++j)
+ Remain_L_buff[i + j*gemm_m_pad] = zero;
+ for (i = ldu; i < gemm_k_pad; ++i) // padding B matrix
+ for (j = 0; j < gemm_n_pad; ++j)
+ bigU[i + j*gemm_k_pad] = zero;
+ for (i = 0; i < ldu; ++i)
+ for (j = ncols; j < gemm_n_pad; ++j)
+ bigU[i + j*gemm_k_pad] = zero;
+ } else {
+ gemm_m_pad = Rnbrow;
+ gemm_n_pad = ncols;
+ gemm_k_pad = ldu;
+ }
+
tempu = bigU; /* buffer the entire row block U(k,:) */
/* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
-#ifdef _OPENMP
-#pragma omp parallel for private(j,iukp,rukp,tempu, jb, nsupc,ljb,segsize,\
- lead_zero, jj, i) \
- default (shared) schedule(SCHEDULE_STRATEGY)
+#ifdef _OPENMP
+#pragma omp parallel for firstprivate(iukp, rukp) \
+ private(j,tempu, jb, nsupc,ljb,segsize, lead_zero, jj, i) \
+ default (shared) schedule(SCHEDULE_STRATEGY)
#endif
- for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+ for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
- if(j==jj0) tempu = bigU;
- else tempu = bigU + ldu*Ublock_info[j-1].full_u_cols;
+ if (j==jj0) tempu = bigU;
+ //else tempu = bigU + ldu * Ublock_info[j-1].full_u_cols;
+ else tempu = bigU + gemm_k_pad * Ublock_info[j-1].full_u_cols;
- /* == processing each of the remaining columns == */
+ /* == processing each of the remaining columns in parallel == */
+#if 0
+ /* Sherry - can remove following call, since perm_u == Identity */
arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
iukp0, rukp0, usub,perm_u, xsup, grid);
-
- /* Copy from U(k,:) to tempu[], padding zeros. */
+#else
+ iukp = Ublock_info[j].iukp;
+ rukp = Ublock_info[j].rukp;
+ jb = Ublock_info[j].jb;
+ nsupc = SuperSize (jb );
+#endif
+ /* Copy from U(k,j) to tempu[], padding zeros. */
for (jj = iukp; jj < iukp+nsupc; ++jj) {
segsize = klst - usub[jj];
if ( segsize ) {
lead_zero = ldu - segsize;
for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
- tempu += lead_zero;
- for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
+ //tempu += lead_zero;
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (i=0; i<segsize; ++i) tempu[i+lead_zero] = uval[rukp+i];
+
rukp += segsize;
- tempu += segsize;
+#if 0
+ tempu += segsize;
+#else
+ tempu += gemm_k_pad;
+#endif
}
- }
+ }
+#if 0
+ rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+#endif
+ } /* parallel for j = jj0 .. nub */
+
+#if 0
+ if (ldu==0) printf("[%d] .. k0 %d, before updating: ldu %d, Lnbrow %d, Rnbrow %d, ncols %d\n",iam,k0,ldu,Lnbrow,Rnbrow, ncols);
+ fflush(stdout);
+#endif
+ } /* end if (nbrow>0), end gather U blocks */
+
+ GatherUTimer += SuperLU_timer_() - tt_start;
+ GatherMOP += 2*ldu*ncols;
+ int jj_cpu = nub; /* limit between CPU and GPU */
+ int thread_id;
+ /*tempv = bigV;*/
- rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
- } /* parallel for j:jjj_st..jjj */
+ /**********************
+ * Gather L blocks *
+ **********************/
+ tt_start = SuperLU_timer_();
- tempu = bigU; /* setting to the start of padded U(k,:) */
+ /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(j,jj,tempu,tempv) default (shared)
+#endif
+ for (int i = 0; i < lookAheadBlk; ++i) {
+ int StRowDest, temp_nbrow;
+ if ( i==0 ) {
+ StRowDest = 0;
+ temp_nbrow = lookAheadFullRow[0];
+ } else {
+ StRowDest = lookAheadFullRow[i-1];
+ temp_nbrow = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+ }
+
+ int StRowSource = lookAheadStRow[i];
+
+ /* Now copying one block into L lookahead buffer */
+ /* #pragma omp parallel for (gives slow down) */
+ // for (int j = 0; j < knsupc; ++j) {
+ for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
+ corresponding to zero U rows */
+#if 1
+ /* Better let compiler generate memcpy or vectorized code. */
+ //tempu = &lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff];
+ //tempu = &lookAhead_L_buff[StRowDest + j * Lnbrow];
+ tempu = &lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow];
+ tempv = &lusup[luptr+j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+ //memcpy(&lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff],
+ memcpy(&lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow],
+ &lusup[luptr+j*nsupr + StRowSource],
+ temp_nbrow * sizeof(double) );
+#endif
+ } /* end for j ... */
+ } /* parallel for i ... gather Lval blocks from lookahead window */
- } /* end if (nbrow>0) */
+ /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(i,j,jj,tempu,tempv) default (shared) \
+ schedule(SCHEDULE_STRATEGY)
+#endif
+ for (i = 0; i < RemainBlk; ++i) {
+ int StRowDest, temp_nbrow;
+ if ( i==0 ) {
+ StRowDest = 0;
+ temp_nbrow = Remain_info[0].FullRow;
+ } else {
+ StRowDest = Remain_info[i-1].FullRow;
+ temp_nbrow = Remain_info[i].FullRow - Remain_info[i-1].FullRow;
+ }
-#if ( PRNTlevel>=1 )
- GatherUTimer += SuperLU_timer_() - tt_start;
+ int StRowSource = RemainStRow[i];
+
+ /* Now copying a block into L remaining buffer */
+ // #pragma omp parallel for (gives slow down)
+ // for (int j = 0; j < knsupc; ++j) {
+ for (j = knsupc-ldu; j < knsupc; ++j) {
+ // printf("StRowDest %d Rnbrow %d StRowSource %d \n", StRowDest,Rnbrow ,StRowSource);
+#if 1
+ /* Better let compiler generate memcpy or vectorized code. */
+ //tempu = &Remain_L_buff[StRowDest + j*LDRemain_LBuff];
+ //tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * Rnbrow];
+ tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad];
+ tempv = &lusup[luptr + j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
#endif
- GatherMOP += 2*ldu*ncols;
+ for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+ //memcpy(&Remain_L_buff[StRowDest + j*LDRemain_LBuff],
+ memcpy(&Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad],
+ &lusup[luptr+j*nsupr + StRowSource],
+ temp_nbrow * sizeof(double) );
+#endif
+ } /* end for j ... */
+ } /* parallel for i ... copy Lval into the remaining buffer */
- int Lnbrow = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
- int Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
- int jj_cpu=nub; /*limit between CPU and GPU */
- int thread_id;
- tempv = bigV;
+ tt_end = SuperLU_timer_();
+ GatherLTimer += tt_end - tt_start;
- /**************************************
- * Perform GEMM followed by Scatter *
- **************************************/
- if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
- /* Perform a large GEMM call */
- ncols = Ublock_info[nub-1].full_u_cols;
- schur_flop_counter += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
- stat->ops[FACT] += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
+ /*************************************************************************
+ * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
+ *************************************************************************/
+ tempu = bigU; /* setting to the start of padded U(k,:) */
+
+ if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+ /***************************************************************
+ * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
+ ***************************************************************/
+
+ /* Count flops for total GEMM calls */
+ ncols = Ublock_info[nub-1].full_u_cols;
+ flops_t flps = 2.0 * (flops_t)Lnbrow * ldu * ncols;
+ LookAheadScatterMOP += 3 * Lnbrow * ncols; /* scatter-add */
+ schur_flop_counter += flps;
+ stat->ops[FACT] += flps;
+ LookAheadGEMMFlOp += flps;
- /***************************************************************
- * Updating look-ahead blocks in both L and U look-ahead windows.
- ***************************************************************/
#ifdef _OPENMP
-#pragma omp parallel default (shared) private(thread_id,tt_start,tt_end)
- {
- thread_id = omp_get_thread_num();
+#pragma omp parallel default (shared) private(thread_id)
+ {
+ thread_id = omp_get_thread_num();
- /* Ideally, should organize the loop as:
- for (j = 0; j < nub; ++j) {
- for (lb = 0; lb < lookAheadBlk; ++lb) {
- L(lb,k) X U(k,j) -> tempv[]
- }
- }
- But now, we use collapsed loop to achieve more parallelism.
- Total number of block updates is:
- (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
- */
+ /* Ideally, should organize the loop as:
+ for (j = 0; j < nub; ++j) {
+ for (lb = 0; lb < lookAheadBlk; ++lb) {
+ L(lb,k) X U(k,j) -> tempv[]
+ }
+ }
+ But now, we use collapsed loop to achieve more parallelism.
+ Total number of block updates is:
+ (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+ */
+
+ int i = sizeof(int);
+ int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+ int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
#pragma omp for \
- private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow) \
+ private (nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow) \
schedule(dynamic)
#else /* not use _OPENMP */
- thread_id = 0;
+ thread_id = 0;
+ int* indirect_thread = indirect;
+ int* indirect2_thread = indirect2;
#endif
- /* Each thread is assigned one loop index ij, responsible for
- block update L(lb,k) * U(k,j) -> tempv[]. */
- for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
- if ( thread_id == 0 ) tt_start = SuperLU_timer_();
-
- int j = ij/lookAheadBlk + jj0; /* jj0 was set to 0 */
+ /* Each thread is assigned one loop index ij, responsible for
+ block update L(lb,k) * U(k,j) -> tempv[]. */
+ for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+ /* jj0 starts after look-ahead window. */
+ int j = ij/lookAheadBlk + jj0;
int lb = ij%lookAheadBlk;
- int* indirect_thread = indirect + ldt*thread_id;
- int* indirect2_thread = indirect2 + ldt*thread_id;
- double* tempv1 = bigV + thread_id*ldt*ldt;
-
/* Getting U block U(k,j) information */
/* unsigned long long ut_start, ut_end; */
int_t rukp = Ublock_info[j].rukp;
@@ -330,8 +451,8 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
int nsupc = SuperSize(jb);
int ljb = LBj (jb, grid); /* destination column block */
int st_col;
- int ncols;
- if ( j>jj0 ) { /* jj0 was set to 0 */
+ int ncols; /* Local variable counts only columns in the block */
+ if ( j > jj0 ) { /* jj0 starts after look-ahead window. */
ncols = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
st_col = Ublock_info[j-1].full_u_cols;
} else {
@@ -346,7 +467,16 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
lptr += LB_DESCRIPTOR;
int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
+ /* Block-by-block GEMM in look-ahead window */
+#if 0
+ i = sizeof(double);
+ double* tempv1 = bigV + thread_id * (ldt*ldt + CACHELINE/i);
+#else
+ double* tempv1 = bigV + thread_id * (ldt*ldt);
+#endif
+
#if ( PRNTlevel>= 1)
+ if (thread_id == 0) tt_start = SuperLU_timer_();
gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
@@ -354,14 +484,17 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
#if defined (USE_VENDOR_BLAS)
dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
- &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
- &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+ //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+ &lookAhead_L_buff[cum_nrow], &Lnbrow,
+ &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
#else
dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
- &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
- &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+ //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+ &lookAhead_L_buff[cum_nrow], &Lnbrow,
+ &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
#endif
-#if ( PRNTlevel>=1 )
+
+#if (PRNTlevel>=1 )
if (thread_id == 0) {
tt_end = SuperLU_timer_();
LookAheadGEMMTimer += tt_end - tt_start;
@@ -379,6 +512,11 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
grid
);
} else {
+#if 0
+ //#ifdef USE_VTUNE
+ __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+ __itt_resume(); // start VTune, again use 2 underscores
+#endif
dscatter_l (
ib, ljb,
nsupc, iukp, xsup,
@@ -389,137 +527,187 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
Lrowind_bc_ptr, Lnzval_bc_ptr,
grid
);
+#if 0
+ //#ifdef USE_VTUNE
+ __itt_pause(); // stop VTune
+ __SSC_MARK(0x222); // stop SDE tracing
+#endif
}
#if ( PRNTlevel>=1 )
- if (thread_id == 0)
+ if (thread_id == 0)
LookAheadScatterTimer += SuperLU_timer_() - tt_start;
#endif
- } /* end omp for ij = ... */
+ } /* end omp for ij = ... */
+
#ifdef _OPENMP
- } /* end omp parallel */
+ } /* end omp parallel */
#endif
- LookAheadGEMMFlOp += 2*(double)Lnbrow * (double)ldu * (double)ncols;
- stat->ops[FACT] += 2*(double)Lnbrow * (double)ldu * (double)ncols;
- LookAheadScatterMOP += 3*Lnbrow*ncols;
- } /* end if Lnbrow < ... */
-
+ } /* end if Lnbrow>0 ... look-ahead GEMM and scatter */
+
/***************************************************************
* Updating remaining rows and columns on CPU.
***************************************************************/
- Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
- ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+ ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+ if ( Rnbrow>0 && ldu>0 ) { /* There are still blocks remaining ... */
+ double flps = 2.0 * (double)Rnbrow * ldu * ncols;
+ schur_flop_counter += flps;
+ stat->ops[FACT] += flps;
- schur_flop_counter += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
- stat->ops[FACT] += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
+#if ( PRNTlevel>=1 )
+ RemainGEMM_flops += flps;
+ gemm_max_m = SUPERLU_MAX(gemm_max_m, Rnbrow);
+ gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+ gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+ tt_start = SuperLU_timer_();
+ /* printf("[%d] .. k0 %d, before large GEMM: %d-%d-%d, RemainBlk %d\n",
+ iam, k0,Rnbrow,ldu,ncols,RemainBlk); fflush(stdout);
+ assert( Rnbrow*ncols < bigv_size ); */
+#endif
+ /* calling aggregated large GEMM, result stored in bigV[]. */
+#if defined (USE_VENDOR_BLAS)
+ //dgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+ dgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+ //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+ &Remain_L_buff[0], &gemm_m_pad,
+ &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad, 1, 1);
+#else
+ //dgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+ dgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+ //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+ &Remain_L_buff[0], &gemm_m_pad,
+ &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad);
+#endif
+#if ( PRNTlevel>=1 )
+ tt_end = SuperLU_timer_();
+ RemainGEMMTimer += tt_end - tt_start;
+#if ( PROFlevel>=1 )
+ //fprintf(fgemm, "%8d%8d%8d %16.8e\n", Rnbrow, ncols, ldu,
+ // (tt_end - tt_start)*1e6); // time in microsecond
+ //fflush(fgemm);
+ gemm_stats[gemm_count].m = Rnbrow;
+ gemm_stats[gemm_count].n = ncols;
+ gemm_stats[gemm_count].k = ldu;
+ gemm_stats[gemm_count++].microseconds = (tt_end - tt_start) * 1e6;
+#endif
+ tt_start = SuperLU_timer_();
+#endif
+
+#ifdef USE_VTUNE
+ __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+ __itt_resume(); // start VTune, again use 2 underscores
+#endif
+
+ /* Scatter into destination block-by-block. */
#ifdef _OPENMP
-#pragma omp parallel default(shared) private(thread_id,tt_start,tt_end)
- {
- thread_id = omp_get_thread_num();
+#pragma omp parallel default(shared) private(thread_id)
+ {
+ thread_id = omp_get_thread_num();
- /* Ideally, should organize the loop as:
+ /* Ideally, should organize the loop as:
for (j = 0; j < jj_cpu; ++j) {
- for (lb = 0; lb < RemainBlk; ++lb) {
+ for (lb = 0; lb < RemainBlk; ++lb) {
L(lb,k) X U(k,j) -> tempv[]
}
}
- But now, we use collapsed loop to achieve more parallelism.
- Total number of block updates is:
- (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
- */
+ But now, we use collapsed loop to achieve more parallelism.
+ Total number of block updates is:
+ (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+ */
+
+ int i = sizeof(int);
+ int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+ int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
#pragma omp for \
- private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow) \
+ private (j,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow) \
schedule(dynamic)
#else /* not use _OPENMP */
- thread_id = 0;
-#endif
- /* Each thread is assigned one loop index ij, responsible for
- block update L(lb,k) * U(k,j) -> tempv[]. */
- for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) { /* jj_cpu := nub */
- int j = ij / RemainBlk + jj0;
- int lb = ij % RemainBlk;
-
- int* indirect_thread = indirect + ldt*thread_id;
- int* indirect2_thread = indirect2 + ldt*thread_id;
- double* tempv1 = bigV + thread_id*ldt*ldt;
-
- /* Getting U block U(k,j) information */
- /* unsigned long long ut_start, ut_end; */
- int_t rukp = Ublock_info[j].rukp;
- int_t iukp = Ublock_info[j].iukp;
- int jb = Ublock_info[j].jb;
- int nsupc = SuperSize(jb);
- int ljb = LBj (jb, grid);
- int st_col;
- int ncols;
- if ( j>jj0 ) {
- ncols = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
- st_col = Ublock_info[j-1].full_u_cols;
- } else {
- ncols = Ublock_info[j].full_u_cols;
- st_col = 0;
- }
-
- /* Getting L block L(i,k) information */
- int_t lptr = Remain_info[lb].lptr;
- int ib = Remain_info[lb].ib;
- int temp_nbrow = lsub[lptr+1];
- lptr += LB_DESCRIPTOR;
- int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
-
+ thread_id = 0;
+ int* indirect_thread = indirect;
+ int* indirect2_thread = indirect2;
+#endif
+ /* Each thread is assigned one loop index ij, responsible for
+ block update L(lb,k) * U(k,j) -> tempv[]. */
+ for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
+ /* jj_cpu := nub, jj0 starts after look-ahead window. */
+ int j = ij / RemainBlk + jj0; /* j-th block in U panel */
+ int lb = ij % RemainBlk; /* lb-th block in L panel */
+
+ /* Getting U block U(k,j) information */
+ /* unsigned long long ut_start, ut_end; */
+ int_t rukp = Ublock_info[j].rukp;
+ int_t iukp = Ublock_info[j].iukp;
+ int jb = Ublock_info[j].jb;
+ int nsupc = SuperSize(jb);
+ int ljb = LBj (jb, grid);
+ int st_col;
+ int ncols;
+ if ( j>jj0 ) {
+ ncols = Ublock_info[j].full_u_cols - Ublock_info[j-1].full_u_cols;
+ st_col = Ublock_info[j-1].full_u_cols;
+ } else {
+ ncols = Ublock_info[j].full_u_cols;
+ st_col = 0;
+ }
+
+ /* Getting L block L(i,k) information */
+ int_t lptr = Remain_info[lb].lptr;
+ int ib = Remain_info[lb].ib;
+ int temp_nbrow = lsub[lptr+1];
+ lptr += LB_DESCRIPTOR;
+ int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+
+ /* tempv1 points to block(i,j) in bigV : LDA == Rnbrow */
+ //double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry
+ double* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
+
+ // printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
+
+ /* Now scattering the block */
+
+ if ( ib < jb ) {
+ dscatter_u (
+ ib, jb,
+ nsupc, iukp, xsup,
+ //klst, Rnbrow, /*** klst, temp_nbrow, Sherry */
+ klst, gemm_m_pad, /*** klst, temp_nbrow, Sherry */
+ lptr, temp_nbrow, /* row dimension of the block */
+ lsub, usub, tempv1,
+ Ufstnz_br_ptr, Unzval_br_ptr,
+ grid
+ );
+ } else {
+ dscatter_l(
+ ib, ljb,
+ nsupc, iukp, xsup,
+ //klst, temp_nbrow, Sherry
+ klst, gemm_m_pad, /*** temp_nbrow, Sherry */
+ lptr, temp_nbrow, /* row dimension of the block */
+ usub, lsub, tempv1,
+ indirect_thread, indirect2_thread,
+ Lrowind_bc_ptr,Lnzval_bc_ptr,
+ grid
+ );
+ }
+
+ } /* end omp for (int ij =...) */
+
+#ifdef _OPENMP
+ } /* end omp parallel region */
+#endif
+
#if ( PRNTlevel>=1 )
- if ( thread_id==0 ) tt_start = SuperLU_timer_();
+ RemainScatterTimer += SuperLU_timer_() - tt_start;
#endif
- /* calling GEMM */
-#if defined (USE_VENDOR_BLAS)
- dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
- &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
- &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
-#else
- dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
- &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
- &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#ifdef USE_VTUNE
+ __itt_pause(); // stop VTune
+ __SSC_MARK(0x222); // stop SDE tracing
#endif
-#if ( PRNTlevel>=1 )
- if (thread_id==0) {
- tt_end = SuperLU_timer_();
- RemainGEMMTimer += tt_end - tt_start;
- tt_start = tt_end;
- }
-#endif
-
- /* Now scattering the block */
- if ( ib<jb ) {
- dscatter_u(
- ib, jb,
- nsupc, iukp, xsup,
- klst, temp_nbrow,
- lptr, temp_nbrow,lsub,
- usub, tempv1,
- Ufstnz_br_ptr, Unzval_br_ptr,
- grid
- );
- } else {
- dscatter_l(
- ib, ljb,
- nsupc, iukp, xsup,
- klst, temp_nbrow,
- lptr, temp_nbrow,
- usub, lsub, tempv1,
- indirect_thread, indirect2_thread,
- Lrowind_bc_ptr,Lnzval_bc_ptr,
- grid
- );
- }
+ } /* end if Rnbrow>0 ... update remaining block */
-#if ( PRNTlevel>=1 )
- if (thread_id==0) RemainScatterTimer += SuperLU_timer_() - tt_start;
-#endif
- } /* end omp for (int ij =...) */
-#ifdef _OPENMP
- } /* end omp parallel region */
-#endif
} /* end if L(:,k) and U(k,:) are not empty */
diff --git a/SRC/dbinary_io.c b/SRC/dbinary_io.c
new file mode 100644
index 0000000..22714a7
--- /dev/null
+++ b/SRC/dbinary_io.c
@@ -0,0 +1,40 @@
+#include "superlu_ddefs.h"
+
+int
+dread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz,
+ double **nzval, int_t **rowind, int_t **colptr)
+{
+ size_t isize = sizeof(int_t), dsize = sizeof(double);
+ int nnz_read;
+ fread(n, isize, 1, fp);
+ fread(nnz, isize, 1, fp);
+ printf("fread n %d\tnnz %d\n", *n, *nnz);
+ *m = *n;
+ *colptr = intMalloc_dist(*n+1);
+ *rowind = intMalloc_dist(*nnz);
+ *nzval = doubleMalloc_dist(*nnz);
+ fread(*colptr, isize, (size_t) (*n + 1), fp);
+ fread(*rowind, isize, (size_t) *nnz, fp);
+ nnz_read = fread(*nzval, dsize, (size_t) (*nnz), fp);
+ printf("# of doubles fread: %d\n", nnz_read);
+ fclose(fp);
+}
+
+int
+dwrite_binary(int_t n, int_t nnz,
+ double *values, int_t *rowind, int_t *colptr)
+{
+ FILE *fp1;
+ int nnz_written;
+ size_t isize = sizeof(int_t), dsize = sizeof(double);
+ fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb");
+ fwrite(&n, isize, 1, fp1);
+ fwrite(&nnz, isize, 1, fp1);
+ fwrite(colptr, isize, n+1, fp1);
+ fwrite(rowind, isize, nnz, fp1);
+ nnz_written = fwrite(values, dsize, nnz, fp1);
+ printf("n %d, # of double: %d\n", n, nnz);
+ printf("dump binary file ... # of double fwrite: %d\n", nnz_written);
+ assert(nnz_written==nnz);
+ fclose(fp1);
+}
diff --git a/SRC/dlook_ahead_update.c b/SRC/dlook_ahead_update.c
index 7521506..a9f53b1 100644
--- a/SRC/dlook_ahead_update.c
+++ b/SRC/dlook_ahead_update.c
@@ -15,11 +15,17 @@ at the top-level directory.
* \brief Look-ahead update of the Schur complement.
*
* <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 1, 2014
*
+ * Modified: September 18, 2017
+ *
*/
+
+iukp = iukp0; /* point to the first block in index[] */
+rukp = rukp0; /* point to the start of nzval[] */
+
#ifdef ISORT
while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
#else
@@ -28,6 +34,8 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
{
double zero = 0.0;
+#if 0 // Sherry: no need to search
+ /* Caveat: There is a permutation perm_u involved for j */
/* Search along the row for the pointers {iukp, rukp} pointing to
* block U(k,j).
* j -- current block in look-ahead window, initialized to 0 on entry
@@ -39,6 +47,13 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
j, &iukp, &rukp, &jb, &ljb, &nsupc,
iukp0, rukp0, usub, perm_u, xsup, grid
);
+#else
+ jb = usub[iukp];
+ ljb = LBj (jb, grid); /* Local block number of U(k,j). */
+ nsupc = SuperSize(jb);
+ iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+
j++;
jj0++;
jj = iukp;
@@ -47,48 +62,47 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
ldu = klst - usub[jj++];
ncols = 1;
- full = 1; /* flag the U block is indeed 'full', containing segments
- of same length. No need padding 0. */
+
+ /* This loop computes ldu. */
for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
segsize = klst - usub[jj];
if (segsize) {
++ncols;
- if (segsize != ldu) full = 0; /* need padding 0 */
if (segsize > ldu) ldu = segsize;
}
}
#if ( DEBUGlevel>=3 )
++num_update;
#endif
- if (0) {
- tempu = &uval[rukp];
- }
- else { /* Copy block U(k,j) into tempU2d, padding zeros. */
+
#if ( DEBUGlevel>=3 )
- printf ("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
- iam, full, k, jb, ldu, ncols, nsupc);
- ++num_copy;
+ printf ("(%d) k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+ iam, k, jb, ldu, ncols, nsupc);
+ ++num_copy;
#endif
- tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
- for (jj = iukp; jj < iukp + nsupc; ++jj) {
- segsize = klst - usub[jj];
- if (segsize) {
- lead_zero = ldu - segsize;
- for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
- tempu += lead_zero;
- for (i = 0; i < segsize; ++i) {
- tempu[i] = uval[rukp + i];
- }
- rukp += segsize;
- tempu += segsize;
+
+ /* Now copy one block U(k,j) to bigU for GEMM, padding zeros up to ldu. */
+ tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+ for (jj = iukp; jj < iukp + nsupc; ++jj) {
+ segsize = klst - usub[jj];
+ if (segsize) {
+ lead_zero = ldu - segsize;
+ for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+ tempu += lead_zero;
+ for (i = 0; i < segsize; ++i) {
+ tempu[i] = uval[rukp + i];
}
+ rukp += segsize;
+ tempu += segsize;
}
- tempu = bigU;
- rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
- } /* if full ... */
+ }
+ tempu = bigU; /* set back to the beginning of the buffer */
+#if 0
+ rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+#endif
nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
- if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows */
+ if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows. */
// double ttx =SuperLU_timer_();
int current_b = 0; /* Each thread starts searching from first block.
@@ -99,9 +113,9 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
#ifdef _OPENMP
/* Sherry -- examine all the shared variables ??
'firstprivate' ensures that the private variables are initialized
- to the values before entering the loop */
+ to the values before entering the loop. */
#pragma omp parallel for \
- firstprivate(lptr,luptr,ib,tempv,current_b) private(lb) \
+ firstprivate(lptr,luptr,ib,current_b) private(lb) \
default(shared) schedule(dynamic)
#endif
for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
@@ -134,7 +148,10 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+ /*if (thread_id == 0) tt_start = SuperLU_timer_();*/
+
/* calling gemm */
+ stat->ops[FACT] += 2.0 * (flops_t)temp_nbrow * ldu * ncols;
#if defined (USE_VENDOR_BLAS)
dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
&lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
@@ -145,7 +162,14 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
tempu, &ldu, &beta, tempv, &temp_nbrow );
#endif
- /* Now scattering the output*/
+#if 0
+ if (thread_id == 0) {
+ tt_end = SuperLU_timer_();
+ LookAheadGEMMTimer += tt_end - tt_start;
+ tt_start = tt_end;
+ }
+#endif
+ /* Now scattering the output. */
if (ib < jb) { /* A(i,j) is in U. */
dscatter_u (ib, jb,
nsupc, iukp, xsup,
@@ -159,14 +183,22 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
}
- ++current_b; /* move to next block */
+ ++current_b; /* Move to next block. */
lptr += temp_nbrow;
luptr += temp_nbrow;
+#if 0
+ if (thread_id == 0) {
+ tt_end = SuperLU_timer_();
+ LookAheadScatterTimer += tt_end - tt_start;
+ }
+#endif
} /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
- rukp += usub[iukp - 1]; /* Move to next U block, U(k,j+1) */
- iukp += nsupc;
+#if 0
+ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+#endif
+ iukp += nsupc; /* Mov to block U(k,j+1) */
/* =========================================== *
* == factorize L(:,j) and send if possible == *
@@ -187,17 +219,14 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
/* Factor diagonal and subdiagonal blocks and test for exact
singularity. */
factored[kk] = 0;
- /* double ttt1 = SuperLU_timer_(); */
-#if ( VAMPIR>=1 )
- VT_begin (5);
-#endif
+
+ double tt1 = SuperLU_timer_();
PDGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
U_diag_blk_send_req, tag_ub, stat, info);
-#if ( VAMPIR>=1 )
- VT_end (5);
-#endif
+ pdgstrf2_timer += SuperLU_timer_() - tt1;
+
/* stat->time7 += SuperLU_timer_() - ttt1; */
/* Multicasts numeric values of L(:,kk) to process rows. */
@@ -221,18 +250,12 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
#if ( PROFlevel>=1 )
TIC (t1);
#endif
-#if ( VAMPIR>=1 )
- VT_begin (1);
-#endif
MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
scp->comm, &send_req[pj]);
MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
scp->comm, &send_req[pj + Pc]);
-#if ( VAMPIR>=1 )
- VT_end (1);
-#endif
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
diff --git a/SRC/dmemory_dist.c b/SRC/dmemory_dist.c
index 8f9e7a2..47f541f 100644
--- a/SRC/dmemory_dist.c
+++ b/SRC/dmemory_dist.c
@@ -129,10 +129,13 @@ int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
mem_usage->total += (float)(2 * k * iword);
#else
/*mem_usage->total += stat->current_buffer;*/
- printf(".. dQuery_Space: peak_buffer %.2f (MB)\n", stat->peak_buffer * 1.0e-6);
mem_usage->total += stat->peak_buffer;
-#endif
+#if ( PRNTlevel>=1 )
+ if (iam==0) printf(".. dQuerySpace: peak_buffer %.2f (MB)\n",
+ stat->peak_buffer * 1.0e-6);
+#endif
+#endif
return 0;
} /* dQuerySpace_dist */
diff --git a/SRC/dreadMM.c b/SRC/dreadMM.c
index 9ddc538..f7e0a2e 100644
--- a/SRC/dreadMM.c
+++ b/SRC/dreadMM.c
@@ -17,6 +17,7 @@ at the top-level directory.
*
*/
#include <ctype.h>
+#include <stdio.h>
#include "superlu_ddefs.h"
#undef EXPAND_SYM
@@ -43,6 +44,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
int_t zero_base = 0;
char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64];
int expand;
+ char *cs;
/* File format:
* %%MatrixMarket matrix coordinate real general/symmetric/...
@@ -54,7 +56,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
*/
/* 1/ read header */
- fgets(line,512,fp);
+ cs = fgets(line,512,fp);
for (p=line; *p!='\0'; *p=tolower(*p),p++);
if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
@@ -100,7 +102,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
/* 2/ Skip comments */
while(banner[0]=='%') {
- fgets(line,512,fp);
+ cs = fgets(line,512,fp);
sscanf(line,"%s",banner);
}
@@ -123,16 +125,17 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
*m = *n;
printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+ fflush(stdout);
dallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
a = *nzval;
asub = *rowind;
xa = *colptr;
- if ( !(val = (double *) SUPERLU_MALLOC(new_nonz * sizeof(double))) )
+ if ( !(val = doubleMalloc_dist(new_nonz)) )
ABORT("Malloc fails for val[]");
- if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+ if ( !(row = (int_t *) intMalloc_dist(new_nonz)) )
ABORT("Malloc fails for row[]");
- if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+ if ( !(col = (int_t *) intMalloc_dist(new_nonz)) )
ABORT("Malloc fails for col[]");
for (j = 0; j < *n; ++j) xa[j] = 0;
@@ -140,17 +143,19 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
/* 4/ Read triplets of values */
for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
#ifdef _LONGINT
- fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]);
+ j = fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]);
#else
- fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
+ j = fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
#endif
- if ( nnz == 0 ) /* first nonzero */
+ if ( nnz == 0 ) /* first nonzero */ {
if ( row[0] == 0 || col[0] == 0 ) {
zero_base = 1;
printf("triplet file: row/col indices are zero-based.\n");
} else
printf("triplet file: row/col indices are one-based.\n");
+ fflush(stdout);
+ }
if ( !zero_base ) {
/* Change to 0-based indexing. */
@@ -181,6 +186,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
*nonz = nz;
if(expand) {
printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
+ fflush(stdout);
}
@@ -234,10 +240,8 @@ static void dreadrhs(int m, double *b)
exit(-1);
}
for (i = 0; i < m; ++i)
- fscanf(fp, "%lf\n", &b[i]);
+ i = fscanf(fp, "%lf\n", &b[i]);
/*fscanf(fp, "%d%lf\n", &j, &b[i]);*/
/* readpair_(j, &b[i]);*/
fclose(fp);
}
-
-
diff --git a/SRC/dscatter.c b/SRC/dscatter.c
index af18ea8..00adbdf 100644
--- a/SRC/dscatter.c
+++ b/SRC/dscatter.c
@@ -14,10 +14,13 @@ at the top-level directory.
* \brief Scatter the computed blocks into LU destination.
*
* <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 1, 2014
*
+ * Modified:
+ * September 18, 2017, enable SIMD vectorized scatter operation.
+ *
*/
#include <math.h>
#include "superlu_ddefs.h"
@@ -112,9 +115,9 @@ dscatter_l (
int_t iukp, /* point to destination supernode's index[] */
int_t* xsup,
int klst,
- int nbrow,
+ int nbrow, /* LDA of the block in tempv[] */
int_t lptr, /* Input, point to index[] location of block L(i,k) */
- int temp_nbrow, /* number of rows in block L(i,k) */
+ int temp_nbrow, /* number of rows of source block L(i,k) */
int_t* usub,
int_t* lsub,
double *tempv,
@@ -126,7 +129,7 @@ dscatter_l (
int_t rel, i, segsize, jj;
double *nzval;
int_t *index = Lrowind_bc_ptr[ljb];
- int_t ldv = index[1]; /* LDA of the dest lusup. */
+ int_t ldv = index[1]; /* LDA of the destination lusup. */
int_t lptrj = BC_HEADER;
int_t luptrj = 0;
int_t ijb = index[lptrj];
@@ -139,36 +142,43 @@ dscatter_l (
}
/*
- * Build indirect table. This is needed because the
- * indices are not sorted for the L blocks.
+ * Build indirect table. This is needed because the indices are not sorted
+ * in the L blocks.
*/
int_t fnz = FstBlockC (ib);
int_t dest_nbrow;
lptrj += LB_DESCRIPTOR;
dest_nbrow=index[lptrj - 1];
- for (i = 0; i < dest_nbrow; ++i)
- {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (i = 0; i < dest_nbrow; ++i) {
rel = index[lptrj + i] - fnz;
indirect_thread[rel] = i;
}
- /* can be precalculated */
- for (i = 0; i < temp_nbrow; ++i)
- {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ /* can be precalculated? */
+ for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
rel = lsub[lptr + i] - fnz;
indirect2[i] =indirect_thread[rel];
}
- nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Dest. block L(i,j) */
- for (jj = 0; jj < nsupc; ++jj)
- {
+ nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
+#ifdef __INTEL_COMPILER
+#pragma ivdep
+#endif
+ for (jj = 0; jj < nsupc; ++jj) {
segsize = klst - usub[iukp + jj];
- if (segsize)
- {
- for (i = 0; i < temp_nbrow; ++i)
- {
+ if (segsize) {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (i = 0; i < temp_nbrow; ++i) {
nzval[indirect2[i]] -= tempv[i];
}
tempv += nbrow;
@@ -186,9 +196,9 @@ dscatter_u (int ib,
int_t iukp,
int_t * xsup,
int klst,
- int nbrow,
- int_t lptr,
- int temp_nbrow,
+ int nbrow, /* LDA of the block in tempv[] */
+ int_t lptr, /* point to index location of block L(i,k) */
+ int temp_nbrow, /* number of rows of source block L(i,k) */
int_t* lsub,
int_t* usub,
double* tempv,
@@ -208,8 +218,8 @@ dscatter_u (int ib,
int_t lib = LBi (ib, grid);
int_t *index = Ufstnz_br_ptr[lib];
- /* Reinitilize the pointers to the begining of the
- * k-th column/row of L/U factors.
+ /* Reinitilize the pointers to the begining of the k-th column/row of
+ * L/U factors.
* usub[] - index array for panel U(k,:)
*/
int_t iuip_lib, ruip_lib;
@@ -217,38 +227,32 @@ dscatter_u (int ib,
ruip_lib = 0;
int_t ijb = index[iuip_lib];
- while (ijb < jb) /* Search for dest block. */
- {
+ while (ijb < jb) { /* Search for destination block. */
ruip_lib += index[iuip_lib + 1];
// printf("supersize[%ld] \t:%ld \n",ijb,SuperSize( ijb ) );
iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
ijb = index[iuip_lib];
}
- /* Skip descriptor. Now point to fstnz index of
- block U(i,j). */
+ /* Skip descriptor. Now point to fstnz index of block U(i,j). */
iuip_lib += UB_DESCRIPTOR;
// tempv = bigV + (cum_nrow + cum_ncol*nbrow);
- for (jj = 0; jj < nsupc; ++jj)
- {
+ for (jj = 0; jj < nsupc; ++jj) {
segsize = klst - usub[iukp + jj];
fnz = index[iuip_lib++];
- if (segsize) /* Nonzero segment in U(k.j). */
- {
+ if (segsize) { /* Nonzero segment in U(k,j). */
ucol = &Unzval_br_ptr[lib][ruip_lib];
// printf("========Entering loop=========\n");
- for (i = 0; i < temp_nbrow; ++i)
- {
-
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (i = 0; i < temp_nbrow; ++i) {
rel = lsub[lptr + i] - fnz;
// printf("%d %d %d %d %d \n",lptr,i,fnz,temp_nbrow,nbrow );
// printf("hello ucol[%d] %d %d : \n",rel,lsub[lptr + i],fnz);
-
ucol[rel] -= tempv[i];
- // printf("hello\n");
-
#ifdef PI_DEBUG
double zz = 0.0;
if (!(*(long *) &zz == *(long *) &tempv[i]))
@@ -256,15 +260,16 @@ dscatter_u (int ib,
ucol[rel]);
//printing triplets (location??, old value, new value ) if none of them is zero
#endif
- } /* for i=0..temp_nbropw */
- tempv += nbrow;
+ } /* for i = 0:temp_nbropw */
+ tempv += nbrow; /* Jump LDA to next column */
#ifdef PI_DEBUG
// printf("\n");
#endif
- } /*ig segsize */
+ } /* if segsize */
+
ruip_lib += ilst - fnz;
- } /*for jj=0:nsupc */
+ } /* for jj = 0:nsupc */
#ifdef PI_DEBUG
// printf("\n");
#endif
diff --git a/SRC/get_perm_c.c b/SRC/get_perm_c.c
index 14b208d..4353ca4 100644
--- a/SRC/get_perm_c.c
+++ b/SRC/get_perm_c.c
@@ -23,6 +23,7 @@ at the top-level directory.
*/
#include "superlu_ddefs.h"
+#include "colamd.h"
void
@@ -102,6 +103,39 @@ get_metis(
SUPERLU_FREE(perm);
}
+void
+get_colamd_dist(
+ const int m, /* number of rows in matrix A. */
+ const int n, /* number of columns in matrix A. */
+ const int nnz,/* number of nonzeros in matrix A. */
+ int_t *colptr, /* column pointer of size n+1 for matrix A. */
+ int_t *rowind, /* row indices of size nz for matrix A. */
+ int_t *perm_c /* out - the column permutation vector. */
+ )
+{
+ int Alen, *A, i, info, *p;
+ double knobs[COLAMD_KNOBS];
+ int stats[COLAMD_STATS];
+
+ Alen = colamd_recommended(nnz, m, n);
+
+ colamd_set_defaults(knobs);
+
+ if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) )
+ ABORT("Malloc fails for A[]");
+ if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) )
+ ABORT("Malloc fails for p[]");
+ for (i = 0; i <= n; ++i) p[i] = colptr[i];
+ for (i = 0; i < nnz; ++i) A[i] = rowind[i];
+ info = colamd(m, n, Alen, A, p, knobs, stats);
+ if ( info == FALSE ) ABORT("COLAMD failed");
+
+ for (i = 0; i < n; ++i) perm_c[p[i]] = i;
+
+ SUPERLU_FREE(A);
+ SUPERLU_FREE(p);
+}
+
/*! \brief
*
* <pre>
@@ -472,6 +506,13 @@ get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c)
#endif
break;
+ case (COLAMD): /* Approximate minimum degree column ordering. */
+ get_colamd_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
+ perm_c);
+#if ( PRNTlevel>=1 )
+ printf(".. Use approximate minimum degree column ordering.\n");
+#endif
+ return;
case METIS_AT_PLUS_A: /* METIS ordering on A'+A */
if ( m != n ) ABORT("Matrix is not square");
at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind,
diff --git a/SRC/mc64ad_dist.c b/SRC/mc64ad_dist.c
deleted file mode 100644
index bf722fd..0000000
--- a/SRC/mc64ad_dist.c
+++ /dev/null
@@ -1,2654 +0,0 @@
-/* mc64ad.f -- translated by f2c (version 20100827).
- You must link the resulting object file with libf2c:
- on Microsoft Windows system, link with libf2c.lib;
- on Linux or Unix systems, link with .../path/to/libf2c.a -lm
- or, if you install libf2c.a in a standard place, with -lf2c -lm
- -- in that order, at the end of the command line, as in
- cc *.o -lf2c -lm
- Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
-
- http://www.netlib.org/f2c/libf2c.zip
-*/
-
-#include "superlu_ddefs.h"
-
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define min(a,b) ((a) < (b)) ? (a) : (b)
-
-/* Table of constant values */
-
-static int_t c__1 = 1;
-static int_t c__2 = 2;
-
-/*! @file
- * \brief Permute large entries to the main diagonal
- */
-/* CCCC COPYRIGHT (c) 1999 Council for the Central Laboratory of the */
-/* CCCC Research Councils. All rights reserved. */
-/* CCCC PACKAGE MC64A/AD */
-/* CCCC AUTHORS Iain Duff (i.duff at rl.ac.uk) and Jacko Koster (jak at ii.uib.no) */
-/* CCCC LAST UPDATE 20/09/99 */
-/* CCCC */
-/* *** Conditions on external use *** */
-
-/* The user shall acknowledge the contribution of this */
-/* package in any publication of material dependent upon the use of */
-/* the package. The user shall use reasonable endeavours to notify */
-/* the authors of the package of this publication. */
-
-/* The user can modify this code but, at no time */
-/* shall the right or title to all or any part of this package pass */
-/* to the user. The user shall make available free of charge */
-/* to the authors for any purpose all information relating to any */
-/* alteration or addition made to this package for the purposes of */
-/* extending the capabilities or enhancing the performance of this */
-/* package. */
-
-/* The user shall not pass this code directly to a third party without the */
-/* express prior consent of the authors. Users wanting to licence their */
-/* own copy of these routines should send email to hsl at aeat.co.uk */
-
-/* None of the comments from the Copyright notice up to and including this */
-/* one shall be removed or altered in any way. */
-/* ********************************************************************** */
-/* </pre>
- */
-
-/* Subroutine */ int_t mc64id_dist(int_t *icntl)
-{
- int_t i__;
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* Purpose */
-/* ======= */
-
-/* The components of the array ICNTL control the action of MC64A/AD. */
-/* Default values for these are set in this subroutine. */
-
-/* Parameters */
-/* ========== */
-
-
-/* Local variables */
-
-/* ICNTL(1) has default value 6. */
-/* It is the output stream for error messages. If it */
-/* is negative, these messages will be suppressed. */
-
-/* ICNTL(2) has default value 6. */
-/* It is the output stream for warning messages. */
-/* If it is negative, these messages are suppressed. */
-
-/* ICNTL(3) has default value -1. */
-/* It is the output stream for monitoring printing. */
-/* If it is negative, these messages are suppressed. */
-
-/* ICNTL(4) has default value 0. */
-/* If left at the defaut value, the incoming data is checked for */
-/* out-of-range indices and duplicates. Setting ICNTL(4) to any */
-/* other will avoid the checks but is likely to cause problems */
-/* later if out-of-range indices or duplicates are present. */
-/* The user should only set ICNTL(4) non-zero, if the data is */
-/* known to avoid these problems. */
-
-/* ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to */
-/* zero in this routine. */
-/* Initialization of the ICNTL array. */
- /* Parameter adjustments */
- --icntl;
-
- /* Function Body */
- icntl[1] = 6;
- icntl[2] = 6;
- icntl[3] = -1;
- for (i__ = 4; i__ <= 10; ++i__) {
- icntl[i__] = 0;
-/* L10: */
- }
- return 0;
-} /* mc64id_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64ad_dist(int_t *job, int_t *n, int_t *ne, int_t *
- ip, int_t *irn, double *a, int_t *num, int_t *cperm,
- int_t *liw, int_t *iw, int_t *ldw, double *dw, int_t *
- icntl, int_t *info)
-{
- /* System generated locals */
- int_t i__1, i__2;
- double d__1, d__2;
-
- /* Builtin functions */
- double log(double);
-
- /* Local variables */
- int_t i__, j, k;
- double fact, rinf;
-
- extern /* Subroutine */ int_t mc21ad_dist(int_t *, int_t *, int_t *,
- int_t *, int_t *, int_t *, int_t *, int_t *),
- mc64bd_dist(int_t *, int_t *, int_t *, int_t *, double *, int_t
- *, int_t *, int_t *, int_t *, int_t *, int_t *, double *),
- mc64rd_dist(int_t *, int_t *, int_t *, int_t *, double *),
- mc64sd_dist(int_t *, int_t *, int_t *, int_t *
- , double *, int_t *, int_t *, int_t *, int_t *,
- int_t *, int_t *, int_t *, int_t *, int_t *),
- mc64wd_dist(int_t *, int_t *, int_t *, int_t *, double *, int_t
- *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t
- *, double *, double *);
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* Purpose */
-/* ======= */
-
-/*! \brief
- * <pre>
- * This subroutine attempts to find a column permutation for an NxN
- * sparse matrix A = {a_ij} that makes the permuted matrix have N
- * entries on its diagonal.
- * If the matrix is structurally nonsingular, the subroutine optionally
- * returns a column permutation that maximizes the smallest element
- * on the diagonal, maximizes the sum of the diagonal entries, or
- * maximizes the product of the diagonal entries of the permuted matrix.
- * For the latter option, the subroutine also finds scaling factors
- * that may be used to scale the matrix so that the nonzero diagonal
- * entries of the permuted matrix are one in absolute value and all the
- * off-diagonal entries are less than or equal to one in absolute value.
- * The natural logarithms of the scaling factors u(i), i=1..N, for the
- * rows and v(j), j=1..N, for the columns are returned so that the
- * scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j).
- * </pre>
- */
-
-/* Parameters */
-/* ========== */
-
-
-/* JOB is an INT_T variable which must be set by the user to */
-/* control the action. It is not altered by the subroutine. */
-/* Possible values for JOB are: */
-/* 1 Compute a column permutation of the matrix so that the */
-/* permuted matrix has as many entries on its diagonal as possible. */
-/* The values on the diagonal are of arbitrary size. HSL subroutine */
-/* MC21A/AD is used for this. See [1]. */
-/* 2 Compute a column permutation of the matrix so that the smallest */
-/* value on the diagonal of the permuted matrix is maximized. */
-/* See [3]. */
-/* 3 Compute a column permutation of the matrix so that the smallest */
-/* value on the diagonal of the permuted matrix is maximized. */
-/* The algorithm differs from the one used for JOB = 2 and may */
-/* have quite a different performance. See [2]. */
-/* 4 Compute a column permutation of the matrix so that the sum */
-/* of the diagonal entries of the permuted matrix is maximized. */
-/* See [3]. */
-/* 5 Compute a column permutation of the matrix so that the product */
-/* of the diagonal entries of the permuted matrix is maximized */
-/* and vectors to scale the matrix so that the nonzero diagonal */
-/* entries of the permuted matrix are one in absolute value and */
-/* all the off-diagonal entries are less than or equal to one in */
-/* absolute value. See [3]. */
-/* Restriction: 1 <= JOB <= 5. */
-
-/* N is an INT_T variable which must be set by the user to the */
-/* order of the matrix A. It is not altered by the subroutine. */
-/* Restriction: N >= 1. */
-
-/* NE is an INT_T variable which must be set by the user to the */
-/* number of entries in the matrix. It is not altered by the */
-/* subroutine. */
-/* Restriction: NE >= 1. */
-
-/* IP is an INT_T array of length N+1. */
-/* IP(J), J=1..N, must be set by the user to the position in array IRN */
-/* of the first row index of an entry in column J. IP(N+1) must be set */
-/* to NE+1. It is not altered by the subroutine. */
-
-/* IRN is an INT_T array of length NE. */
-/* IRN(K), K=1..NE, must be set by the user to hold the row indices of */
-/* the entries of the matrix. Those belonging to column J must be */
-/* stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering */
-/* of the row indices within each column is unimportant. Repeated */
-/* entries are not allowed. The array IRN is not altered by the */
-/* subroutine. */
-
-/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
-/* The user must set A(K), K=1..NE, to the numerical value of the */
-/* entry that corresponds to IRN(K). */
-/* It is not used by the subroutine when JOB = 1. */
-/* It is not altered by the subroutine. */
-
-/* NUM is an INT_T variable that need not be set by the user. */
-/* On successful exit, NUM will be the number of entries on the */
-/* diagonal of the permuted matrix. */
-/* If NUM < N, the matrix is structurally singular. */
-
-/* CPERM is an INT_T array of length N that need not be set by the */
-/* user. On successful exit, CPERM contains the column permutation. */
-/* Column CPERM(J) of the original matrix is column J in the permuted */
-/* matrix, J=1..N. */
-
-/* LIW is an INT_T variable that must be set by the user to */
-/* the dimension of array IW. It is not altered by the subroutine. */
-/* Restriction: */
-/* JOB = 1 : LIW >= 5N */
-/* JOB = 2 : LIW >= 4N */
-/* JOB = 3 : LIW >= 10N + NE */
-/* JOB = 4 : LIW >= 5N */
-/* JOB = 5 : LIW >= 5N */
-
-/* IW is an INT_T array of length LIW that is used for workspace. */
-
-/* LDW is an INT_T variable that must be set by the user to the */
-/* dimension of array DW. It is not altered by the subroutine. */
-/* Restriction: */
-/* JOB = 1 : LDW is not used */
-/* JOB = 2 : LDW >= N */
-/* JOB = 3 : LDW >= NE */
-/* JOB = 4 : LDW >= 2N + NE */
-/* JOB = 5 : LDW >= 3N + NE */
-
-/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW */
-/* that is used for workspace. If JOB = 5, on return, */
-/* DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N. */
-
-/* ICNTL is an INT_T array of length 10. Its components control the */
-/* output of MC64A/AD and must be set by the user before calling */
-/* MC64A/AD. They are not altered by the subroutine. */
-
-/* ICNTL(1) must be set to specify the output stream for */
-/* error messages. If ICNTL(1) < 0, messages are suppressed. */
-/* The default value set by MC46I/ID is 6. */
-
-/* ICNTL(2) must be set by the user to specify the output stream for */
-/* warning messages. If ICNTL(2) < 0, messages are suppressed. */
-/* The default value set by MC46I/ID is 6. */
-
-/* ICNTL(3) must be set by the user to specify the output stream for */
-/* diagnostic messages. If ICNTL(3) < 0, messages are suppressed. */
-/* The default value set by MC46I/ID is -1. */
-
-/* ICNTL(4) must be set by the user to a value other than 0 to avoid */
-/* checking of the input data. */
-/* The default value set by MC46I/ID is 0. */
-
-/* INFO is an INT_T array of length 10 which need not be set by the */
-/* user. INFO(1) is set non-negative to indicate success. A negative */
-/* value is returned if an error occurred, a positive value if a */
-/* warning occurred. INFO(2) holds further information on the error. */
-/* On exit from the subroutine, INFO(1) will take one of the */
-/* following values: */
-/* 0 : successful entry (for structurally nonsingular matrix). */
-/* +1 : successful entry (for structurally singular matrix). */
-/* +2 : the returned scaling factors are large and may cause */
-/* overflow when used to scale the matrix. */
-/* (For JOB = 5 entry only.) */
-/* -1 : JOB < 1 or JOB > 5. Value of JOB held in INFO(2). */
-/* -2 : N < 1. Value of N held in INFO(2). */
-/* -3 : NE < 1. Value of NE held in INFO(2). */
-/* -4 : the defined length LIW violates the restriction on LIW. */
-/* Value of LIW required given by INFO(2). */
-/* -5 : the defined length LDW violates the restriction on LDW. */
-/* Value of LDW required given by INFO(2). */
-/* -6 : entries are found whose row indices are out of range. INFO(2) */
-/* contains the index of a column in which such an entry is found. */
-/* -7 : repeated entries are found. INFO(2) contains the index of a */
-/* column in which such entries are found. */
-/* INFO(3) to INFO(10) are not currently used and are set to zero by */
-/* the routine. */
-
-/* References: */
-/* [1] I. S. Duff, (1981), */
-/* "Algorithm 575. Permutations for a zero-free diagonal", */
-/* ACM Trans. Math. Software 7(3), 387-390. */
-/* [2] I. S. Duff and J. Koster, (1998), */
-/* "The design and use of algorithms for permuting large */
-/* entries to the diagonal of sparse matrices", */
-/* SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901. */
-/* [3] I. S. Duff and J. Koster, (1999), */
-/* "On algorithms for permuting large entries to the diagonal */
-/* of sparse matrices", */
-/* Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England. */
-/* Local variables and parameters */
-/* External routines and functions */
-/* EXTERNAL FD05AD */
-/* DOUBLE PRECISION FD05AD */
-/* Intrinsic functions */
-/* Set RINF to largest positive real number (infinity) */
-/* XSL RINF = FD05AD(5) */
- /* Parameter adjustments */
- --cperm;
- --ip;
- --a;
- --irn;
- --iw;
- --dw;
- --icntl;
- --info;
-
- /* Function Body */
- rinf = dmach_dist("Overflow");
-/* Check value of JOB */
- if (*job < 1 || *job > 5) {
- info[1] = -1;
- info[2] = *job;
- if (icntl[1] >= 0) {
- printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
- " because JOB = " IFMT "\n", info[1], *job);
- }
- goto L99;
- }
-/* Check value of N */
- if (*n < 1) {
- info[1] = -2;
- info[2] = *n;
- if (icntl[1] >= 0) {
- printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
- " because N = " IFMT "\n", info[1], *job);
- }
- goto L99;
- }
-/* Check value of NE */
- if (*ne < 1) {
- info[1] = -3;
- info[2] = *ne;
- if (icntl[1] >= 0) {
- printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
- " because NE = " IFMT "\n", info[1], *job);
- }
- goto L99;
- }
-/* Check LIW */
- if (*job == 1) {
- k = *n * 5;
- }
- if (*job == 2) {
- k = *n << 2;
- }
- if (*job == 3) {
- k = *n * 10 + *ne;
- }
- if (*job == 4) {
- k = *n * 5;
- }
- if (*job == 5) {
- k = *n * 5;
- }
- if (*liw < k) {
- info[1] = -4;
- info[2] = k;
- if (icntl[1] >= 0) {
- printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
- " LIW too small, must be at least " IFMT "\n", info[1], k);
- }
- goto L99;
- }
-/* Check LDW */
-/* If JOB = 1, do not check */
- if (*job > 1) {
- if (*job == 2) {
- k = *n;
- }
- if (*job == 3) {
- k = *ne;
- }
- if (*job == 4) {
- k = (*n << 1) + *ne;
- }
- if (*job == 5) {
- k = *n * 3 + *ne;
- }
- if (*ldw < k) {
- info[1] = -5;
- info[2] = k;
- if (icntl[1] >= 0) {
- printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
- " LDW too small, must be at least " IFMT "\n", info[1], k);
- }
- goto L99;
- }
- }
- if (icntl[4] == 0) {
-/* Check row indices. Use IW(1:N) as workspace */
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- iw[i__] = 0;
-/* L3: */
- }
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- i__ = irn[k];
-/* Check for row indices that are out of range */
- if (i__ < 1 || i__ > *n) {
- info[1] = -6;
- info[2] = j;
- if (icntl[1] >= 0) {
- printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
- " Column " IFMT
- " contains an entry with invalid row index " IFMT "\n",
- info[1], j, i__);
- }
- goto L99;
- }
-/* Check for repeated row indices within a column */
- if (iw[i__] == j) {
- info[1] = -7;
- info[2] = j;
- if (icntl[1] >= 0) {
- printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT
- " Column " IFMT
- " contains two or more entries with row index " IFMT "\n",
- info[1], j, i__);
- }
- goto L99;
- } else {
- iw[i__] = j;
- }
-/* L4: */
- }
-/* L6: */
- }
- }
-/* Print diagnostics on input */
- if (icntl[3] >= 0) {
- printf(" ****** Input parameters for MC64A/AD: JOB = " IFMT ","
- " N = " IFMT ", NE = " IFMT "\n", *job, *n, *ne);
- printf(" IP(1:N+1) = ");
- for (j=1; j<=(*n+1); ++j) {
- printf(IFMT, ip[j]);
- if (j%8 == 0) printf("\n");
- }
- printf("\n IRN(1:NE) = ");
- for (j=1; j<=(*ne); ++j) {
- printf(IFMT, irn[j]);
- if (j%8 == 0) printf("\n");
- }
- printf("\n");
-
- if (*job > 1) {
- printf(" A(1:NE) = ");
- for (j=1; j<=(*ne); ++j) {
- printf("%f14.4", a[j]);
- if (j%4 == 0) printf("\n");
- }
- printf("\n");
- }
- }
-/* Set components of INFO to zero */
- for (i__ = 1; i__ <= 10; ++i__) {
- info[i__] = 0;
-/* L8: */
- }
-/* Compute maximum matching with MC21A/AD */
- if (*job == 1) {
-/* Put length of column J in IW(J) */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- iw[j] = ip[j + 1] - ip[j];
-/* L10: */
- }
-/* IW(N+1:5N) is workspace */
-#if 0
- mc21ad_(n, &irn[1], ne, &ip[1], &iw[1], &cperm[1], num, &iw[*n+1]);
-#else
- printf(" ****** Warning from MC64A/AD. Need to link mc21ad.\n");
-#endif
- goto L90;
- }
-/* Compute bottleneck matching */
- if (*job == 2) {
-/* IW(1:5N), DW(1:N) are workspaces */
- mc64bd_dist(n, ne, &ip[1], &irn[1], &a[1], &cperm[1], num,
- &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1],
- &dw[1]);
- goto L90;
- }
-/* Compute bottleneck matching */
- if (*job == 3) {
-/* Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE */
- i__1 = *ne;
- for (k = 1; k <= i__1; ++k) {
- iw[k] = irn[k];
- dw[k] = (d__1 = a[k], abs(d__1));
-/* L20: */
- }
-/* Sort entries in each column by decreasing value. */
- mc64rd_dist(n, ne, &ip[1], &iw[1], &dw[1]);
-/* IW(NE+1:NE+10N) is workspace */
- mc64sd_dist(n, ne, &ip[1], &iw[1], &dw[1], &cperm[1], num,
- &iw[*ne + 1], &iw[*ne + *n + 1], &iw[*ne + (*n << 1) + 1],
- &iw[*ne + *n * 3 + 1], &iw[*ne + (*n << 2) + 1],
- &iw[*ne + *n * 5 + 1], &iw[*ne + *n * 6 + 1]);
- goto L90;
- }
- if (*job == 4) {
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- fact = 0.;
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- if ((d__1 = a[k], abs(d__1)) > fact) {
- fact = (d__2 = a[k], abs(d__2));
- }
-/* L30: */
- }
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- dw[(*n << 1) + k] = fact - (d__1 = a[k], abs(d__1));
-/* L40: */
- }
-/* L50: */
- }
-/* B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces */
- mc64wd_dist(n, ne, &ip[1], &irn[1], &dw[(*n << 1) + 1], &cperm[1],
- num, &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1],
- &iw[*n * 3 + 1], &iw[(*n << 2) + 1], &dw[1], &dw[*n + 1]);
- goto L90;
- }
- if (*job == 5) {
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- fact = 0.;
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- dw[*n * 3 + k] = (d__1 = a[k], abs(d__1));
- if (dw[*n * 3 + k] > fact) {
- fact = dw[*n * 3 + k];
- }
-/* L60: */
- }
- dw[(*n << 1) + j] = fact;
- if (fact != 0.) {
- fact = log(fact);
- } else {
- fact = rinf / *n;
- }
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- if (dw[*n * 3 + k] != 0.) {
- dw[*n * 3 + k] = fact - log(dw[*n * 3 + k]);
- } else {
- dw[*n * 3 + k] = rinf / *n;
- }
-/* L70: */
- }
-/* L75: */
- }
-/* B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces */
- mc64wd_dist(n, ne, &ip[1], &irn[1], &dw[*n * 3 + 1], &cperm[1],
- num, &iw[1], &iw[*n + 1], &iw[(*n << 1) + 1],
- &iw[*n * 3 + 1], &iw[(*n << 2) + 1], &dw[1], &dw[*n + 1]);
- if (*num == *n) {
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- if (dw[(*n << 1) + j] != 0.) {
- dw[*n + j] -= log(dw[(*n << 1) + j]);
- } else {
- dw[*n + j] = 0.;
- }
-/* L80: */
- }
- }
-/* Check size of scaling factors */
- fact = log(rinf) * .5f;
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- if (dw[j] < fact && dw[*n + j] < fact) {
- goto L86;
- }
- info[1] = 2;
- goto L90;
-L86:
- ;
- }
-/* GO TO 90 */
- }
-L90:
- if (info[1] == 0 && *num < *n) {
-/* Matrix is structurally singular, return with warning */
- info[1] = 1;
- if (icntl[2] >= 0) {
- printf(" ****** Warning from MC64A/AD. INFO(1) = " IFMT
- " The matrix is structurally singular.\n", info[1]);
- }
- }
- if (info[1] == 2) {
-/* Scaling factors are large, return with warning */
- if (icntl[2] >= 0) {
- printf(" ****** Warning from MC64A/AD. INFO(1) = " IFMT "\n"
- " Some scaling factors may be too large.\n", info[1]);
- }
- }
-/* Print diagnostics on output */
- if (icntl[3] >= 0) {
- printf(" ****** Output parameters for MC64A/AD: INFO(1:2) = " IFMT IFMT "\n",
- info[1], info[2]);
- printf(" NUM = " IFMT, *num);
- printf(" CPERM(1:N) = ");
- for (j=1; j<=*n; ++j) {
- printf(IFMT, cperm[j]);
- if (j%8 == 0) printf("\n");
- }
- if (*job == 5) {
- printf("\n DW(1:N) = ");
- for (j=1; j<=*n; ++j) {
- printf("%11.3f", dw[j]);
- if (j%5 == 0) printf("\n");
- }
- printf("\n DW(N+1:2N) = ");
- for (j=1; j<=*n; ++j) {
- printf("%11.3f", dw[*n+j]);
- if (j%5 == 0) printf("\n");
- }
- printf("\n");
- }
- }
-/* Return from subroutine. */
-L99:
- return 0;
-} /* mc64ad_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64bd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
- irn, double *a, int_t *iperm, int_t *num, int_t *jperm,
- int_t *pr, int_t *q, int_t *l, double *d__)
-{
- /* System generated locals */
- int_t i__1, i__2, i__3;
- double d__1, d__2, d__3;
-
- /* Local variables */
- int_t i__, j, k;
- double a0;
- int_t i0, q0;
- double ai, di;
- int_t ii, jj, kk;
- double bv;
- int_t up;
- double dq0;
- int_t kk1, kk2;
- double csp;
- int_t isp, jsp, low;
- double dnew;
- int_t jord, qlen, idum, jdum;
- double rinf;
- extern /* Subroutine */ int_t mc64dd_dist(int_t *, int_t *, int_t *,
- double *, int_t *, int_t *), mc64ed_dist(int_t *, int_t *,
- int_t *, double *, int_t *, int_t *), mc64fd_dist(int_t *
- , int_t *, int_t *, int_t *, double *, int_t *, int_t *);
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* N, NE, IP, IRN are described in MC64A/AD. */
-/* A is a REAL (DOUBLE PRECISION in the D-version) array of length */
-/* NE. A(K), K=1..NE, must be set to the value of the entry */
-/* that corresponds to IRN(K). It is not altered. */
-/* IPERM is an INT_T array of length N. On exit, it contains the */
-/* matching: IPERM(I) = 0 or row I is matched to column IPERM(I). */
-/* NUM is INT_T variable. On exit, it contains the cardinality of the */
-/* matching stored in IPERM. */
-/* IW is an INT_T work array of length 4N. */
-/* DW is a REAL (DOUBLE PRECISION in D-version) work array of length N. */
-/* Local variables */
-/* Local parameters */
-/* Intrinsic functions */
-/* External subroutines and/or functions */
-/* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DMACH */
-/* DOUBLE PRECISION FD05AD, DMACH */
-/* Set RINF to largest positive real number */
-/* XSL RINF = FD05AD(5) */
- /* Parameter adjustments */
- --d__;
- --l;
- --q;
- --pr;
- --jperm;
- --iperm;
- --ip;
- --a;
- --irn;
-
- /* Function Body */
- rinf = dmach_dist("Overflow");
-/* Initialization */
- *num = 0;
- bv = rinf;
- i__1 = *n;
- for (k = 1; k <= i__1; ++k) {
- iperm[k] = 0;
- jperm[k] = 0;
- pr[k] = ip[k];
- d__[k] = 0.;
-/* L10: */
- }
-/* Scan columns of matrix; */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- a0 = -1.;
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- i__ = irn[k];
- ai = (d__1 = a[k], abs(d__1));
- if (ai > d__[i__]) {
- d__[i__] = ai;
- }
- if (jperm[j] != 0) {
- goto L30;
- }
- if (ai >= bv) {
- a0 = bv;
- if (iperm[i__] != 0) {
- goto L30;
- }
- jperm[j] = i__;
- iperm[i__] = j;
- ++(*num);
- } else {
- if (ai <= a0) {
- goto L30;
- }
- a0 = ai;
- i0 = i__;
- }
-L30:
- ;
- }
- if (a0 != -1. && a0 < bv) {
- bv = a0;
- if (iperm[i0] != 0) {
- goto L20;
- }
- iperm[i0] = j;
- jperm[j] = i0;
- ++(*num);
- }
-L20:
- ;
- }
-/* Update BV with smallest of all the largest maximum absolute values */
-/* of the rows. */
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
-/* Computing MIN */
- d__1 = bv, d__2 = d__[i__];
- bv = min(d__1,d__2);
-/* L25: */
- }
- if (*num == *n) {
- goto L1000;
- }
-/* Rescan unassigned columns; improve initial assignment */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- if (jperm[j] != 0) {
- goto L95;
- }
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- i__ = irn[k];
- ai = (d__1 = a[k], abs(d__1));
- if (ai < bv) {
- goto L50;
- }
- if (iperm[i__] == 0) {
- goto L90;
- }
- jj = iperm[i__];
- kk1 = pr[jj];
- kk2 = ip[jj + 1] - 1;
- if (kk1 > kk2) {
- goto L50;
- }
- i__3 = kk2;
- for (kk = kk1; kk <= i__3; ++kk) {
- ii = irn[kk];
- if (iperm[ii] != 0) {
- goto L70;
- }
- if ((d__1 = a[kk], abs(d__1)) >= bv) {
- goto L80;
- }
-L70:
- ;
- }
- pr[jj] = kk2 + 1;
-L50:
- ;
- }
- goto L95;
-L80:
- jperm[jj] = ii;
- iperm[ii] = jj;
- pr[jj] = kk + 1;
-L90:
- ++(*num);
- jperm[j] = i__;
- iperm[i__] = j;
- pr[j] = k + 1;
-L95:
- ;
- }
- if (*num == *n) {
- goto L1000;
- }
-/* Prepare for main loop */
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- d__[i__] = -1.;
- l[i__] = 0;
-/* L99: */
- }
-/* Main loop ... each pass round this loop is similar to Dijkstra's */
-/* algorithm for solving the single source shortest path problem */
- i__1 = *n;
- for (jord = 1; jord <= i__1; ++jord) {
- if (jperm[jord] != 0) {
- goto L100;
- }
- qlen = 0;
- low = *n + 1;
- up = *n + 1;
-/* CSP is cost of shortest path to any unassigned row */
-/* ISP is matrix position of unassigned row element in shortest path */
-/* JSP is column index of unassigned row element in shortest path */
- csp = -1.;
-/* Build shortest path tree starting from unassigned column JORD */
- j = jord;
- pr[j] = -1;
-/* Scan column J */
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- i__ = irn[k];
- dnew = (d__1 = a[k], abs(d__1));
- if (csp >= dnew) {
- goto L115;
- }
- if (iperm[i__] == 0) {
-/* Row I is unassigned; update shortest path info */
- csp = dnew;
- isp = i__;
- jsp = j;
- if (csp >= bv) {
- goto L160;
- }
- } else {
- d__[i__] = dnew;
- if (dnew >= bv) {
-/* Add row I to Q2 */
- --low;
- q[low] = i__;
- } else {
-/* Add row I to Q, and push it */
- ++qlen;
- l[i__] = qlen;
- mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__1);
- }
- jj = iperm[i__];
- pr[jj] = j;
- }
-L115:
- ;
- }
- i__2 = *num;
- for (jdum = 1; jdum <= i__2; ++jdum) {
-/* If Q2 is empty, extract new rows from Q */
- if (low == up) {
- if (qlen == 0) {
- goto L160;
- }
- i__ = q[1];
- if (csp >= d__[i__]) {
- goto L160;
- }
- bv = d__[i__];
- i__3 = *n;
- for (idum = 1; idum <= i__3; ++idum) {
- mc64ed_dist(&qlen, n, &q[1], &d__[1], &l[1], &c__1);
- l[i__] = 0;
- --low;
- q[low] = i__;
- if (qlen == 0) {
- goto L153;
- }
- i__ = q[1];
- if (d__[i__] != bv) {
- goto L153;
- }
-/* L152: */
- }
-/* End of dummy loop; this point is never reached */
- }
-/* Move row Q0 */
-L153:
- --up;
- q0 = q[up];
- dq0 = d__[q0];
- l[q0] = up;
-/* Scan column that matches with row Q0 */
- j = iperm[q0];
- i__3 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__3; ++k) {
- i__ = irn[k];
-/* Update D(I) */
- if (l[i__] >= up) {
- goto L155;
- }
-/* Computing MIN */
- d__2 = dq0, d__3 = (d__1 = a[k], abs(d__1));
- dnew = min(d__2,d__3);
- if (csp >= dnew) {
- goto L155;
- }
- if (iperm[i__] == 0) {
-/* Row I is unassigned; update shortest path info */
- csp = dnew;
- isp = i__;
- jsp = j;
- if (csp >= bv) {
- goto L160;
- }
- } else {
- di = d__[i__];
- if (di >= bv || di >= dnew) {
- goto L155;
- }
- d__[i__] = dnew;
- if (dnew >= bv) {
-/* Delete row I from Q (if necessary); add row I to Q2 */
- if (di != -1.) {
- mc64fd_dist(&l[i__], &qlen, n, &q[1], &d__[1], &l[1],
- &c__1);
- }
- l[i__] = 0;
- --low;
- q[low] = i__;
- } else {
-/* Add row I to Q (if necessary); push row I up Q */
- if (di == -1.) {
- ++qlen;
- l[i__] = qlen;
- }
- mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__1);
- }
-/* Update tree */
- jj = iperm[i__];
- pr[jj] = j;
- }
-L155:
- ;
- }
-/* L150: */
- }
-/* If CSP = MINONE, no augmenting path is found */
-L160:
- if (csp == -1.) {
- goto L190;
- }
-/* Update bottleneck value */
- bv = min(bv,csp);
-/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */
- ++(*num);
- i__ = isp;
- j = jsp;
- i__2 = *num + 1;
- for (jdum = 1; jdum <= i__2; ++jdum) {
- i0 = jperm[j];
- jperm[j] = i__;
- iperm[i__] = j;
- j = pr[j];
- if (j == -1) {
- goto L190;
- }
- i__ = i0;
-/* L170: */
- }
-/* End of dummy loop; this point is never reached */
-L190:
- i__2 = *n;
- for (kk = up; kk <= i__2; ++kk) {
- i__ = q[kk];
- d__[i__] = -1.;
- l[i__] = 0;
-/* L191: */
- }
- i__2 = up - 1;
- for (kk = low; kk <= i__2; ++kk) {
- i__ = q[kk];
- d__[i__] = -1.;
-/* L192: */
- }
- i__2 = qlen;
- for (kk = 1; kk <= i__2; ++kk) {
- i__ = q[kk];
- d__[i__] = -1.;
- l[i__] = 0;
-/* L193: */
- }
-L100:
- ;
- }
-/* End of main loop */
-/* BV is bottleneck value of final matching */
- if (*num == *n) {
- goto L1000;
- }
-/* Matrix is structurally singular, complete IPERM. */
-/* JPERM, PR are work arrays */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- jperm[j] = 0;
-/* L300: */
- }
- k = 0;
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- if (iperm[i__] == 0) {
- ++k;
- pr[k] = i__;
- } else {
- j = iperm[i__];
- jperm[j] = i__;
- }
-/* L310: */
- }
- k = 0;
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- if (jperm[i__] != 0) {
- goto L320;
- }
- ++k;
- jdum = pr[k];
- iperm[jdum] = i__;
-L320:
- ;
- }
-L1000:
- return 0;
-} /* mc64bd_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64dd_dist(int_t *i__, int_t *n, int_t *q, double
- *d__, int_t *l, int_t *iway)
-{
- /* System generated locals */
- int_t i__1;
-
- /* Local variables */
- double di;
- int_t qk, pos, idum, posk;
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* Variables N,Q,D,L are described in MC64B/BD */
-/* IF IWAY is equal to 1, then */
-/* node I is pushed from its current position upwards */
-/* IF IWAY is not equal to 1, then */
-/* node I is pushed from its current position downwards */
-/* Local variables and parameters */
- /* Parameter adjustments */
- --l;
- --d__;
- --q;
-
- /* Function Body */
- di = d__[*i__];
- pos = l[*i__];
-/* POS is index of current position of I in the tree */
- if (*iway == 1) {
- i__1 = *n;
- for (idum = 1; idum <= i__1; ++idum) {
- if (pos <= 1) {
- goto L20;
- }
- posk = pos / 2;
- qk = q[posk];
- if (di <= d__[qk]) {
- goto L20;
- }
- q[pos] = qk;
- l[qk] = pos;
- pos = posk;
-/* L10: */
- }
-/* End of dummy loop; this point is never reached */
- } else {
- i__1 = *n;
- for (idum = 1; idum <= i__1; ++idum) {
- if (pos <= 1) {
- goto L20;
- }
- posk = pos / 2;
- qk = q[posk];
- if (di >= d__[qk]) {
- goto L20;
- }
- q[pos] = qk;
- l[qk] = pos;
- pos = posk;
-/* L15: */
- }
-/* End of dummy loop; this point is never reached */
- }
-/* End of dummy if; this point is never reached */
-L20:
- q[pos] = *i__;
- l[*i__] = pos;
- return 0;
-} /* mc64dd_dist */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64ed_dist(int_t *qlen, int_t *n, int_t *q,
- double *d__, int_t *l, int_t *iway)
-{
- /* System generated locals */
- int_t i__1;
-
- /* Local variables */
- int_t i__;
- double di, dk, dr;
- int_t pos, idum, posk;
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */
-/* MC64W/WD (IWAY = 2) */
-/* The root node is deleted from the binary heap. */
-/* Local variables and parameters */
-/* Move last element to begin of Q */
- /* Parameter adjustments */
- --l;
- --d__;
- --q;
-
- /* Function Body */
- i__ = q[*qlen];
- di = d__[i__];
- --(*qlen);
- pos = 1;
- if (*iway == 1) {
- i__1 = *n;
- for (idum = 1; idum <= i__1; ++idum) {
- posk = pos << 1;
- if (posk > *qlen) {
- goto L20;
- }
- dk = d__[q[posk]];
- if (posk < *qlen) {
- dr = d__[q[posk + 1]];
- if (dk < dr) {
- ++posk;
- dk = dr;
- }
- }
- if (di >= dk) {
- goto L20;
- }
-/* Exchange old last element with larger priority child */
- q[pos] = q[posk];
- l[q[pos]] = pos;
- pos = posk;
-/* L10: */
- }
-/* End of dummy loop; this point is never reached */
- } else {
- i__1 = *n;
- for (idum = 1; idum <= i__1; ++idum) {
- posk = pos << 1;
- if (posk > *qlen) {
- goto L20;
- }
- dk = d__[q[posk]];
- if (posk < *qlen) {
- dr = d__[q[posk + 1]];
- if (dk > dr) {
- ++posk;
- dk = dr;
- }
- }
- if (di <= dk) {
- goto L20;
- }
-/* Exchange old last element with smaller child */
- q[pos] = q[posk];
- l[q[pos]] = pos;
- pos = posk;
-/* L15: */
- }
-/* End of dummy loop; this point is never reached */
- }
-/* End of dummy if; this point is never reached */
-L20:
- q[pos] = i__;
- l[i__] = pos;
- return 0;
-} /* mc64ed_dist */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64fd_dist(int_t *pos0, int_t *qlen, int_t *n,
- int_t *q, double *d__, int_t *l, int_t *iway)
-{
- /* System generated locals */
- int_t i__1;
-
- /* Local variables */
- int_t i__;
- double di, dk, dr;
- int_t qk, pos, idum, posk;
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */
-/* MC64WD (IWAY = 2). */
-/* Move last element in the heap */
-/* Quick return, if possible */
- /* Parameter adjustments */
- --l;
- --d__;
- --q;
-
- /* Function Body */
- if (*qlen == *pos0) {
- --(*qlen);
- return 0;
- }
-/* Move last element from queue Q to position POS0 */
-/* POS is current position of node I in the tree */
- i__ = q[*qlen];
- di = d__[i__];
- --(*qlen);
- pos = *pos0;
- if (*iway == 1) {
- i__1 = *n;
- for (idum = 1; idum <= i__1; ++idum) {
- if (pos <= 1) {
- goto L20;
- }
- posk = pos / 2;
- qk = q[posk];
- if (di <= d__[qk]) {
- goto L20;
- }
- q[pos] = qk;
- l[qk] = pos;
- pos = posk;
-/* L10: */
- }
-/* End of dummy loop; this point is never reached */
-L20:
- q[pos] = i__;
- l[i__] = pos;
- i__1 = *n;
- for (idum = 1; idum <= i__1; ++idum) {
- posk = pos << 1;
- if (posk > *qlen) {
- goto L40;
- }
- dk = d__[q[posk]];
- if (posk < *qlen) {
- dr = d__[q[posk + 1]];
- if (dk < dr) {
- ++posk;
- dk = dr;
- }
- }
- if (di >= dk) {
- goto L40;
- }
- qk = q[posk];
- q[pos] = qk;
- l[qk] = pos;
- pos = posk;
-/* L30: */
- }
-/* End of dummy loop; this point is never reached */
- } else {
- i__1 = *n;
- for (idum = 1; idum <= i__1; ++idum) {
- if (pos <= 1) {
- goto L34;
- }
- posk = pos / 2;
- qk = q[posk];
- if (di >= d__[qk]) {
- goto L34;
- }
- q[pos] = qk;
- l[qk] = pos;
- pos = posk;
-/* L32: */
- }
-/* End of dummy loop; this point is never reached */
-L34:
- q[pos] = i__;
- l[i__] = pos;
- i__1 = *n;
- for (idum = 1; idum <= i__1; ++idum) {
- posk = pos << 1;
- if (posk > *qlen) {
- goto L40;
- }
- dk = d__[q[posk]];
- if (posk < *qlen) {
- dr = d__[q[posk + 1]];
- if (dk > dr) {
- ++posk;
- dk = dr;
- }
- }
- if (di <= dk) {
- goto L40;
- }
- qk = q[posk];
- q[pos] = qk;
- l[qk] = pos;
- pos = posk;
-/* L36: */
- }
-/* End of dummy loop; this point is never reached */
- }
-/* End of dummy if; this point is never reached */
-L40:
- q[pos] = i__;
- l[i__] = pos;
- return 0;
-} /* mc64fd_dist */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64rd_dist(int_t *n, int_t *ne, int_t *ip,
- int_t *irn, double *a)
-{
- /* System generated locals */
- int_t i__1, i__2, i__3;
-
- /* Local variables */
- int_t j, k, r__, s;
- double ha;
- int_t hi, td, mid, len, ipj;
- double key;
- int_t last, todo[50], first;
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* This subroutine sorts the entries in each column of the */
-/* sparse matrix (defined by N,NE,IP,IRN,A) by decreasing */
-/* numerical value. */
-/* Local constants */
-/* Local variables */
-/* Local arrays */
- /* Parameter adjustments */
- --ip;
- --a;
- --irn;
-
- /* Function Body */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- len = ip[j + 1] - ip[j];
- if (len <= 1) {
- goto L100;
- }
- ipj = ip[j];
-/* Sort array roughly with partial quicksort */
- if (len < 15) {
- goto L400;
- }
- todo[0] = ipj;
- todo[1] = ipj + len;
- td = 2;
-L500:
- first = todo[td - 2];
- last = todo[td - 1];
-/* KEY is the smallest of two values present in interval [FIRST,LAST) */
- key = a[(first + last) / 2];
- i__2 = last - 1;
- for (k = first; k <= i__2; ++k) {
- ha = a[k];
- if (ha == key) {
- goto L475;
- }
- if (ha > key) {
- goto L470;
- }
- key = ha;
- goto L470;
-L475:
- ;
- }
-/* Only one value found in interval, so it is already sorted */
- td += -2;
- goto L425;
-/* Reorder interval [FIRST,LAST) such that entries before MID are gt KEY */
-L470:
- mid = first;
- i__2 = last - 1;
- for (k = first; k <= i__2; ++k) {
- if (a[k] <= key) {
- goto L450;
- }
- ha = a[mid];
- a[mid] = a[k];
- a[k] = ha;
- hi = irn[mid];
- irn[mid] = irn[k];
- irn[k] = hi;
- ++mid;
-L450:
- ;
- }
-/* Both subintervals [FIRST,MID), [MID,LAST) are nonempty */
-/* Stack the longest of the two subintervals first */
- if (mid - first >= last - mid) {
- todo[td + 1] = last;
- todo[td] = mid;
- todo[td - 1] = mid;
-/* TODO(TD-1) = FIRST */
- } else {
- todo[td + 1] = mid;
- todo[td] = first;
- todo[td - 1] = last;
- todo[td - 2] = mid;
- }
- td += 2;
-L425:
- if (td == 0) {
- goto L400;
- }
-/* There is still work to be done */
- if (todo[td - 1] - todo[td - 2] >= 15) {
- goto L500;
- }
-/* Next interval is already short enough for straightforward insertion */
- td += -2;
- goto L425;
-/* Complete sorting with straightforward insertion */
-L400:
- i__2 = ipj + len - 1;
- for (r__ = ipj + 1; r__ <= i__2; ++r__) {
- if (a[r__ - 1] < a[r__]) {
- ha = a[r__];
- hi = irn[r__];
- a[r__] = a[r__ - 1];
- irn[r__] = irn[r__ - 1];
- i__3 = ipj + 1;
- for (s = r__ - 1; s >= i__3; --s) {
- if (a[s - 1] < ha) {
- a[s] = a[s - 1];
- irn[s] = irn[s - 1];
- } else {
- a[s] = ha;
- irn[s] = hi;
- goto L200;
- }
-/* L300: */
- }
- a[ipj] = ha;
- irn[ipj] = hi;
- }
-L200:
- ;
- }
-L100:
- ;
- }
- return 0;
-} /* mc64rd_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64sd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
- irn, double *a, int_t *iperm, int_t *numx, int_t *w,
- int_t *len, int_t *lenl, int_t *lenh, int_t *fc, int_t *iw,
- int_t *iw4)
-{
- /* System generated locals */
- int_t i__1, i__2, i__3, i__4;
-
- /* Local variables */
- int_t i__, j, k, l, ii, mod, cnt, num;
- double bval, bmin, bmax, rinf;
- int_t nval, wlen, idum1, idum2, idum3;
- extern /* Subroutine */ int_t mc64qd_dist(int_t *, int_t *, int_t *,
- int_t *, int_t *, double *, int_t *, double *),
- mc64ud_dist(int_t *, int_t *, int_t *, int_t *, int_t *,
- int_t *, int_t *, int_t *, int_t *, int_t *, int_t *,
- int_t *, int_t *, int_t *, int_t *);
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* N, NE, IP, IRN, are described in MC64A/AD. */
-/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
-/* A(K), K=1..NE, must be set to the value of the entry that */
-/* corresponds to IRN(k). The entries in each column must be */
-/* non-negative and ordered by decreasing value. */
-/* IPERM is an INT_T array of length N. On exit, it contains the */
-/* bottleneck matching: IPERM(I) - 0 or row I is matched to column */
-/* IPERM(I). */
-/* NUMX is an INT_T variable. On exit, it contains the cardinality */
-/* of the matching stored in IPERM. */
-/* IW is an INT_T work array of length 10N. */
-/* FC is an int_t array of length N that contains the list of */
-/* unmatched columns. */
-/* LEN(J), LENL(J), LENH(J) are int_t arrays of length N that point */
-/* to entries in matrix column J. */
-/* In the matrix defined by the column parts IP(J)+LENL(J) we know */
-/* a matching does not exist; in the matrix defined by the column */
-/* parts IP(J)+LENH(J) we know one exists. */
-/* LEN(J) lies between LENL(J) and LENH(J) and determines the matrix */
-/* that is tested for a maximum matching. */
-/* W is an int_t array of length N and contains the indices of the */
-/* columns for which LENL ne LENH. */
-/* WLEN is number of indices stored in array W. */
-/* IW is int_t work array of length N. */
-/* IW4 is int_t work array of length 4N used by MC64U/UD. */
-/* EXTERNAL FD05AD,MC64QD,MC64UD */
-/* DOUBLE PRECISION FD05AD */
-/* BMIN and BMAX are such that a maximum matching exists for the input */
-/* matrix in which all entries smaller than BMIN are dropped. */
-/* For BMAX, a maximum matching does not exist. */
-/* BVAL is a value between BMIN and BMAX. */
-/* CNT is the number of calls made to MC64U/UD so far. */
-/* NUM is the cardinality of last matching found. */
-/* Set RINF to largest positive real number */
-/* XSL RINF = FD05AD(5) */
- /* Parameter adjustments */
- --iw4;
- --iw;
- --fc;
- --lenh;
- --lenl;
- --len;
- --w;
- --iperm;
- --ip;
- --a;
- --irn;
-
- /* Function Body */
- rinf = dmach_dist("Overflow");
-/* Compute a first maximum matching from scratch on whole matrix. */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- fc[j] = j;
- iw[j] = 0;
- len[j] = ip[j + 1] - ip[j];
-/* L20: */
- }
-/* The first call to MC64U/UD */
- cnt = 1;
- mod = 1;
- *numx = 0;
- mc64ud_dist(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1],
- numx, n, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1],
- &iw4[*n * 3 + 1]);
-/* IW contains a maximum matching of length NUMX. */
- num = *numx;
- if (num != *n) {
-/* Matrix is structurally singular */
- bmax = rinf;
- } else {
-/* Matrix is structurally nonsingular, NUM=NUMX=N; */
-/* Set BMAX just above the smallest of all the maximum absolute */
-/* values of the columns */
- bmax = rinf;
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- bval = 0.f;
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- if (a[k] > bval) {
- bval = a[k];
- }
-/* L25: */
- }
- if (bval < bmax) {
- bmax = bval;
- }
-/* L30: */
- }
- bmax *= 1.001f;
- }
-/* Initialize BVAL,BMIN */
- bval = 0.f;
- bmin = 0.f;
-/* Initialize LENL,LEN,LENH,W,WLEN according to BMAX. */
-/* Set LEN(J), LENH(J) just after last entry in column J. */
-/* Set LENL(J) just after last entry in column J with value ge BMAX. */
- wlen = 0;
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- l = ip[j + 1] - ip[j];
- lenh[j] = l;
- len[j] = l;
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- if (a[k] < bmax) {
- goto L46;
- }
-/* L45: */
- }
-/* Column J is empty or all entries are ge BMAX */
- k = ip[j + 1];
-L46:
- lenl[j] = k - ip[j];
-/* Add J to W if LENL(J) ne LENH(J) */
- if (lenl[j] == l) {
- goto L48;
- }
- ++wlen;
- w[wlen] = j;
-L48:
- ;
- }
-/* Main loop */
- i__1 = *ne;
- for (idum1 = 1; idum1 <= i__1; ++idum1) {
- if (num == *numx) {
-/* We have a maximum matching in IW; store IW in IPERM */
- i__2 = *n;
- for (i__ = 1; i__ <= i__2; ++i__) {
- iperm[i__] = iw[i__];
-/* L50: */
- }
-/* Keep going round this loop until matching IW is no longer maximum. */
- i__2 = *ne;
- for (idum2 = 1; idum2 <= i__2; ++idum2) {
- bmin = bval;
- if (bmax == bmin) {
- goto L99;
- }
-/* Find splitting value BVAL */
- mc64qd_dist(&ip[1], &lenl[1], &len[1], &w[1], &wlen,
- &a[1], &nval, &bval);
- if (nval <= 1) {
- goto L99;
- }
-/* Set LEN such that all matrix entries with value lt BVAL are */
-/* discarded. Store old LEN in LENH. Do this for all columns W(K). */
-/* Each step, either K is incremented or WLEN is decremented. */
- k = 1;
- i__3 = *n;
- for (idum3 = 1; idum3 <= i__3; ++idum3) {
- if (k > wlen) {
- goto L71;
- }
- j = w[k];
- i__4 = ip[j] + lenl[j];
- for (ii = ip[j] + len[j] - 1; ii >= i__4; --ii) {
- if (a[ii] >= bval) {
- goto L60;
- }
- i__ = irn[ii];
- if (iw[i__] != j) {
- goto L55;
- }
-/* Remove entry from matching */
- iw[i__] = 0;
- --num;
- fc[*n - num] = j;
-L55:
- ;
- }
-L60:
- lenh[j] = len[j];
-/* IP(J)+LEN(J)-1 is last entry in column ge BVAL */
- len[j] = ii - ip[j] + 1;
-/* If LENH(J) = LENL(J), remove J from W */
- if (lenl[j] == lenh[j]) {
- w[k] = w[wlen];
- --wlen;
- } else {
- ++k;
- }
-/* L70: */
- }
-L71:
- if (num < *numx) {
- goto L81;
- }
-/* L80: */
- }
-/* End of dummy loop; this point is never reached */
-/* Set mode for next call to MC64U/UD */
-L81:
- mod = 1;
- } else {
-/* We do not have a maximum matching in IW. */
- bmax = bval;
-/* BMIN is the bottleneck value of a maximum matching; */
-/* for BMAX the matching is not maximum, so BMAX>BMIN */
-/* IF (BMAX .EQ. BMIN) GO TO 99 */
-/* Find splitting value BVAL */
- mc64qd_dist(&ip[1], &len[1], &lenh[1], &w[1], &wlen, &a[1],
- &nval, &bval);
- if (nval == 0 || bval == bmin) {
- goto L99;
- }
-/* Set LEN such that all matrix entries with value ge BVAL are */
-/* inside matrix. Store old LEN in LENL. Do this for all columns W(K). */
-/* Each step, either K is incremented or WLEN is decremented. */
- k = 1;
- i__2 = *n;
- for (idum3 = 1; idum3 <= i__2; ++idum3) {
- if (k > wlen) {
- goto L88;
- }
- j = w[k];
- i__3 = ip[j] + lenh[j] - 1;
- for (ii = ip[j] + len[j]; ii <= i__3; ++ii) {
- if (a[ii] < bval) {
- goto L86;
- }
-/* L85: */
- }
-L86:
- lenl[j] = len[j];
- len[j] = ii - ip[j];
- if (lenl[j] == lenh[j]) {
- w[k] = w[wlen];
- --wlen;
- } else {
- ++k;
- }
-/* L87: */
- }
-/* End of dummy loop; this point is never reached */
-/* Set mode for next call to MC64U/UD */
-L88:
- mod = 0;
- }
- ++cnt;
- mc64ud_dist(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1],
- &iw[1], &num, numx, &iw4[1], &iw4[*n + 1],
- &iw4[(*n << 1) + 1], &iw4[*n * 3 + 1]);
-/* IW contains maximum matching of length NUM */
-/* L90: */
- }
-/* End of dummy loop; this point is never reached */
-/* BMIN is bottleneck value of final matching */
-L99:
- if (*numx == *n) {
- goto L1000;
- }
-/* The matrix is structurally singular, complete IPERM */
-/* W, IW are work arrays */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- w[j] = 0;
-/* L300: */
- }
- k = 0;
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- if (iperm[i__] == 0) {
- ++k;
- iw[k] = i__;
- } else {
- j = iperm[i__];
- w[j] = i__;
- }
-/* L310: */
- }
- k = 0;
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- if (w[j] != 0) {
- goto L320;
- }
- ++k;
- idum1 = iw[k];
- iperm[idum1] = j;
-L320:
- ;
- }
-L1000:
- return 0;
-} /* mc64sd_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64qd_dist(int_t *ip, int_t *lenl, int_t *lenh,
- int_t *w, int_t *wlen, double *a, int_t *nval, double *val)
-{
- /* System generated locals */
- int_t i__1, i__2, i__3;
-
- /* Local variables */
- int_t j, k, s;
- double ha;
- int_t ii, pos;
- double split[10];
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* This routine searches for at most XX different numerical values */
-/* in the columns W(1:WLEN). XX>=2. */
-/* Each column J is scanned between IP(J)+LENL(J) and IP(J)+LENH(J)-1 */
-/* until XX values are found or all columns have been considered. */
-/* On output, NVAL is the number of different values that is found */
-/* and SPLIT(1:NVAL) contains the values in decreasing order. */
-/* If NVAL > 0, the routine returns VAL = SPLIT((NVAL+1)/2). */
-
-/* Scan columns in W(1:WLEN). For each encountered value, if value not */
-/* already present in SPLIT(1:NVAL), insert value such that SPLIT */
-/* remains sorted by decreasing value. */
-/* The sorting is done by straightforward insertion; therefore the use */
-/* of this routine should be avoided for large XX (XX < 20). */
- /* Parameter adjustments */
- --a;
- --w;
- --lenh;
- --lenl;
- --ip;
-
- /* Function Body */
- *nval = 0;
- i__1 = *wlen;
- for (k = 1; k <= i__1; ++k) {
- j = w[k];
- i__2 = ip[j] + lenh[j] - 1;
- for (ii = ip[j] + lenl[j]; ii <= i__2; ++ii) {
- ha = a[ii];
- if (*nval == 0) {
- split[0] = ha;
- *nval = 1;
- } else {
-/* Check presence of HA in SPLIT */
- for (s = *nval; s >= 1; --s) {
- if (split[s - 1] == ha) {
- goto L15;
- }
- if (split[s - 1] > ha) {
- pos = s + 1;
- goto L21;
- }
-/* L20: */
- }
- pos = 1;
-/* The insertion */
-L21:
- i__3 = pos;
- for (s = *nval; s >= i__3; --s) {
- split[s] = split[s - 1];
-/* L22: */
- }
- split[pos - 1] = ha;
- ++(*nval);
- }
-/* Exit loop if XX values are found */
- if (*nval == 10) {
- goto L11;
- }
-L15:
- ;
- }
-/* L10: */
- }
-/* Determine VAL */
-L11:
- if (*nval > 0) {
- *val = split[(*nval + 1) / 2 - 1];
- }
- return 0;
-} /* mc64qd_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64ud_dist(int_t *id, int_t *mod, int_t *n, int_t *
- irn, int_t *lirn, int_t *ip, int_t *lenc, int_t *fc, int_t *
- iperm, int_t *num, int_t *numx, int_t *pr, int_t *arp,
- int_t *cv, int_t *out)
-{
- /* System generated locals */
- int_t i__1, i__2, i__3, i__4;
-
- /* Local variables */
- int_t i__, j, k, j1, ii, kk, id0, id1, in1, in2, nfc, num0, num1, num2,
- jord, last;
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* PR(J) is the previous column to J in the depth first search. */
-/* Array PR is used as workspace in the sorting algorithm. */
-/* Elements (I,IPERM(I)) I=1,..,N are entries at the end of the */
-/* algorithm unless N assignments have not been made in which case */
-/* N-NUM pairs (I,IPERM(I)) will not be entries in the matrix. */
-/* CV(I) is the most recent loop number (ID+JORD) at which row I */
-/* was visited. */
-/* ARP(J) is the number of entries in column J which have been scanned */
-/* when looking for a cheap assignment. */
-/* OUT(J) is one less than the number of entries in column J which have */
-/* not been scanned during one pass through the main loop. */
-/* NUMX is maximum possible size of matching. */
- /* Parameter adjustments */
- --out;
- --cv;
- --arp;
- --pr;
- --iperm;
- --fc;
- --lenc;
- --ip;
- --irn;
-
- /* Function Body */
- if (*id == 1) {
-/* The first call to MC64U/UD. */
-/* Initialize CV and ARP; parameters MOD, NUMX are not accessed */
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- cv[i__] = 0;
- arp[i__] = 0;
-/* L5: */
- }
- num1 = *n;
- num2 = *n;
- } else {
-/* Not the first call to MC64U/UD. */
-/* Re-initialize ARP if entries were deleted since last call to MC64U/UD */
- if (*mod == 1) {
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- arp[i__] = 0;
-/* L8: */
- }
- }
- num1 = *numx;
- num2 = *n - *numx;
- }
- num0 = *num;
-/* NUM0 is size of input matching */
-/* NUM1 is maximum possible size of matching */
-/* NUM2 is maximum allowed number of unassigned rows/columns */
-/* NUM is size of current matching */
-/* Quick return if possible */
-/* IF (NUM.EQ.N) GO TO 199 */
-/* NFC is number of rows/columns that could not be assigned */
- nfc = 0;
-/* Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD, */
-/* so 1st call uses 1..N, 2nd call uses N+1..2N, etc */
- id0 = (*id - 1) * *n;
-/* Main loop. Each pass round this loop either results in a new */
-/* assignment or gives a column with no assignment */
- i__1 = *n;
- for (jord = num0 + 1; jord <= i__1; ++jord) {
-/* Each pass uses unique number ID1 */
- id1 = id0 + jord;
-/* J is unmatched column */
- j = fc[jord - num0];
- pr[j] = -1;
- i__2 = jord;
- for (k = 1; k <= i__2; ++k) {
-/* Look for a cheap assignment */
- if (arp[j] >= lenc[j]) {
- goto L30;
- }
- in1 = ip[j] + arp[j];
- in2 = ip[j] + lenc[j] - 1;
- i__3 = in2;
- for (ii = in1; ii <= i__3; ++ii) {
- i__ = irn[ii];
- if (iperm[i__] == 0) {
- goto L80;
- }
-/* L20: */
- }
-/* No cheap assignment in row */
- arp[j] = lenc[j];
-/* Begin looking for assignment chain starting with row J */
-L30:
- out[j] = lenc[j] - 1;
-/* Inner loop. Extends chain by one or backtracks */
- i__3 = jord;
- for (kk = 1; kk <= i__3; ++kk) {
- in1 = out[j];
- if (in1 < 0) {
- goto L50;
- }
- in2 = ip[j] + lenc[j] - 1;
- in1 = in2 - in1;
-/* Forward scan */
- i__4 = in2;
- for (ii = in1; ii <= i__4; ++ii) {
- i__ = irn[ii];
- if (cv[i__] == id1) {
- goto L40;
- }
-/* Column J has not yet been accessed during this pass */
- j1 = j;
- j = iperm[i__];
- cv[i__] = id1;
- pr[j] = j1;
- out[j1] = in2 - ii - 1;
- goto L70;
-L40:
- ;
- }
-/* Backtracking step. */
-L50:
- j1 = pr[j];
- if (j1 == -1) {
-/* No augmenting path exists for column J. */
- ++nfc;
- fc[nfc] = j;
- if (nfc > num2) {
-/* A matching of maximum size NUM1 is not possible */
- last = jord;
- goto L101;
- }
- goto L100;
- }
- j = j1;
-/* L60: */
- }
-/* End of dummy loop; this point is never reached */
-L70:
- ;
- }
-/* End of dummy loop; this point is never reached */
-/* New assignment is made. */
-L80:
- iperm[i__] = j;
- arp[j] = ii - ip[j] + 1;
- ++(*num);
- i__2 = jord;
- for (k = 1; k <= i__2; ++k) {
- j = pr[j];
- if (j == -1) {
- goto L95;
- }
- ii = ip[j] + lenc[j] - out[j] - 2;
- i__ = irn[ii];
- iperm[i__] = j;
-/* L90: */
- }
-/* End of dummy loop; this point is never reached */
-L95:
- if (*num == num1) {
-/* A matching of maximum size NUM1 is found */
- last = jord;
- goto L101;
- }
-
-L100:
- ;
- }
-/* All unassigned columns have been considered */
- last = *n;
-/* Now, a transversal is computed or is not possible. */
-/* Complete FC before returning. */
-L101:
- i__1 = *n;
- for (jord = last + 1; jord <= i__1; ++jord) {
- ++nfc;
- fc[nfc] = fc[jord - num0];
-/* L110: */
- }
-/* 199 RETURN */
- return 0;
-} /* mc64ud_ */
-
-/* ********************************************************************** */
-/* Subroutine */ int_t mc64wd_dist(int_t *n, int_t *ne, int_t *ip, int_t *
- irn, double *a, int_t *iperm, int_t *num, int_t *jperm,
- int_t *out, int_t *pr, int_t *q, int_t *l, double *u,
- double *d__)
-{
- /* System generated locals */
- int_t i__1, i__2, i__3;
-
- /* Local variables */
- int_t i__, j, k, i0, k0, k1, k2, q0;
- double di;
- int_t ii, jj, kk;
- double vj;
- int_t up;
- double dq0;
- int_t kk1, kk2;
- double csp;
- int_t isp, jsp, low;
- double dmin__, dnew;
- int_t jord, qlen, jdum;
- double rinf;
- extern /* Subroutine */ int_t mc64dd_dist(int_t *, int_t *, int_t *,
- double *, int_t *, int_t *), mc64ed_dist(int_t *, int_t *,
- int_t *, double *, int_t *, int_t *), mc64fd_dist(int_t *
- , int_t *, int_t *, int_t *, double *, int_t *,
- int_t *);
-
-
-/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
-/* Research Councils *** */
-/* *** Although every effort has been made to ensure robustness and *** */
-/* *** reliability of the subroutines in this MC64 suite, we *** */
-/* *** disclaim any liability arising through the use or misuse of *** */
-/* *** any of the subroutines. *** */
-/* *** Any problems? Contact ... */
-/* Iain Duff (I.Duff at rl.ac.uk) or Jacko Koster (jak at ii.uib.no) *** */
-
-/* N, NE, IP, IRN are described in MC64A/AD. */
-/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
-/* A(K), K=1..NE, must be set to the value of the entry that */
-/* corresponds to IRN(K). It is not altered. */
-/* All values A(K) must be non-negative. */
-/* IPERM is an INT_T array of length N. On exit, it contains the */
-/* weighted matching: IPERM(I) = 0 or row I is matched to column */
-/* IPERM(I). */
-/* NUM is an INT_T variable. On exit, it contains the cardinality of */
-/* the matching stored in IPERM. */
-/* IW is an INT_T work array of length 5N. */
-/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length 2N. */
-/* On exit, U = D(1:N) contains the dual row variable and */
-/* V = D(N+1:2N) contains the dual column variable. If the matrix */
-/* is structurally nonsingular (NUM = N), the following holds: */
-/* U(I)+V(J) <= A(I,J) if IPERM(I) |= J */
-/* U(I)+V(J) = A(I,J) if IPERM(I) = J */
-/* U(I) = 0 if IPERM(I) = 0 */
-/* V(J) = 0 if there is no I for which IPERM(I) = J */
-/* Local variables */
-/* Local parameters */
-/* External subroutines and/or functions */
-/* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD */
-/* DOUBLE PRECISION FD05AD */
-/* Set RINF to largest positive real number */
-/* XSL RINF = FD05AD(5) */
- /* Parameter adjustments */
- --d__;
- --u;
- --l;
- --q;
- --pr;
- --out;
- --jperm;
- --iperm;
- --ip;
- --a;
- --irn;
-
- /* Function Body */
- rinf = dmach_dist("Overflow");
-/* Initialization */
- *num = 0;
- i__1 = *n;
- for (k = 1; k <= i__1; ++k) {
- u[k] = rinf;
- d__[k] = 0.;
- iperm[k] = 0;
- jperm[k] = 0;
- pr[k] = ip[k];
- l[k] = 0;
-/* L10: */
- }
-/* Initialize U(I) */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- i__ = irn[k];
- if (a[k] > u[i__]) {
- goto L20;
- }
- u[i__] = a[k];
- iperm[i__] = j;
- l[i__] = k;
-L20:
- ;
- }
-/* L30: */
- }
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- j = iperm[i__];
- if (j == 0) {
- goto L40;
- }
-/* Row I is not empty */
- iperm[i__] = 0;
- if (jperm[j] != 0) {
- goto L40;
- }
-/* Assignment of column J to row I */
- ++(*num);
- iperm[i__] = j;
- jperm[j] = l[i__];
-L40:
- ;
- }
- if (*num == *n) {
- goto L1000;
- }
-/* Scan unassigned columns; improve assignment */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
-/* JPERM(J) ne 0 iff column J is already assigned */
- if (jperm[j] != 0) {
- goto L95;
- }
- k1 = ip[j];
- k2 = ip[j + 1] - 1;
-/* Continue only if column J is not empty */
- if (k1 > k2) {
- goto L95;
- }
- vj = rinf;
- i__2 = k2;
- for (k = k1; k <= i__2; ++k) {
- i__ = irn[k];
- di = a[k] - u[i__];
- if (di > vj) {
- goto L50;
- }
- if (di < vj || di == rinf) {
- goto L55;
- }
- if (iperm[i__] != 0 || iperm[i0] == 0) {
- goto L50;
- }
-L55:
- vj = di;
- i0 = i__;
- k0 = k;
-L50:
- ;
- }
- d__[j] = vj;
- k = k0;
- i__ = i0;
- if (iperm[i__] == 0) {
- goto L90;
- }
- i__2 = k2;
- for (k = k0; k <= i__2; ++k) {
- i__ = irn[k];
- if (a[k] - u[i__] > vj) {
- goto L60;
- }
- jj = iperm[i__];
-/* Scan remaining part of assigned column JJ */
- kk1 = pr[jj];
- kk2 = ip[jj + 1] - 1;
- if (kk1 > kk2) {
- goto L60;
- }
- i__3 = kk2;
- for (kk = kk1; kk <= i__3; ++kk) {
- ii = irn[kk];
- if (iperm[ii] > 0) {
- goto L70;
- }
- if (a[kk] - u[ii] <= d__[jj]) {
- goto L80;
- }
-L70:
- ;
- }
- pr[jj] = kk2 + 1;
-L60:
- ;
- }
- goto L95;
-L80:
- jperm[jj] = kk;
- iperm[ii] = jj;
- pr[jj] = kk + 1;
-L90:
- ++(*num);
- jperm[j] = k;
- iperm[i__] = j;
- pr[j] = k + 1;
-L95:
- ;
- }
- if (*num == *n) {
- goto L1000;
- }
-/* Prepare for main loop */
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- d__[i__] = rinf;
- l[i__] = 0;
-/* L99: */
- }
-/* Main loop ... each pass round this loop is similar to Dijkstra's */
-/* algorithm for solving the single source shortest path problem */
- i__1 = *n;
- for (jord = 1; jord <= i__1; ++jord) {
- if (jperm[jord] != 0) {
- goto L100;
- }
-/* JORD is next unmatched column */
-/* DMIN is the length of shortest path in the tree */
- dmin__ = rinf;
- qlen = 0;
- low = *n + 1;
- up = *n + 1;
-/* CSP is the cost of the shortest augmenting path to unassigned row */
-/* IRN(ISP). The corresponding column index is JSP. */
- csp = rinf;
-/* Build shortest path tree starting from unassigned column (root) JORD */
- j = jord;
- pr[j] = -1;
-/* Scan column J */
- i__2 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__2; ++k) {
- i__ = irn[k];
- dnew = a[k] - u[i__];
- if (dnew >= csp) {
- goto L115;
- }
- if (iperm[i__] == 0) {
- csp = dnew;
- isp = k;
- jsp = j;
- } else {
- if (dnew < dmin__) {
- dmin__ = dnew;
- }
- d__[i__] = dnew;
- ++qlen;
- q[qlen] = k;
- }
-L115:
- ;
- }
-/* Initialize heap Q and Q2 with rows held in Q(1:QLEN) */
- q0 = qlen;
- qlen = 0;
- i__2 = q0;
- for (kk = 1; kk <= i__2; ++kk) {
- k = q[kk];
- i__ = irn[k];
- if (csp <= d__[i__]) {
- d__[i__] = rinf;
- goto L120;
- }
- if (d__[i__] <= dmin__) {
- --low;
- q[low] = i__;
- l[i__] = low;
- } else {
- ++qlen;
- l[i__] = qlen;
- mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__2);
- }
-/* Update tree */
- jj = iperm[i__];
- out[jj] = k;
- pr[jj] = j;
-L120:
- ;
- }
- i__2 = *num;
- for (jdum = 1; jdum <= i__2; ++jdum) {
-/* If Q2 is empty, extract rows from Q */
- if (low == up) {
- if (qlen == 0) {
- goto L160;
- }
- i__ = q[1];
- if (d__[i__] >= csp) {
- goto L160;
- }
- dmin__ = d__[i__];
-L152:
- mc64ed_dist(&qlen, n, &q[1], &d__[1], &l[1], &c__2);
- --low;
- q[low] = i__;
- l[i__] = low;
- if (qlen == 0) {
- goto L153;
- }
- i__ = q[1];
- if (d__[i__] > dmin__) {
- goto L153;
- }
- goto L152;
- }
-/* Q0 is row whose distance D(Q0) to the root is smallest */
-L153:
- q0 = q[up - 1];
- dq0 = d__[q0];
-/* Exit loop if path to Q0 is longer than the shortest augmenting path */
- if (dq0 >= csp) {
- goto L160;
- }
- --up;
-/* Scan column that matches with row Q0 */
- j = iperm[q0];
- vj = dq0 - a[jperm[j]] + u[q0];
- i__3 = ip[j + 1] - 1;
- for (k = ip[j]; k <= i__3; ++k) {
- i__ = irn[k];
- if (l[i__] >= up) {
- goto L155;
- }
-/* DNEW is new cost */
- dnew = vj + a[k] - u[i__];
-/* Do not update D(I) if DNEW ge cost of shortest path */
- if (dnew >= csp) {
- goto L155;
- }
- if (iperm[i__] == 0) {
-/* Row I is unmatched; update shortest path info */
- csp = dnew;
- isp = k;
- jsp = j;
- } else {
-/* Row I is matched; do not update D(I) if DNEW is larger */
- di = d__[i__];
- if (di <= dnew) {
- goto L155;
- }
- if (l[i__] >= low) {
- goto L155;
- }
- d__[i__] = dnew;
- if (dnew <= dmin__) {
- if (l[i__] != 0) {
- mc64fd_dist(&l[i__], &qlen, n, &q[1], &d__[1], &l[1],
- &c__2);
- }
- --low;
- q[low] = i__;
- l[i__] = low;
- } else {
- if (l[i__] == 0) {
- ++qlen;
- l[i__] = qlen;
- }
- mc64dd_dist(&i__, n, &q[1], &d__[1], &l[1], &c__2);
- }
-/* Update tree */
- jj = iperm[i__];
- out[jj] = k;
- pr[jj] = j;
- }
-L155:
- ;
- }
-/* L150: */
- }
-/* If CSP = RINF, no augmenting path is found */
-L160:
- if (csp == rinf) {
- goto L190;
- }
-/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */
- ++(*num);
- i__ = irn[isp];
- iperm[i__] = jsp;
- jperm[jsp] = isp;
- j = jsp;
- i__2 = *num;
- for (jdum = 1; jdum <= i__2; ++jdum) {
- jj = pr[j];
- if (jj == -1) {
- goto L180;
- }
- k = out[j];
- i__ = irn[k];
- iperm[i__] = jj;
- jperm[jj] = k;
- j = jj;
-/* L170: */
- }
-/* End of dummy loop; this point is never reached */
-/* Update U for rows in Q(UP:N) */
-L180:
- i__2 = *n;
- for (kk = up; kk <= i__2; ++kk) {
- i__ = q[kk];
- u[i__] = u[i__] + d__[i__] - csp;
-/* L185: */
- }
-L190:
- i__2 = *n;
- for (kk = low; kk <= i__2; ++kk) {
- i__ = q[kk];
- d__[i__] = rinf;
- l[i__] = 0;
-/* L191: */
- }
- i__2 = qlen;
- for (kk = 1; kk <= i__2; ++kk) {
- i__ = q[kk];
- d__[i__] = rinf;
- l[i__] = 0;
-/* L193: */
- }
-L100:
- ;
- }
-/* End of main loop */
-/* Set dual column variable in D(1:N) */
-L1000:
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- k = jperm[j];
- if (k != 0) {
- d__[j] = a[k] - u[irn[k]];
- } else {
- d__[j] = 0.;
- }
- if (iperm[j] == 0) {
- u[j] = 0.;
- }
-/* L200: */
- }
- if (*num == *n) {
- goto L1100;
- }
-/* The matrix is structurally singular, complete IPERM. */
-/* JPERM, OUT are work arrays */
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- jperm[j] = 0;
-/* L300: */
- }
- k = 0;
- i__1 = *n;
- for (i__ = 1; i__ <= i__1; ++i__) {
- if (iperm[i__] == 0) {
- ++k;
- out[k] = i__;
- } else {
- j = iperm[i__];
- jperm[j] = i__;
- }
-/* L310: */
- }
- k = 0;
- i__1 = *n;
- for (j = 1; j <= i__1; ++j) {
- if (jperm[j] != 0) {
- goto L320;
- }
- ++k;
- jdum = out[k];
- iperm[jdum] = j;
-L320:
- ;
- }
-L1100:
- return 0;
-} /* mc64wd_ */
-
-
diff --git a/SRC/memory.c b/SRC/memory.c
index fd54862..4846242 100644
--- a/SRC/memory.c
+++ b/SRC/memory.c
@@ -12,9 +12,12 @@ at the top-level directory.
* \brief Memory utilities
*
* <pre>
- * -- Distributed SuperLU routine (version 1.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* September 1, 1999
+ *
+ * Modified:
+ * September 30, 2017, add aligned malloc for Intel
* </pre>
*/
@@ -112,17 +115,31 @@ void superlu_free_dist(void *addr)
#else /* The production mode. */
-void *superlu_malloc_dist(size_t size)
-{
+#if defined (__INTEL_COMPILER)
+#include <immintrin.h>
+void * superlu_malloc_dist(size_t size) {
+ void* ptr;
+ int alignment = 1<<12; // align at 4K page
+ if (size > 1<<19 ) { alignment=1<<21; }
+ return (_mm_malloc(size, alignment));
+}
+void superlu_free_dist(void * ptr) { _mm_free(ptr); }
+
+// #elif (_POSIX_C_SOURCE>=200112L)
+//
+// void * MALLOC(size_t size) {void* ptr;int alignment=1<<12;if(size>1<<19){alignment=1<<21;}posix_memalign( (void**)&(ptr), alignment, size );return(ptr);}
+//void FREE(void * ptr) {free(ptr);}
+
+#else // normal malloc/free
+
+void *superlu_malloc_dist(size_t size) {
void *buf;
buf = (void *) malloc(size);
return (buf);
}
+void superlu_free_dist(void *addr) { free (addr); }
-void superlu_free_dist(void *addr)
-{
- free (addr);
-}
+#endif
#endif /* End debug malloc/free. */
diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c
index dc1bff5..080cd60 100644
--- a/SRC/pdgssvx.c
+++ b/SRC/pdgssvx.c
@@ -650,8 +650,10 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
}
/* ------------------------------------------------------------
- Diagonal scaling to equilibrate the matrix. (simple scheme)
- ------------------------------------------------------------*/
+ * Diagonal scaling to equilibrate the matrix. (simple scheme)
+ * for row i = 1:n, A(i,:) <- A(i,:) / max(abs(A(i,:));
+ * for column j = 1:n, A(:,j) <- A(:, j) / max(abs(A(:,j))
+ * ------------------------------------------------------------*/
if ( Equil ) {
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter equil");
@@ -727,7 +729,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
#if ( PRNTlevel>=1 )
if ( !iam ) {
printf(".. equilibrated? *equed = %c\n", *equed);
- /*fflush(stdout);*/
+ fflush(stdout);
}
#endif
} /* end if Fact ... */
@@ -896,8 +898,10 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
t = SuperLU_timer_() - t;
stat->utime[ROWPERM] = t;
#if ( PRNTlevel>=1 )
- if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n",
- job, t);
+ if ( !iam ) {
+ printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+ fflush(stdout);
+ }
#endif
} /* end if Fact ... */
@@ -916,7 +920,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
else *(unsigned char *)norm = 'I';
anorm = pdlangs(norm, A, grid);
#if ( PRNTlevel>=1 )
- if ( !iam ) printf(".. anorm %e\n", anorm);
+ if ( !iam ) { printf(".. anorm %e\n", anorm); fflush(stdout); }
#endif
}
@@ -1020,9 +1024,11 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
/* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
the nonzero data structures for L & U. */
#if ( PRNTlevel>=1 )
- if ( !iam )
- printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
+ if ( !iam ) {
+ printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+ fflush(stdout);
+ }
#endif
t = SuperLU_timer_();
if ( !(Glu_freeable = (Glu_freeable_t *)
@@ -1048,6 +1054,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
symb_mem_usage.for_lu*1e-6,
symb_mem_usage.total*1e-6,
symb_mem_usage.expansions);
+ fflush(stdout);
}
#endif
} else { /* symbfact out of memory */
@@ -1216,6 +1223,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
avg / grid->nprow / grid->npcol * 1e-6,
max * 1e-6);
printf("**************************************************\n");
+ fflush(stdout);
}
} /* end printing stats */
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index 00aaeba..98bdd7e 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -14,7 +14,7 @@ at the top-level directory.
* \brief Performs LU factorization in parallel
*
* <pre>
- * -- Distributed SuperLU routine (version 4.3) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 1, 2014
*
@@ -25,7 +25,8 @@ at the top-level directory.
* July 12, 2011 static scheduling and arbitrary look-ahead
* March 13, 2013 change NTAGS to MPI_TAG_UB value
* September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
- * December 31, 2015 rename xMACH to xMACH_DIST
+ * December 31, 2015 rename xMACH to xMACH_DIST.
+ * September 30, 2017 optimization for Intel Knights Landing (KNL) node .
*
* Sketch of the algorithm
*
@@ -139,6 +140,14 @@ at the top-level directory.
*/
#define PHI_FRAMEWORK
+#if 0
+#define CACHELINE 64 /* bytes, Xeon Phi KNL */
+#else
+#define CACHELINE 0 /* not worry about false sharing of different threads */
+#endif
+//#define GEMM_PADLEN 1
+#define GEMM_PADLEN 8
+
#define PDGSTRF2 pdgstrf2_trsm
#define PDGSTRS2 pdgstrs2_omp
@@ -275,7 +284,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
int_t *iuip, *ruip; /* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
double *ucol;
int *indirect, *indirect2;
- double *tempv, *tempv2d;
+ int_t *tempi;
+ double *tempu, *tempv, *tempr;
+ /* double *tempv2d, *tempU2d; Sherry */
int iinfo;
int *ToRecv, *ToSendD, **ToSendR;
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
@@ -283,8 +294,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
superlu_scope_t *scp;
float s_eps;
double thresh;
- double *tempU2d, *tempu;
- int full, ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
+ /*int full;*/
+ int ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows,
*Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l,
*nnodes_u, *edag_supno_l, *recvbuf, **edag_supno;
@@ -298,10 +309,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
* 2 : transferred in Usub_buf[]
* 3 : transferred in Uval_buf[]
*/
- int **msgcnts, **msgcntsU; /* counts for each panel in the
- look-ahead window */
- int *factored; /* factored[j]==0 : L col panel j is factorized */
- int *factoredU; /* factoredU[i]==1 : U row panel i is factorized */
+ int **msgcnts, **msgcntsU; /* counts in the look-ahead window */
+ int *factored; /* factored[j] == 0 : L col panel j is factorized. */
+ int *factoredU; /* factoredU[i] == 1 : U row panel i is factorized. */
int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows;
etree_node *head, *tail, *ptr;
int *num_child;
@@ -314,16 +324,19 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
void *attr_val;
int flag;
+ /* The following variables are used to pad GEMM dimensions so that
+ each is a multiple of vector length (8 doubles for KNL) */
+ int gemm_m_pad = GEMM_PADLEN, gemm_k_pad = GEMM_PADLEN,
+ gemm_n_pad = GEMM_PADLEN;
+ int gemm_padding = 0;
+
int iword = sizeof (int_t);
int dword = sizeof (double);
- /* For measuring load imbalence in omp threads*/
+ /* For measuring load imbalence in omp threads */
double omp_load_imblc = 0.0;
double *omp_loop_time;
- double CPUOffloadTimer = 0;
- double CPUOffloadFlop = 0;
- double CPUOffloadMop = 0;
double schur_flop_timer = 0.0;
double pdgstrf2_timer = 0.0;
double pdgstrs2_timer = 0.0;
@@ -331,8 +344,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
double InitTimer = 0.0; /* including compute schedule, malloc */
double tt_start, tt_end;
-#if !defined( GPU_ACC )
- /* Counter for couting memory operations */
+/* #if !defined( GPU_ACC ) */
+ /* Counters for memory operations and timings */
double scatter_mem_op_counter = 0.0;
double scatter_mem_op_timer = 0.0;
double scatterL_mem_op_counter = 0.0;
@@ -340,6 +353,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
double scatterU_mem_op_counter = 0.0;
double scatterU_mem_op_timer = 0.0;
+ /* Counters for flops/gather/scatter and timings */
double GatherLTimer = 0.0;
double LookAheadRowSepMOP = 0.0;
double GatherUTimer = 0.0;
@@ -349,10 +363,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
double LookAheadScatterTimer = 0.0;
double LookAheadScatterMOP = 0.0;
double RemainGEMMTimer = 0.0;
+ double RemainGEMM_flops = 0.0;
double RemainScatterTimer = 0.0;
double NetSchurUpTimer = 0.0;
double schur_flop_counter = 0.0;
-#endif
+/* #endif */
#if ( PRNTlevel>= 1)
/* count GEMM max dimensions */
@@ -368,6 +383,15 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
double t1, t2;
float msg_vol = 0, msg_cnt = 0;
+ double comm_wait_time = 0.0;
+ /* Record GEMM dimensions and times */
+ FILE *fopen(), *fgemm;
+ int gemm_count = 0;
+ typedef struct {
+ int m, n, k;
+ double microseconds;
+ } gemm_profile;
+ gemm_profile *gemm_stats;
#endif
/* Test the input parameters. */
@@ -383,6 +407,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
/* Quick return if possible. */
if (m == 0 || n == 0) return 0;
+
+ double tt1 = SuperLU_timer_ ();
/*
* Initialization.
@@ -405,8 +431,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
int tag_ub = *(int *) attr_val;
#if ( PRNTlevel>=1 )
- if (!iam)
- printf ("MPI tag upper bound = %d\n", tag_ub);
+ if (!iam) {
+ printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout);
+ }
#endif
#if ( DEBUGlevel>=1 )
@@ -414,6 +441,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
printf (" ***** warning s_eps = %e *****\n", s_eps);
CHECK_MALLOC (iam, "Enter pdgstrf()");
#endif
+#if (PROFlevel >= 1 )
+ gemm_stats = (gemm_profile *) SUPERLU_MALLOC(nsupers * sizeof(gemm_profile));
+ if (iam == 0) fgemm = fopen("dgemm_mnk.dat", "w");
+ int *prof_sendR = intCalloc_dist(nsupers);
+#endif
stat->ops[FACT] = 0.0;
stat->current_buffer = 0.0;
@@ -435,29 +467,37 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (i != 0) {
if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) )
ABORT ("Malloc fails for Lsub_buf.");
+ tempi = Llu->Lsub_buf_2[0];
for (jj = 0; jj < num_look_aheads; jj++)
- Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
+ Llu->Lsub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */
+ //Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
}
i = Llu->bufmax[1];
if (i != 0) {
if (!(Llu->Lval_buf_2[0] = doubleMalloc_dist ((num_look_aheads + 1) * ((size_t) i))))
ABORT ("Malloc fails for Lval_buf[].");
+ tempr = Llu->Lval_buf_2[0];
for (jj = 0; jj < num_look_aheads; jj++)
- Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
+ Llu->Lval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */
+ //Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
}
i = Llu->bufmax[2];
if (i != 0) {
if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i)))
ABORT ("Malloc fails for Usub_buf_2[].");
+ tempi = Llu->Usub_buf_2[0];
for (jj = 0; jj < num_look_aheads; jj++)
- Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
+ Llu->Usub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */
+ //Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
}
i = Llu->bufmax[3];
if (i != 0) {
if (!(Llu->Uval_buf_2[0] = doubleMalloc_dist ((num_look_aheads + 1) * i)))
ABORT ("Malloc fails for Uval_buf_2[].");
+ tempr = Llu->Uval_buf_2[0];
for (jj = 0; jj < num_look_aheads; jj++)
- Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
+ Llu->Uval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */
+ //Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
}
}
@@ -519,15 +559,16 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
ABORT ("Malloc fails for factoredU[].");
for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1;
+
log_memory(2 * nsupers * iword, stat);
int num_threads = 1;
#ifdef _OPENMP
#pragma omp parallel default(shared)
+ #pragma omp master
{
- if (omp_get_thread_num () == 0) {
- num_threads = omp_get_num_threads ();
- }
+ //if (omp_get_thread_num () == 0)
+ num_threads = omp_get_num_threads ();
}
#endif
@@ -538,9 +579,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#endif
#if ( PRNTlevel>=1 )
- if(!iam) printf(".. Starting with %d OpenMP threads \n", num_threads );
+ if(!iam) {
+ printf(".. Starting with %d OpenMP threads \n", num_threads );
+ fflush(stdout);
+ }
#endif
- double tt1 = SuperLU_timer_ ();
nblocks = 0;
ncb = nsupers / Pc; /* number of column blocks, horizontal */
@@ -556,10 +599,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
full_u_cols = SUPERLU_MALLOC(ncb * sizeof(int));
blk_ldu = SUPERLU_MALLOC(ncb * sizeof(int));
#endif
- log_memory(2 * ncb * iword, stat);
-
- /* insert a check condition here */
+ log_memory(2 * ncb * iword, stat);
#if 0 /* Sherry: not used? */
/* This bunch is used for static scheduling */
@@ -595,11 +636,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int));
look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int));
- for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1;
+ for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */
log_memory(3 * nsupers * iword, stat);
- /* go through U-factor */
- for (lb = 0; lb < nrb; ++lb) {
+ /* Sherry: omp parallel?
+ not worth doing, due to concurrent write to look_ahead_l[jb] */
+ for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */
ib = lb * Pr + myrow;
index = Llu->Ufstnz_br_ptr[lb];
if (index) { /* Not an empty row */
@@ -613,7 +655,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
}
}
}
- if (myrow < nsupers % grid->nprow) {
+ if (myrow < nsupers % grid->nprow) { /* leftover block rows */
ib = nrb * Pr + myrow;
index = Llu->Ufstnz_br_ptr[nrb];
if (index) { /* Not an empty row */
@@ -629,8 +671,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
}
if (options->SymPattern == NO) {
- /* go through L-factor */
- for (lb = 0; lb < ncb; lb++) {
+ /* Sherry: omp parallel?
+ not worth doing, due to concurrent write to look_ahead_l[jb] */
+ for (lb = 0; lb < ncb; lb++) { /* go through L-factor */
ib = lb * Pc + mycol;
index = Llu->Lrowind_bc_ptr[lb];
if (index) {
@@ -644,7 +687,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
}
}
}
- if (mycol < nsupers % grid->npcol) {
+ if (mycol < nsupers % grid->npcol) { /* leftover block columns */
ib = ncb * Pc + mycol;
index = Llu->Lrowind_bc_ptr[ncb];
if (index) {
@@ -678,8 +721,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
/* Instead of half storage, we'll do full storage */
if (!(Llu->ujrow = doubleCalloc_dist (k * k)))
ABORT ("Malloc fails for ujrow[].");
- log_memory(k * k * iword, stat);
#endif
+ log_memory(k * k * iword, stat);
#if ( PRNTlevel>=1 )
if (!iam) {
@@ -690,6 +733,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
(long int) Llu->bufmax[0], (long int) Llu->bufmax[1],
(long int) Llu->bufmax[2], (long int) Llu->bufmax[3],
(long int) Llu->bufmax[4]);
+ fflush(stdout);
}
#endif
@@ -704,26 +748,30 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
ldt = sp_ienv_dist (3); /* Size of maximum supernode */
k = CEILING (nsupers, Pr); /* Number of local block rows */
- /* Following circuit is for finding maximum block size */
+ /* Following code is for finding maximum row dimension of all L panels */
int local_max_row_size = 0;
int max_row_size;
- for (int i = 0; i < nsupers; ++i) {
- int tpc = PCOL (i, grid);
- if (mycol == tpc) {
- lk = LBj (i, grid);
- lsub = Lrowind_bc_ptr[lk];
- if (lsub != NULL) {
- local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
- }
- }
+#if 0
+#if defined _OPENMP // Sherry: parallel reduction -- seems slower?
+#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub)
+#endif
+#endif
+ for (int i = mycol; i < nsupers; i += Pc) { /* grab my local columns */
+ //int tpc = PCOL (i, grid);
+ lk = LBj (i, grid);
+ lsub = Lrowind_bc_ptr[lk];
+ if (lsub != NULL) {
+ if (lsub[1] > local_max_row_size) local_max_row_size = lsub[1];
+ }
}
- /* Max row size is global reduction of within A row */
- MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, (grid->rscp.comm));
+ /* Max row size is global reduction within a row */
+ MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+ (grid->rscp.comm));
- /* Buffer size is max of look ahead window */
+ /* Buffer size is max of look-ahead window */
/* int_t buffer_size =
SUPERLU_MAX (max_row_size * num_threads * ldt,
get_max_buffer_size ()); */
@@ -758,15 +806,24 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
Glu_persist, grid, perm_u );
#endif
+ /* +16 to avoid cache line false sharing */
+ int_t bigv_size = SUPERLU_MAX(max_row_size * (bigu_size / ldt),
+ (ldt*ldt + CACHELINE / dword) * num_threads);
+
/* bigU and bigV are either on CPU or on GPU, not both. */
double* bigU; /* for storing entire U(k,:) panel, prepare for GEMM.
- bigU has the same size either on CPU or on CPU. */
- double* bigV; /* for GEMM output matrix, i.e. update matrix.
- On CPU, bigV is small for block-by-block update.
- On GPU, bigV is large to hold the aggregate GEMM output.*/
+ bigU has the same size either on CPU or on CPU. */
+ double* bigV; /* for storing GEMM output matrix, i.e. update matrix.
+ bigV is large to hold the aggregate GEMM output.*/
#if ( PRNTlevel>=1 )
- if(!iam) printf("[%d] .. BIG U bigu_size " IFMT " (same either on CPU or GPU)\n", iam, bigu_size);
+ if(!iam) {
+ printf("max_nrows in L panel %d\n", max_row_size);
+ printf("\t.. GEMM buffer size: max_nrows X max_ncols = %d x %d\n",
+ max_row_size, (bigu_size / ldt));
+ printf(".. BIG U size %d\t BIG V size %d\n", bigu_size, bigv_size);
+ fflush(stdout);
+ }
#endif
#ifdef GPU_ACC
@@ -774,7 +831,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if ( checkCuda(cudaHostAlloc((void**)&bigU, bigu_size * sizeof(double), cudaHostAllocDefault)) )
ABORT("Malloc fails for dgemm buffer U ");
- int bigv_size = buffer_size;
+ bigv_size = buffer_size;
#if ( PRNTlevel>=1 )
if (!iam) printf("[%d] .. BIG V bigv_size %d, using buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size);
#endif
@@ -830,18 +887,24 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3)
+ bigu_size + buffer_size ) * dword;
-#else /* not to use GPU */
+#else /* not CUDA */
+ // for GEMM padding 0
+ j = bigu_size / ldt;
+ bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad));
+ bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
+
+#ifdef __INTEL_COMPILER
+ bigU = _mm_malloc(bigu_size * sizeof(double), 1<<12); // align at 4K page
+ bigV = _mm_malloc(bigv_size * sizeof(double), 1<<12);
+#else
if ( !(bigU = doubleMalloc_dist(bigu_size)) )
- ABORT ("Malloc fails for dgemm u buff U");
+ ABORT ("Malloc fails for dgemm U buffer");
//Maximum size of bigU= sqrt(buffsize) ?
-
- int bigv_size = 8 * ldt * ldt * num_threads;
-#if ( PRNTlevel>=1 )
- if (!iam) printf("[%d] .. BIG V size (on CPU) %d\n", iam, bigv_size);
-#endif
+ // int bigv_size = 8 * ldt * ldt * num_threads;
if ( !(bigV = doubleMalloc_dist(bigv_size)) )
- ABORT ("Malloc failed for dgemm buffer V");
+ ABORT ("Malloc failed for dgemm V buffer");
+#endif
#endif /* end ifdef GPU_ACC */
@@ -853,21 +916,27 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if(!iam) {
printf (" Max row size is %d \n", max_row_size);
printf (" Threads per process %d \n", num_threads);
- /* printf (" Using buffer_size of %d \n", buffer_size); */
+ fflush(stdout);
}
+
#endif
+#if 0 /* Sherry */
if (!(tempv2d = doubleCalloc_dist (2 * ((size_t) ldt) * ldt)))
ABORT ("Calloc fails for tempv2d[].");
tempU2d = tempv2d + ldt * ldt;
- if (!(indirect = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+#endif
+ /* Sherry: (ldt + 16), avoid cache line false sharing.
+ KNL cacheline size = 64 bytes = 16 int */
+ iinfo = ldt + CACHELINE / sizeof(int);
+ if (!(indirect = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int))))
ABORT ("Malloc fails for indirect[].");
- if (!(indirect2 = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+ if (!(indirect2 = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int))))
ABORT ("Malloc fails for indirect[].");
if (!(iuip = intMalloc_dist (k))) ABORT ("Malloc fails for iuip[].");
if (!(ruip = intMalloc_dist (k))) ABORT ("Malloc fails for ruip[].");
- log_memory(2 * ldt *ldt * dword + 2 * ldt * num_threads * iword
+ log_memory(2 * ldt*ldt * dword + 2 * iinfo * num_threads * iword
+ 2 * k * iword, stat);
int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib,
@@ -897,13 +966,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#else
Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t));
#endif
- log_memory(4 * mrb * iword + mrb * sizeof(Remain_info_t), stat);
- double *lookAhead_L_buff, *Remain_L_buff;
+ double *lookAhead_L_buff, *Remain_L_buff; /* Stores entire L-panel */
Ublock_info_t *Ublock_info;
- ldt = sp_ienv_dist (3); /* max supernode size */
+ ldt = sp_ienv_dist (3); /* max supernode size */
+ /* The following is quite loose */
lookAhead_L_buff = doubleMalloc_dist(ldt*ldt* (num_look_aheads+1) );
- log_memory(ldt * ldt * (num_look_aheads+1) * dword, stat);
#if 0
Remain_L_buff = (double *) _mm_malloc( sizeof(double)*(Llu->bufmax[1]),64);
@@ -912,13 +980,18 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64);
int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64);
#else
- Remain_L_buff = doubleMalloc_dist(Llu->bufmax[1]);
+ j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad);
+ Remain_L_buff = doubleMalloc_dist(Llu->bufmax[1] + j); /* This is loose */
Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t));
int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
#endif
- log_memory(Llu->bufmax[1] * dword, stat);
+
+ long long alloc_mem = 4 * mrb * iword + mrb * sizeof(Remain_info_t)
+ + ldt * ldt * (num_look_aheads+1) * dword
+ + Llu->bufmax[1] * dword ;
+ log_memory(alloc_mem, stat);
InitTimer = SuperLU_timer_() - tt1;
@@ -928,7 +1001,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
** Handle first block column separately to start the pipeline. **
################################################################## */
look_id = 0;
- msgcnt = msgcnts[0]; /* First count in the window */
+ msgcnt = msgcnts[0]; /* Lsub[0] to be transferred */
send_req = send_reqs[0];
recv_req = recv_reqs[0];
@@ -952,7 +1025,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
if (lsub) {
+ /* number of entries in Lsub_buf[] to be transferred */
msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+ /* number of entries in Lval_buf[] to be transferred */
msgcnt[1] = lsub[1] * SuperSize (k);
} else {
msgcnt[0] = msgcnt[1] = 0;
@@ -964,9 +1039,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
TIC (t1);
#endif
- MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, SLU_MPI_TAG (0, 0) /* 0 */ ,
+ MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+ SLU_MPI_TAG (0, 0) /* 0 */,
scp->comm, &send_req[pj]);
- MPI_Isend (lusup, msgcnt[1], MPI_DOUBLE, pj, SLU_MPI_TAG (1, 0) /* 1 */ ,
+ MPI_Isend (lusup, msgcnt[1], MPI_DOUBLE, pj,
+ SLU_MPI_TAG (1, 0) /* 1 */,
scp->comm, &send_req[pj + Pc]);
#if ( DEBUGlevel>=2 )
printf ("[%d] first block cloumn Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
@@ -976,6 +1053,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+ ++prof_sendR[lk];
msg_cnt += 2;
msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
#endif
@@ -984,12 +1063,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
} else { /* Post immediate receives. */
if (ToRecv[k] >= 1) { /* Recv block column L(:,0). */
scp = &grid->rscp; /* The scope of process row. */
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol,
SLU_MPI_TAG (0, 0) /* 0 */ ,
scp->comm, &recv_req[0]);
MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, kcol,
SLU_MPI_TAG (1, 0) /* 1 */ ,
scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
}
} /* end if mycol == 0 */
@@ -1001,12 +1088,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
scp = &grid->cscp; /* The scope of process column. */
Usub_buf = Llu->Usub_buf_2[0];
Uval_buf = Llu->Uval_buf_2[0];
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
SLU_MPI_TAG (2, 0) /* 2%tag_ub */ ,
scp->comm, &recv_reqs_u[0][0]);
MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
SLU_MPI_TAG (3, 0) /* 3%tag_ub */ ,
scp->comm, &recv_reqs_u[0][1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
+#endif
}
}
@@ -1034,7 +1129,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
kk = perm_c_supno[kk0]; /* use the ordering from static schedule */
look_id = kk0 % (1 + num_look_aheads); /* which column in window */
- if (look_ahead[kk] < k0) { /* does not depend on current column */
+ if (look_ahead[kk] < k0) { /* does not depend on current column k */
kcol = PCOL (kk, grid);
if (mycol == kcol) { /* I own this panel */
@@ -1053,7 +1148,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
msgcnt = msgcnts[look_id]; /* point to the proper count array */
send_req = send_reqs[look_id];
- lk = LBj (kk, grid); /* Local block number in L */
+ lk = LBj (kk, grid); /* Local block number in L. */
lsub1 = Lrowind_bc_ptr[lk];
if (lsub1) {
msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */
@@ -1066,12 +1161,21 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
for (pj = 0; pj < Pc; ++pj) {
if (ToSendR[lk][pj] != EMPTY) {
lusup1 = Lnzval_bc_ptr[lk];
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
scp->comm, &send_req[pj]);
MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+ ++prof_sendR[lk];
+#endif
#if ( DEBUGlevel>=2 )
printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n",
iam, kk, msgcnt[0], msgcnt[1], pj);
@@ -1084,7 +1188,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (ToRecv[kk] >= 1) {
scp = &grid->rscp; /* The scope of process row. */
recv_req = recv_reqs[look_id];
-
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
scp->comm, &recv_req[0]);
@@ -1092,29 +1198,41 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
MPI_DOUBLE, kcol,
SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
}
/* stat->time10 += SuperLU_timer_() - ttt1; */
} /* end if mycol == Pc(kk) */
- } /* end if look-ahead in L supernodes */
+ } /* end if look-ahead in L panels */
- /* post irecv for U-row look-ahead */
+ /* Pre-post irecv for U-row look-ahead */
krow = PROW (kk, grid);
if (myrow != krow) {
if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */
scp = &grid->cscp; /* The scope of process column. */
Usub_buf = Llu->Usub_buf_2[look_id];
Uval_buf = Llu->Uval_buf_2[look_id];
-
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ ,
scp->comm, &recv_reqs_u[look_id][0]);
MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ ,
scp->comm, &recv_reqs_u[look_id][1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
+#endif
}
}
- } /* end for each column in look-ahead window for L supernodes */
+ } /* end for each column in look-ahead window for L panels */
/* stat->time4 += SuperLU_timer_()-tt1; */
@@ -1126,6 +1244,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
for (kk0 = kk1; kk0 < kk2; kk0++) {
kk = perm_c_supno[kk0]; /* order determined from static schedule */
if (factoredU[kk0] != 1 && look_ahead[kk] < k0) {
+ /* does not depend on current column k */
kcol = PCOL (kk, grid);
krow = PROW (kk, grid);
lk = LBj (kk, grid); /* Local block number across row. NOT USED?? -- Sherry */
@@ -1146,6 +1265,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
} else { /* Check to receive L(:,kk) from the left */
flag0 = flag1 = 0;
if ( ToRecv[kk] >= 1 ) {
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
if ( recv_req[0] != MPI_REQUEST_NULL ) {
MPI_Test (&recv_req[0], &flag0, &status);
if ( flag0 ) {
@@ -1161,7 +1283,14 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
recv_req[1] = MPI_REQUEST_NULL;
}
} else flag1 = 1;
- } else msgcnt[0] = 0;
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
+ } else {
+ msgcnt[0] = 0;
+ }
}
if (flag0 && flag1) { /* L(:,kk) is ready */
@@ -1171,10 +1300,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
factoredU[kk0] = 1;
/* Parallel triangular solve across process row *krow* --
U(k,j) = L(k,k) \ A(k,j). */
- /* double ttt2 = SuperLU_timer_(); */
double ttt2 = SuperLU_timer_();
#ifdef _OPENMP
-#pragma omp parallel
+/* #pragma omp parallel */ /* Sherry -- parallel done inside pdgstrs2 */
#endif
{
PDGSTRS2 (kk0, kk, Glu_persist, grid, Llu,
@@ -1226,7 +1354,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
/* stat->time2 += SuperLU_timer_()-tt1; */
} /* end if myrow == krow */
- } /* end if flag0 ... */
+ } /* end if flag0 & flag1 ... */
} /* end if factoredU[] ... */
} /* end for kk0 ... */
@@ -1248,13 +1376,21 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (mycol == kcol) {
lk = LBj (k, grid); /* Local block number in L */
+#if ( PROFlevel>=1 )
+ TIC(t1);
+#endif
for (pj = 0; pj < Pc; ++pj) {
- /* Wait for Isend to complete before using lsub/lusup buffer */
+ /* Wait for Isend to complete before using lsub/lusup buffer. */
if (ToSendR[lk][pj] != EMPTY) {
MPI_Wait (&send_req[pj], &status);
MPI_Wait (&send_req[pj + Pc], &status);
}
}
+#if ( PROFlevel>=1 )
+ TOC(t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
} else {
@@ -1265,8 +1401,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
/* ============================================= *
* Waiting for L(:,kk) for outer-product uptate *
* if iam in U(kk,:), then the diagonal block *
- * did not reach in time for panel factorization *
- * of U(k,:) *
+ * did not reach in time for panel factorization *
+ * of U(k,:). *
* ============================================= */
#if ( PROFlevel>=1 )
TIC (t1);
@@ -1298,6 +1434,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
#endif
#if ( DEBUGlevel>=2 )
printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n",
@@ -1315,7 +1452,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
lsub = Lsub_buf_2[look_id];
lusup = Lval_buf_2[look_id];
- } /* if mycol = Pc(k) */
+ } /* else if mycol = Pc(k) */
/* stat->time1 += SuperLU_timer_()-tt1; */
scp = &grid->cscp; /* The scope of process column. */
@@ -1331,7 +1468,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
U(k,j) = L(k,k) \ A(k,j). */
double ttt2 = SuperLU_timer_();
#ifdef _OPENMP
-#pragma omp parallel
+/* #pragma omp parallel */ /* Sherry -- parallel done inside pdgstrs2 */
#endif
{
PDGSTRS2 (k0, k, Glu_persist, grid, Llu, stat);
@@ -1350,7 +1487,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (ToSendD[lk] == YES) {
for (pi = 0; pi < Pr; ++pi) {
- if (pi != myrow) {
+ if (pi != myrow) { /* Matching recv was pre-posted before */
#if ( PROFlevel>=1 )
TIC (t1);
#endif
@@ -1363,6 +1500,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
msg_cnt += 2;
msg_vol += msgcnt[2] * iword + msgcnt[3] * dword;
#endif
@@ -1373,20 +1511,28 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
} /* for pi ... */
} /* if ToSendD ... */
- } else { /* Panel U(k,:) already factorized */
+ } else { /* Panel U(k,:) already factorized from previous look-ahead */
/* ================================================ *
- * Wait for downward sending of U(k,:) to complete *
- * for outer-product update *
- * =============================================== */
+ * Wait for downward sending of U(k,:) to complete *
+ * for outer-product update. *
+ * ================================================ */
if (ToSendD[lk] == YES) {
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
for (pi = 0; pi < Pr; ++pi) {
if (pi != myrow) {
MPI_Wait (&send_reqs_u[look_id][pi], &status);
MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status);
}
}
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
+#endif
}
msgcnt[2] = msgcntsU[look_id][2];
msgcnt[3] = msgcntsU[look_id][3];
@@ -1395,9 +1541,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
} else { /* myrow != krow */
- /* ========================================= *
- * wait for U(k,:) for outer-product updates *
- * ========================================= */
+ /* ========================================== *
+ * Wait for U(k,:) for outer-product updates. *
+ * ========================================== */
if (ToRecv[k] == 2) { /* Recv block row U(k,:). */
#if ( PROFlevel>=1 )
@@ -1411,6 +1557,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
#endif
usub = Usub_buf;
uval = Uval_buf;
@@ -1484,8 +1631,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
j = jj0 = 0;
/************************************************************************/
+#if 0
+ for (jj = 0; jj < nub; ++jj) assert(perm_u[jj] == jj); /* Sherry */
+#endif
double ttx =SuperLU_timer_();
+//#include "dlook_ahead_update_v4.c"
#include "dlook_ahead_update.c"
lookaheadupdatetimer += SuperLU_timer_() - ttx;
@@ -1512,6 +1663,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
look_id = kk0 % (1 + num_look_aheads);
recv_req = recv_reqs[look_id];
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
scp->comm, &recv_req[0]);
@@ -1519,6 +1673,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
MPI_DOUBLE, kcol,
SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
}
} else {
lk = LBj (kk, grid); /* Local block number. */
@@ -1551,15 +1710,24 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
scp = &grid->rscp; /* The scope of process row. */
for (pj = 0; pj < Pc; ++pj) {
if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
scp->comm, &send_req[pj]);
MPI_Isend (lusup1, msgcnt[1], MPI_DOUBLE, pj,
SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+ ++prof_sendR[lk];
+#endif
}
- }
- } /* for pj ... */
+ } /* end for pj ... */
+ } /* if factored[kk] ... */
}
}
}
@@ -1575,6 +1743,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#else
/*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/
+//#include "dSchCompUdt-2Ddynamic_v6.c"
+
#include "dSchCompUdt-2Ddynamic.c"
#endif
@@ -1584,7 +1754,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
NetSchurUpTimer += SuperLU_timer_() - tsch;
- } /* for k0 = 0, ... */
+ } /* MAIN LOOP for k0 = 0, ... */
/* ##################################################################
** END MAIN LOOP: for k0 = ...
@@ -1592,12 +1762,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer;
- /* updating total flops */
#if ( PRNTlevel>=1 )
+ /* Print detailed statistics */
+ /* Updating total flops */
+ double allflops;
+ MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM,
+ 0, grid->comm);
if ( iam==0 ) {
printf("\nInitialization time\t%8.2lf seconds\n"
"\t Serial: compute static schedule, allocate storage\n", InitTimer);
- printf("\n---- Time breakdown in factorization ----\n");
+ printf("\n==== Time breakdown in factorization (rank 0) ====\n");
+ printf("Panel factorization \t %8.2lf seconds\n",
+ pdgstrf2_timer + pdgstrs2_timer);
+ printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer);
+ printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer);
printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer);
printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer);
printf(".. Time to Gather L buffer\t %8.2lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer);
@@ -1606,21 +1784,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
printf(".. Time in GEMM %8.2lf \n",
LookAheadGEMMTimer + RemainGEMMTimer);
printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer);
- printf("\t* Remain\t %8.2lf \n", RemainGEMMTimer);
-
+ printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n",
+ RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9);
printf(".. Time to Scatter %8.2lf \n",
LookAheadScatterTimer + RemainScatterTimer);
printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer);
printf("\t* Remain\t %8.2lf \n", RemainScatterTimer);
- printf("Total Time in Factorization \t: %8.2lf seconds, \n", pxgstrfTimer);
- printf("Total time in Schur update with offload\t %8.2lf seconds,\n",CPUOffloadTimer );
+ printf("Total factorization time \t: %8.2lf seconds, \n", pxgstrfTimer);
printf("--------\n");
printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n);
}
#endif
-#if ( DEBUGlevel>=2 )
+#if ( DEBUGlevel>=3 )
for (i = 0; i < Pr * Pc; ++i) {
if (iam == i) {
dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
@@ -1632,8 +1809,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
}
#endif
- // printf("Debug : MPI buffers 1\n");
-
/********************************************************
* Free memory *
********************************************************/
@@ -1673,7 +1848,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SUPERLU_FREE (factored);
log_memory(-(6 * nsupers * iword), stat);
-
for (i = 0; i <= num_look_aheads; i++) {
SUPERLU_FREE (msgcnts[i]);
SUPERLU_FREE (msgcntsU[i]);
@@ -1693,8 +1867,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SUPERLU_FREE (recv_reqs);
SUPERLU_FREE (send_reqs);
- // printf("Debug : MPI buffers 3\n");
-
#ifdef GPU_ACC
checkCuda (cudaFreeHost (bigV));
checkCuda (cudaFreeHost (bigU));
@@ -1705,15 +1877,19 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SUPERLU_FREE( streams );
SUPERLU_FREE( stream_end_col );
#else
+ #ifdef __INTEL_COMPILER
+ _mm_free (bigU);
+ _mm_free (bigV);
+ #else
SUPERLU_FREE (bigV);
SUPERLU_FREE (bigU);
-#endif
-
+ #endif
+ /* Decrement freed memory from memory stat. */
log_memory(-(bigv_size + bigu_size) * dword, stat);
- // printf("Debug : MPI buffers 5\n");
+#endif
SUPERLU_FREE (Llu->ujrow);
- SUPERLU_FREE (tempv2d);
+ // SUPERLU_FREE (tempv2d);/* Sherry */
SUPERLU_FREE (indirect);
SUPERLU_FREE (indirect2); /* Sherry added */
SUPERLU_FREE (iuip);
@@ -1727,7 +1903,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SUPERLU_FREE(omp_loop_time);
SUPERLU_FREE(full_u_cols);
SUPERLU_FREE(blk_ldu);
+#if ( PRNTlevel>=1 )
log_memory(-2 * ncb * dword, stat);
+#endif
SUPERLU_FREE(lookAheadFullRow);
SUPERLU_FREE(lookAheadStRow);
@@ -1761,8 +1939,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if ( iinfo == n + 1 ) *info = 0;
else *info = iinfo;
- // printf("test out\n");
-
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
@@ -1777,13 +1953,29 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
MPI_Reduce (&msg_vol, &msg_vol_max,
1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
- if (!iam) {
+ if ( iam==0 ) {
printf ("\tPDGSTRF comm stat:"
"\tAvg\tMax\t\tAvg\tMax\n"
"\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
msg_cnt_sum / Pr / Pc, msg_cnt_max,
msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
+ printf("\t\tcomm time on task 0: %8.2lf\n"
+ "\t\t\tcomm down DIAG block %8.2lf\n"
+ "\t\t\tcomm right L panel %8.2lf\n"
+ "\t\t\tcomm down U panel %8.2lf\n",
+ stat->utime[COMM], stat->utime[COMM_DIAG],
+ stat->utime[COMM_RIGHT], stat->utime[COMM_DOWN]);
+ //#include <float.h>
+ //int Digs = DECIMAL_DIG;
+ printf("gemm_count %d\n", gemm_count);
+ for (i = 0; i < gemm_count; ++i)
+ fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n,
+ gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]);
+
+ fclose(fgemm);
}
+ SUPERLU_FREE(gemm_stats);
+ SUPERLU_FREE(prof_sendR);
}
#endif
@@ -1796,7 +1988,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
printf (".. # total msg\t%d\n", iinfo);
#endif
-#if ( DEBUGlevel>=2 )
+#if ( DEBUGlevel>=3 )
for (i = 0; i < Pr * Pc; ++i) {
if (iam == i) {
dPrintLblocks (iam, nsupers, grid, Glu_persist, Llu);
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index 06f0f37..bdff2eb 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -14,10 +14,13 @@ at the top-level directory.
* \brief Performs panel LU factorization.
*
* <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* August 15, 2014
*
+ * Modified:
+ * September 30, 2017
+ *
* <pre>
* Purpose
* =======
@@ -97,6 +100,7 @@ pdgstrf2_trsm
int_t Pr;
MPI_Status status;
MPI_Comm comm = (grid->cscp).comm;
+ double t1, t2;
/* Initialization. */
iam = grid->iam;
@@ -128,16 +132,25 @@ pdgstrf2_trsm
if ( U_diag_blk_send_req &&
U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
/* There are pending sends - wait for all Isend to complete */
- for (pr = 0; pr < Pr; ++pr)
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
+ for (pr = 0; pr < Pr; ++pr) {
if (pr != myrow) {
MPI_Wait (U_diag_blk_send_req + pr, &status);
}
-
+ }
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DIAG] += t2;
+#endif
/* flag no more outstanding send request. */
U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL;
}
if (iam == pkk) { /* diagonal process */
+ /* ++++ First step compute diagonal block ++++++++++ */
for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */
/* Diagonal pivot */
i = luptr;
@@ -196,13 +209,16 @@ pdgstrf2_trsm
} /* for column j ... first loop */
- /* ++++++++++second step ====== */
+ /* ++++ Second step compute off-diagonal block with communication ++*/
ublk_ptr = ujrow = Llu->ujrow;
- if (U_diag_blk_send_req && iam == pkk) { /* Send the U block */
+ if (U_diag_blk_send_req && iam == pkk) { /* Send the U block downward */
/** ALWAYS SEND TO ALL OTHERS - TO FIX **/
- for (pr = 0; pr < Pr; ++pr)
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
+ for (pr = 0; pr < Pr; ++pr) {
if (pr != krow) {
/* tag = ((k0<<2)+2) % tag_ub; */
/* tag = (4*(nsupers+k0)+2) % tag_ub; */
@@ -211,6 +227,12 @@ pdgstrf2_trsm
comm, U_diag_blk_send_req + pr);
}
+ }
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DIAG] += t2;
+#endif
/* flag outstanding Isend */
U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */
@@ -218,8 +240,6 @@ pdgstrf2_trsm
/* pragma below would be changed by an MKL call */
- char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
-
l = nsupr - nsupc;
// n = nsupc;
double alpha = 1.0;
@@ -229,32 +249,36 @@ pdgstrf2_trsm
#endif
#if defined (USE_VENDOR_BLAS)
- dtrsm_ (&side, &uplo, &transa, &diag,
- &l, &nsupc,
+ dtrsm_ ("R", "U", "N", "N", &l, &nsupc,
&alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr,
1, 1, 1, 1);
#else
- dtrsm_ (&side, &uplo, &transa, &diag,
- &l, &nsupc,
+ dtrsm_ ("R", "U", "N", "N", &l, &nsupc,
&alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
#endif
-
+ stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * l;
} else { /* non-diagonal process */
- /* ================================================ *
- * Receive the diagonal block of U *
- * for panel factorization of L(:,k) *
- * note: we block for panel factorization of L(:,k) *
- * but panel factorization of U(:,k) don't *
- * ================================================ */
+ /* ================================================================== *
+ * Receive the diagonal block of U for panel factorization of L(:,k). *
+ * Note: we block for panel factorization of L(:,k), but panel *
+ * factorization of U(:,k) do not block *
+ * ================================================================== */
/* tag = ((k0<<2)+2) % tag_ub; */
/* tag = (4*(nsupers+k0)+2) % tag_ub; */
// printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0));
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Recv (ublk_ptr, (nsupc * nsupc), MPI_DOUBLE, krow,
SLU_MPI_TAG (4, k0) /* tag */ ,
comm, &status);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DIAG] += t2;
+#endif
if (nsupr > 0) {
- char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
double alpha = 1.0;
#ifdef PI_DEBUG
@@ -263,17 +287,16 @@ pdgstrf2_trsm
printf (" Rank :%d \t Empty block column occured :\n", iam);
#endif
#if defined (USE_VENDOR_BLAS)
- dtrsm_ (&side, &uplo, &transa, &diag,
- &nsupr, &nsupc,
+ dtrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
&alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1);
#else
- dtrsm_ (&side, &uplo, &transa, &diag,
- &nsupr, &nsupc,
+ dtrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
&alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr);
#endif
+ stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * nsupr;
}
- } /* end if pkk ... */
+ } /* end if pkk ... */
/* printf("exiting pdgstrf2 %d \n", grid->iam); */
@@ -300,12 +323,10 @@ void pdgstrs2_omp
int_t *usub;
double *lusup, *uval;
-#ifdef _OPENMP
- int thread_id = omp_get_thread_num ();
- int num_thread = omp_get_num_threads ();
-#else
- int thread_id = 0;
- int num_thread = 1;
+#if 0
+ //#ifdef USE_VTUNE
+ __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+ __itt_resume(); // start VTune, again use 2 underscores
#endif
/* Quick return. */
@@ -315,15 +336,12 @@ void pdgstrs2_omp
/* Initialization. */
iam = grid->iam;
pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
- int k_row_cycle = k / grid->nprow; /* for which cycle k exist (to assign rowwise thread blocking) */
- int gb_col_cycle; /* cycle through block columns */
+ //int k_row_cycle = k / grid->nprow; /* for which cycle k exist (to assign rowwise thread blocking) */
+ //int gb_col_cycle; /* cycle through block columns */
klst = FstBlockC (k + 1);
knsupc = SuperSize (k);
usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
uval = Llu->Unzval_br_ptr[lk];
- nb = usub[0];
- iukp = BR_HEADER;
- rukp = 0;
if (iam == pkk) {
lk = LBj (k, grid);
nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
@@ -333,28 +351,45 @@ void pdgstrs2_omp
lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)];
}
- /* Loop through all the row blocks. */
- for (b = 0; b < nb; ++b) {
- /* assuming column cyclic distribution of data among threads */
- gb = usub[iukp];
- gb_col_cycle = gb / grid->npcol;
- nsupc = SuperSize (gb);
- iukp += UB_DESCRIPTOR;
+ /////////////////////new-test//////////////////////////
+ /* !! Taken from Carl/SuperLU_DIST_5.1.0/EXAMPLE/pdgstrf2_v3.c !! */
+
+ /* Master thread: set up pointers to each block in the row */
+ nb = usub[0];
+ iukp = BR_HEADER;
+ rukp = 0;
+
+ int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int));
+ int* blocks_value_pointers = blocks_index_pointers + nb;
+ int* nsupc_temp = blocks_value_pointers + nb;
+ for (b = 0; b < nb; b++) { /* set up pointers to each block */
+ blocks_index_pointers[b] = iukp + UB_DESCRIPTOR;
+ blocks_value_pointers[b] = rukp;
+ gb = usub[iukp];
+ rukp += usub[iukp+1];
+ nsupc = SuperSize( gb );
+ nsupc_temp[b] = nsupc;
+ iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */
+ }
+
+ // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
+ // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#pragma omp parallel for schedule(static) default(shared) \
+ private(b,j,iukp,rukp,segsize)
+ /* Loop through all the blocks in the row. */
+ for (b = 0; b < nb; ++b) {
+ iukp = blocks_index_pointers[b];
+ rukp = blocks_value_pointers[b];
/* Loop through all the segments in the block. */
- for (j = 0; j < nsupc; ++j) {
-#ifdef PI_DEBUG
- printf("segsize %d klst %d usub[%d] : %d",segsize,klst ,iukp,usub[iukp]);
-#endif
+ for (j = 0; j < nsupc_temp[b]; j++) {
segsize = klst - usub[iukp++];
- if (segsize) { /* Nonzero segment. */
- luptr = (knsupc - segsize) * (nsupr + 1);
+ if (segsize) {
+#pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30)
+ { /* Nonzero segment. */
+ int_t luptr = (knsupc - segsize) * (nsupr + 1);
+ //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr);
- /* if gb belongs to present thread then do the factorize */
- if ((gb_col_cycle + k_row_cycle + 1) % num_thread == thread_id) {
-#ifdef PI_DEBUG
- printf ("dtrsv param 4 %d param 6 %d\n", segsize, nsupr);
-#endif
#if defined (USE_VENDOR_BLAS)
dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
&uval[rukp], &incx, 1, 1, 1);
@@ -362,14 +397,22 @@ void pdgstrs2_omp
dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
&uval[rukp], &incx);
#endif
- }
+ } /* end task */
+ rukp += segsize;
+ stat->ops[FACT] += segsize * (segsize + 1);
+ } /* end if segsize > 0 */
+ } /* end for j in parallel ... */
+/* #pragma omp taskwait */
+ } /* end for b ... */
- if (thread_id == 0)
- stat->ops[FACT] += segsize * (segsize + 1); // master thread updated the stats
- rukp += segsize;
- }
- }
- } /* for b ... */
+ /* Deallocate memory */
+ SUPERLU_FREE(blocks_index_pointers);
+
+#if 0
+ //#ifdef USE_VTUNE
+ __itt_pause(); // stop VTune
+ __SSC_MARK(0x222); // stop SDE tracing
+#endif
} /* PDGSTRS2_omp */
diff --git a/SRC/psymbfact.h b/SRC/psymbfact.h
index b65f382..549e51e 100644
--- a/SRC/psymbfact.h
+++ b/SRC/psymbfact.h
@@ -279,8 +279,10 @@ typedef struct {
/* Code for the type of the memory to expand */
#define USUB_PR 0
#define LSUB_PR 1
+/* Sherry: the following are already defined in superlu_enum_const.h
#define USUB 0
#define LSUB 1
+*/
/*
* Code for the type of computation - right looking (RL_SYMB); left
@@ -297,6 +299,3 @@ typedef struct {
#endif /* __SUPERLU_DIST_PSYMBFACT */
-
-
-
diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c
index 288e6eb..cf92c9f 100644
--- a/SRC/pzgssvx.c
+++ b/SRC/pzgssvx.c
@@ -649,8 +649,10 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
}
/* ------------------------------------------------------------
- Diagonal scaling to equilibrate the matrix. (simple scheme)
- ------------------------------------------------------------*/
+ * Diagonal scaling to equilibrate the matrix. (simple scheme)
+ * for row i = 1:n, A(i,:) <- A(i,:) / max(abs(A(i,:));
+ * for column j = 1:n, A(:,j) <- A(:, j) / max(abs(A(:,j))
+ * ------------------------------------------------------------*/
if ( Equil ) {
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter equil");
@@ -727,7 +729,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
#if ( PRNTlevel>=1 )
if ( !iam ) {
printf(".. equilibrated? *equed = %c\n", *equed);
- /*fflush(stdout);*/
+ fflush(stdout);
}
#endif
} /* end if Fact ... */
@@ -897,8 +899,10 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
t = SuperLU_timer_() - t;
stat->utime[ROWPERM] = t;
#if ( PRNTlevel>=1 )
- if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n",
- job, t);
+ if ( !iam ) {
+ printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+ fflush(stdout);
+ }
#endif
} /* end if Fact ... */
@@ -917,7 +921,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
else *(unsigned char *)norm = 'I';
anorm = pzlangs(norm, A, grid);
#if ( PRNTlevel>=1 )
- if ( !iam ) printf(".. anorm %e\n", anorm);
+ if ( !iam ) { printf(".. anorm %e\n", anorm); fflush(stdout); }
#endif
}
@@ -1021,9 +1025,11 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
/* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
the nonzero data structures for L & U. */
#if ( PRNTlevel>=1 )
- if ( !iam )
- printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
+ if ( !iam ) {
+ printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n",
sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+ fflush(stdout);
+ }
#endif
t = SuperLU_timer_();
if ( !(Glu_freeable = (Glu_freeable_t *)
@@ -1049,6 +1055,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
symb_mem_usage.for_lu*1e-6,
symb_mem_usage.total*1e-6,
symb_mem_usage.expansions);
+ fflush(stdout);
}
#endif
} else { /* symbfact out of memory */
@@ -1217,6 +1224,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
avg / grid->nprow / grid->npcol * 1e-6,
max * 1e-6);
printf("**************************************************\n");
+ fflush(stdout);
}
} /* end printing stats */
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index 61c3aa4..2c1eda0 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -13,7 +13,7 @@ at the top-level directory.
* \brief Performs LU factorization in parallel
*
* <pre>
- * -- Distributed SuperLU routine (version 4.3) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 1, 2014
*
@@ -24,7 +24,8 @@ at the top-level directory.
* July 12, 2011 static scheduling and arbitrary look-ahead
* March 13, 2013 change NTAGS to MPI_TAG_UB value
* September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
- * December 31, 2015 rename xMACH to xMACH_DIST
+ * December 31, 2015 rename xMACH to xMACH_DIST.
+ * September 30, 2017 optimization for Intel Knights Landing (KNL) node .
*
* Sketch of the algorithm
*
@@ -138,6 +139,14 @@ at the top-level directory.
*/
#define PHI_FRAMEWORK
+#if 0
+#define CACHELINE 64 /* bytes, Xeon Phi KNL */
+#else
+#define CACHELINE 0 /* not worry about false sharing of different threads */
+#endif
+//#define GEMM_PADLEN 1
+#define GEMM_PADLEN 8
+
#define PZGSTRF2 pzgstrf2_trsm
#define PZGSTRS2 pzgstrs2_omp
@@ -275,7 +284,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
int_t *iuip, *ruip; /* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
doublecomplex *ucol;
int *indirect, *indirect2;
- doublecomplex *tempv, *tempv2d;
+ int_t *tempi;
+ doublecomplex *tempu, *tempv, *tempr;
+ /* doublecomplex *tempv2d, *tempU2d; Sherry */
int iinfo;
int *ToRecv, *ToSendD, **ToSendR;
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
@@ -283,8 +294,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
superlu_scope_t *scp;
float s_eps;
double thresh;
- doublecomplex *tempU2d, *tempu;
- int full, ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
+ /*int full;*/
+ int ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks;
int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows,
*Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l,
*nnodes_u, *edag_supno_l, *recvbuf, **edag_supno;
@@ -298,10 +309,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
* 2 : transferred in Usub_buf[]
* 3 : transferred in Uval_buf[]
*/
- int **msgcnts, **msgcntsU; /* counts for each panel in the
- look-ahead window */
- int *factored; /* factored[j]==0 : L col panel j is factorized */
- int *factoredU; /* factoredU[i]==1 : U row panel i is factorized */
+ int **msgcnts, **msgcntsU; /* counts in the look-ahead window */
+ int *factored; /* factored[j] == 0 : L col panel j is factorized. */
+ int *factoredU; /* factoredU[i] == 1 : U row panel i is factorized. */
int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows;
etree_node *head, *tail, *ptr;
int *num_child;
@@ -314,16 +324,19 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
void *attr_val;
int flag;
+ /* The following variables are used to pad GEMM dimensions so that
+ each is a multiple of vector length (8 doubles for KNL) */
+ int gemm_m_pad = GEMM_PADLEN, gemm_k_pad = GEMM_PADLEN,
+ gemm_n_pad = GEMM_PADLEN;
+ int gemm_padding = 0;
+
int iword = sizeof (int_t);
int dword = sizeof (doublecomplex);
- /* For measuring load imbalence in omp threads*/
+ /* For measuring load imbalence in omp threads */
double omp_load_imblc = 0.0;
double *omp_loop_time;
- double CPUOffloadTimer = 0;
- double CPUOffloadFlop = 0;
- double CPUOffloadMop = 0;
double schur_flop_timer = 0.0;
double pdgstrf2_timer = 0.0;
double pdgstrs2_timer = 0.0;
@@ -331,8 +344,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
double InitTimer = 0.0; /* including compute schedule, malloc */
double tt_start, tt_end;
-#if !defined( GPU_ACC )
- /* Counter for couting memory operations */
+/* #if !defined( GPU_ACC ) */
+ /* Counters for memory operations and timings */
double scatter_mem_op_counter = 0.0;
double scatter_mem_op_timer = 0.0;
double scatterL_mem_op_counter = 0.0;
@@ -340,6 +353,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
double scatterU_mem_op_counter = 0.0;
double scatterU_mem_op_timer = 0.0;
+ /* Counters for flops/gather/scatter and timings */
double GatherLTimer = 0.0;
double LookAheadRowSepMOP = 0.0;
double GatherUTimer = 0.0;
@@ -349,10 +363,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
double LookAheadScatterTimer = 0.0;
double LookAheadScatterMOP = 0.0;
double RemainGEMMTimer = 0.0;
+ double RemainGEMM_flops = 0.0;
double RemainScatterTimer = 0.0;
double NetSchurUpTimer = 0.0;
double schur_flop_counter = 0.0;
-#endif
+/* #endif */
#if ( PRNTlevel>= 1)
/* count GEMM max dimensions */
@@ -368,6 +383,15 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
double t1, t2;
float msg_vol = 0, msg_cnt = 0;
+ double comm_wait_time = 0.0;
+ /* Record GEMM dimensions and times */
+ FILE *fopen(), *fgemm;
+ int gemm_count = 0;
+ typedef struct {
+ int m, n, k;
+ double microseconds;
+ } gemm_profile;
+ gemm_profile *gemm_stats;
#endif
/* Test the input parameters. */
@@ -383,6 +407,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
/* Quick return if possible. */
if (m == 0 || n == 0) return 0;
+
+ double tt1 = SuperLU_timer_ ();
/*
* Initialization.
@@ -405,8 +431,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
int tag_ub = *(int *) attr_val;
#if ( PRNTlevel>=1 )
- if (!iam)
- printf ("MPI tag upper bound = %d\n", tag_ub);
+ if (!iam) {
+ printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout);
+ }
#endif
#if ( DEBUGlevel>=1 )
@@ -414,6 +441,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
printf (" ***** warning s_eps = %e *****\n", s_eps);
CHECK_MALLOC (iam, "Enter pdgstrf()");
#endif
+#if (PROFlevel >= 1 )
+ gemm_stats = (gemm_profile *) SUPERLU_MALLOC(nsupers * sizeof(gemm_profile));
+ if (iam == 0) fgemm = fopen("dgemm_mnk.dat", "w");
+ int *prof_sendR = intCalloc_dist(nsupers);
+#endif
stat->ops[FACT] = 0.0;
stat->current_buffer = 0.0;
@@ -435,29 +467,37 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (i != 0) {
if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) )
ABORT ("Malloc fails for Lsub_buf.");
+ tempi = Llu->Lsub_buf_2[0];
for (jj = 0; jj < num_look_aheads; jj++)
- Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
+ Llu->Lsub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */
+ //Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i;
}
i = Llu->bufmax[1];
if (i != 0) {
if (!(Llu->Lval_buf_2[0] = doublecomplexMalloc_dist ((num_look_aheads + 1) * ((size_t) i))))
ABORT ("Malloc fails for Lval_buf[].");
+ tempr = Llu->Lval_buf_2[0];
for (jj = 0; jj < num_look_aheads; jj++)
- Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
+ Llu->Lval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */
+ //Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i;
}
i = Llu->bufmax[2];
if (i != 0) {
if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i)))
ABORT ("Malloc fails for Usub_buf_2[].");
+ tempi = Llu->Usub_buf_2[0];
for (jj = 0; jj < num_look_aheads; jj++)
- Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
+ Llu->Usub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */
+ //Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i;
}
i = Llu->bufmax[3];
if (i != 0) {
if (!(Llu->Uval_buf_2[0] = doublecomplexMalloc_dist ((num_look_aheads + 1) * i)))
ABORT ("Malloc fails for Uval_buf_2[].");
+ tempr = Llu->Uval_buf_2[0];
for (jj = 0; jj < num_look_aheads; jj++)
- Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
+ Llu->Uval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */
+ //Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i;
}
}
@@ -519,15 +559,16 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t))))
ABORT ("Malloc fails for factoredU[].");
for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1;
+
log_memory(2 * nsupers * iword, stat);
int num_threads = 1;
#ifdef _OPENMP
#pragma omp parallel default(shared)
+ #pragma omp master
{
- if (omp_get_thread_num () == 0) {
- num_threads = omp_get_num_threads ();
- }
+ //if (omp_get_thread_num () == 0)
+ num_threads = omp_get_num_threads ();
}
#endif
@@ -538,9 +579,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#endif
#if ( PRNTlevel>=1 )
- if(!iam) printf(".. Starting with %d OpenMP threads \n", num_threads );
+ if(!iam) {
+ printf(".. Starting with %d OpenMP threads \n", num_threads );
+ fflush(stdout);
+ }
#endif
- double tt1 = SuperLU_timer_ ();
nblocks = 0;
ncb = nsupers / Pc; /* number of column blocks, horizontal */
@@ -556,10 +599,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
full_u_cols = SUPERLU_MALLOC(ncb * sizeof(int));
blk_ldu = SUPERLU_MALLOC(ncb * sizeof(int));
#endif
- log_memory(2 * ncb * iword, stat);
-
- /* insert a check condition here */
+ log_memory(2 * ncb * iword, stat);
#if 0 /* Sherry: not used? */
/* This bunch is used for static scheduling */
@@ -595,11 +636,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int));
look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int));
- for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1;
+ for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */
log_memory(3 * nsupers * iword, stat);
- /* go through U-factor */
- for (lb = 0; lb < nrb; ++lb) {
+ /* Sherry: omp parallel?
+ not worth doing, due to concurrent write to look_ahead_l[jb] */
+ for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */
ib = lb * Pr + myrow;
index = Llu->Ufstnz_br_ptr[lb];
if (index) { /* Not an empty row */
@@ -613,7 +655,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
}
}
}
- if (myrow < nsupers % grid->nprow) {
+ if (myrow < nsupers % grid->nprow) { /* leftover block rows */
ib = nrb * Pr + myrow;
index = Llu->Ufstnz_br_ptr[nrb];
if (index) { /* Not an empty row */
@@ -629,8 +671,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
}
if (options->SymPattern == NO) {
- /* go through L-factor */
- for (lb = 0; lb < ncb; lb++) {
+ /* Sherry: omp parallel?
+ not worth doing, due to concurrent write to look_ahead_l[jb] */
+ for (lb = 0; lb < ncb; lb++) { /* go through L-factor */
ib = lb * Pc + mycol;
index = Llu->Lrowind_bc_ptr[lb];
if (index) {
@@ -644,7 +687,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
}
}
}
- if (mycol < nsupers % grid->npcol) {
+ if (mycol < nsupers % grid->npcol) { /* leftover block columns */
ib = ncb * Pc + mycol;
index = Llu->Lrowind_bc_ptr[ncb];
if (index) {
@@ -678,8 +721,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
/* Instead of half storage, we'll do full storage */
if (!(Llu->ujrow = doublecomplexCalloc_dist (k * k)))
ABORT ("Malloc fails for ujrow[].");
- log_memory(k * k * iword, stat);
#endif
+ log_memory(k * k * iword, stat);
#if ( PRNTlevel>=1 )
if (!iam) {
@@ -690,6 +733,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
(long int) Llu->bufmax[0], (long int) Llu->bufmax[1],
(long int) Llu->bufmax[2], (long int) Llu->bufmax[3],
(long int) Llu->bufmax[4]);
+ fflush(stdout);
}
#endif
@@ -704,26 +748,30 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
ldt = sp_ienv_dist (3); /* Size of maximum supernode */
k = CEILING (nsupers, Pr); /* Number of local block rows */
- /* Following circuit is for finding maximum block size */
+ /* Following code is for finding maximum row dimension of all L panels */
int local_max_row_size = 0;
int max_row_size;
- for (int i = 0; i < nsupers; ++i) {
- int tpc = PCOL (i, grid);
- if (mycol == tpc) {
- lk = LBj (i, grid);
- lsub = Lrowind_bc_ptr[lk];
- if (lsub != NULL) {
- local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
- }
- }
+#if 0
+#if defined _OPENMP // Sherry: parallel reduction -- seems slower?
+#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub)
+#endif
+#endif
+ for (int i = mycol; i < nsupers; i += Pc) { /* grab my local columns */
+ //int tpc = PCOL (i, grid);
+ lk = LBj (i, grid);
+ lsub = Lrowind_bc_ptr[lk];
+ if (lsub != NULL) {
+ if (lsub[1] > local_max_row_size) local_max_row_size = lsub[1];
+ }
}
- /* Max row size is global reduction of within A row */
- MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, (grid->rscp.comm));
+ /* Max row size is global reduction within a row */
+ MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+ (grid->rscp.comm));
- /* Buffer size is max of look ahead window */
+ /* Buffer size is max of look-ahead window */
/* int_t buffer_size =
SUPERLU_MAX (max_row_size * num_threads * ldt,
get_max_buffer_size ()); */
@@ -758,15 +806,24 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
Glu_persist, grid, perm_u );
#endif
+ /* +16 to avoid cache line false sharing */
+ int_t bigv_size = SUPERLU_MAX(max_row_size * (bigu_size / ldt),
+ (ldt*ldt + CACHELINE / dword) * num_threads);
+
/* bigU and bigV are either on CPU or on GPU, not both. */
doublecomplex* bigU; /* for storing entire U(k,:) panel, prepare for GEMM.
- bigU has the same size either on CPU or on CPU. */
- doublecomplex* bigV; /* for GEMM output matrix, i.e. update matrix.
- On CPU, bigV is small for block-by-block update.
- On GPU, bigV is large to hold the aggregate GEMM output.*/
+ bigU has the same size either on CPU or on CPU. */
+ doublecomplex* bigV; /* for storing GEMM output matrix, i.e. update matrix.
+ bigV is large to hold the aggregate GEMM output.*/
#if ( PRNTlevel>=1 )
- if(!iam) printf("[%d] .. BIG U bigu_size " IFMT " (same either on CPU or GPU)\n", iam, bigu_size);
+ if(!iam) {
+ printf("max_nrows in L panel %d\n", max_row_size);
+ printf("\t.. GEMM buffer size: max_nrows X max_ncols = %d x %d\n",
+ max_row_size, (bigu_size / ldt));
+ printf(".. BIG U size %d\t BIG V size %d\n", bigu_size, bigv_size);
+ fflush(stdout);
+ }
#endif
#ifdef GPU_ACC
@@ -774,7 +831,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if ( checkCuda(cudaHostAlloc((void**)&bigU, bigu_size * sizeof(doublecomplex), cudaHostAllocDefault)) )
ABORT("Malloc fails for zgemm buffer U ");
- int bigv_size = buffer_size;
+ bigv_size = buffer_size;
#if ( PRNTlevel>=1 )
if (!iam) printf("[%d] .. BIG V bigv_size %d, using buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size);
#endif
@@ -830,18 +887,24 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3)
+ bigu_size + buffer_size ) * dword;
-#else /* not to use GPU */
+#else /* not CUDA */
+ // for GEMM padding 0
+ j = bigu_size / ldt;
+ bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad));
+ bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
+
+#ifdef __INTEL_COMPILER
+ bigU = _mm_malloc(bigu_size * sizeof(doublecomplex), 1<<12); // align at 4K page
+ bigV = _mm_malloc(bigv_size * sizeof(doublecomplex), 1<<12);
+#else
if ( !(bigU = doublecomplexMalloc_dist(bigu_size)) )
- ABORT ("Malloc fails for zgemm u buff U");
+ ABORT ("Malloc fails for zgemm U buffer");
//Maximum size of bigU= sqrt(buffsize) ?
-
- int bigv_size = 8 * ldt * ldt * num_threads;
-#if ( PRNTlevel>=1 )
- if (!iam) printf("[%d] .. BIG V size (on CPU) %d\n", iam, bigv_size);
-#endif
+ // int bigv_size = 8 * ldt * ldt * num_threads;
if ( !(bigV = doublecomplexMalloc_dist(bigv_size)) )
- ABORT ("Malloc failed for zgemm buffer V");
+ ABORT ("Malloc failed for zgemm V buffer");
+#endif
#endif /* end ifdef GPU_ACC */
@@ -853,21 +916,27 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if(!iam) {
printf (" Max row size is %d \n", max_row_size);
printf (" Threads per process %d \n", num_threads);
- /* printf (" Using buffer_size of %d \n", buffer_size); */
+ fflush(stdout);
}
+
#endif
+#if 0 /* Sherry */
if (!(tempv2d = doublecomplexCalloc_dist (2 * ((size_t) ldt) * ldt)))
ABORT ("Calloc fails for tempv2d[].");
tempU2d = tempv2d + ldt * ldt;
- if (!(indirect = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+#endif
+ /* Sherry: (ldt + 16), avoid cache line false sharing.
+ KNL cacheline size = 64 bytes = 16 int */
+ iinfo = ldt + CACHELINE / sizeof(int);
+ if (!(indirect = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int))))
ABORT ("Malloc fails for indirect[].");
- if (!(indirect2 = SUPERLU_MALLOC (ldt * num_threads * sizeof(int))))
+ if (!(indirect2 = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int))))
ABORT ("Malloc fails for indirect[].");
if (!(iuip = intMalloc_dist (k))) ABORT ("Malloc fails for iuip[].");
if (!(ruip = intMalloc_dist (k))) ABORT ("Malloc fails for ruip[].");
- log_memory(2 * ldt *ldt * dword + 2 * ldt * num_threads * iword
+ log_memory(2 * ldt*ldt * dword + 2 * iinfo * num_threads * iword
+ 2 * k * iword, stat);
int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib,
@@ -897,13 +966,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#else
Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t));
#endif
- log_memory(4 * mrb * iword + mrb * sizeof(Remain_info_t), stat);
- doublecomplex *lookAhead_L_buff, *Remain_L_buff;
+ doublecomplex *lookAhead_L_buff, *Remain_L_buff; /* Stores entire L-panel */
Ublock_info_t *Ublock_info;
- ldt = sp_ienv_dist (3); /* max supernode size */
+ ldt = sp_ienv_dist (3); /* max supernode size */
+ /* The following is quite loose */
lookAhead_L_buff = doublecomplexMalloc_dist(ldt*ldt* (num_look_aheads+1) );
- log_memory(ldt * ldt * (num_look_aheads+1) * dword, stat);
#if 0
Remain_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex)*(Llu->bufmax[1]),64);
@@ -912,13 +980,18 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64);
int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64);
#else
- Remain_L_buff = doublecomplexMalloc_dist(Llu->bufmax[1]);
+ j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad);
+ Remain_L_buff = doublecomplexMalloc_dist(Llu->bufmax[1] + j); /* This is loose */
Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t));
int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int));
#endif
- log_memory(Llu->bufmax[1] * dword, stat);
+
+ long long alloc_mem = 4 * mrb * iword + mrb * sizeof(Remain_info_t)
+ + ldt * ldt * (num_look_aheads+1) * dword
+ + Llu->bufmax[1] * dword ;
+ log_memory(alloc_mem, stat);
InitTimer = SuperLU_timer_() - tt1;
@@ -928,7 +1001,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
** Handle first block column separately to start the pipeline. **
################################################################## */
look_id = 0;
- msgcnt = msgcnts[0]; /* First count in the window */
+ msgcnt = msgcnts[0]; /* Lsub[0] to be transferred */
send_req = send_reqs[0];
recv_req = recv_reqs[0];
@@ -952,7 +1025,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
if (lsub) {
+ /* number of entries in Lsub_buf[] to be transferred */
msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+ /* number of entries in Lval_buf[] to be transferred */
msgcnt[1] = lsub[1] * SuperSize (k);
} else {
msgcnt[0] = msgcnt[1] = 0;
@@ -964,9 +1039,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
TIC (t1);
#endif
- MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, SLU_MPI_TAG (0, 0) /* 0 */ ,
+ MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+ SLU_MPI_TAG (0, 0) /* 0 */,
scp->comm, &send_req[pj]);
- MPI_Isend (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, SLU_MPI_TAG (1, 0) /* 1 */ ,
+ MPI_Isend (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+ SLU_MPI_TAG (1, 0) /* 1 */,
scp->comm, &send_req[pj + Pc]);
#if ( DEBUGlevel>=2 )
printf ("[%d] first block cloumn Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
@@ -976,6 +1053,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+ ++prof_sendR[lk];
msg_cnt += 2;
msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
#endif
@@ -984,12 +1063,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
} else { /* Post immediate receives. */
if (ToRecv[k] >= 1) { /* Recv block column L(:,0). */
scp = &grid->rscp; /* The scope of process row. */
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol,
SLU_MPI_TAG (0, 0) /* 0 */ ,
scp->comm, &recv_req[0]);
MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol,
SLU_MPI_TAG (1, 0) /* 1 */ ,
scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
}
} /* end if mycol == 0 */
@@ -1001,12 +1088,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
scp = &grid->cscp; /* The scope of process column. */
Usub_buf = Llu->Usub_buf_2[0];
Uval_buf = Llu->Uval_buf_2[0];
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
SLU_MPI_TAG (2, 0) /* 2%tag_ub */ ,
scp->comm, &recv_reqs_u[0][0]);
MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow,
SLU_MPI_TAG (3, 0) /* 3%tag_ub */ ,
scp->comm, &recv_reqs_u[0][1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
+#endif
}
}
@@ -1034,7 +1129,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
kk = perm_c_supno[kk0]; /* use the ordering from static schedule */
look_id = kk0 % (1 + num_look_aheads); /* which column in window */
- if (look_ahead[kk] < k0) { /* does not depend on current column */
+ if (look_ahead[kk] < k0) { /* does not depend on current column k */
kcol = PCOL (kk, grid);
if (mycol == kcol) { /* I own this panel */
@@ -1053,7 +1148,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
msgcnt = msgcnts[look_id]; /* point to the proper count array */
send_req = send_reqs[look_id];
- lk = LBj (kk, grid); /* Local block number in L */
+ lk = LBj (kk, grid); /* Local block number in L. */
lsub1 = Lrowind_bc_ptr[lk];
if (lsub1) {
msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */
@@ -1066,12 +1161,21 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
for (pj = 0; pj < Pc; ++pj) {
if (ToSendR[lk][pj] != EMPTY) {
lusup1 = Lnzval_bc_ptr[lk];
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
scp->comm, &send_req[pj]);
MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+ ++prof_sendR[lk];
+#endif
#if ( DEBUGlevel>=2 )
printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n",
iam, kk, msgcnt[0], msgcnt[1], pj);
@@ -1084,7 +1188,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (ToRecv[kk] >= 1) {
scp = &grid->rscp; /* The scope of process row. */
recv_req = recv_reqs[look_id];
-
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
scp->comm, &recv_req[0]);
@@ -1092,29 +1198,41 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SuperLU_MPI_DOUBLE_COMPLEX, kcol,
SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
}
/* stat->time10 += SuperLU_timer_() - ttt1; */
} /* end if mycol == Pc(kk) */
- } /* end if look-ahead in L supernodes */
+ } /* end if look-ahead in L panels */
- /* post irecv for U-row look-ahead */
+ /* Pre-post irecv for U-row look-ahead */
krow = PROW (kk, grid);
if (myrow != krow) {
if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */
scp = &grid->cscp; /* The scope of process column. */
Usub_buf = Llu->Usub_buf_2[look_id];
Uval_buf = Llu->Uval_buf_2[look_id];
-
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ ,
scp->comm, &recv_reqs_u[look_id][0]);
MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow,
SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ ,
scp->comm, &recv_reqs_u[look_id][1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
+#endif
}
}
- } /* end for each column in look-ahead window for L supernodes */
+ } /* end for each column in look-ahead window for L panels */
/* stat->time4 += SuperLU_timer_()-tt1; */
@@ -1126,6 +1244,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
for (kk0 = kk1; kk0 < kk2; kk0++) {
kk = perm_c_supno[kk0]; /* order determined from static schedule */
if (factoredU[kk0] != 1 && look_ahead[kk] < k0) {
+ /* does not depend on current column k */
kcol = PCOL (kk, grid);
krow = PROW (kk, grid);
lk = LBj (kk, grid); /* Local block number across row. NOT USED?? -- Sherry */
@@ -1146,6 +1265,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
} else { /* Check to receive L(:,kk) from the left */
flag0 = flag1 = 0;
if ( ToRecv[kk] >= 1 ) {
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
if ( recv_req[0] != MPI_REQUEST_NULL ) {
MPI_Test (&recv_req[0], &flag0, &status);
if ( flag0 ) {
@@ -1161,7 +1283,14 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
recv_req[1] = MPI_REQUEST_NULL;
}
} else flag1 = 1;
- } else msgcnt[0] = 0;
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
+ } else {
+ msgcnt[0] = 0;
+ }
}
if (flag0 && flag1) { /* L(:,kk) is ready */
@@ -1171,10 +1300,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
factoredU[kk0] = 1;
/* Parallel triangular solve across process row *krow* --
U(k,j) = L(k,k) \ A(k,j). */
- /* double ttt2 = SuperLU_timer_(); */
double ttt2 = SuperLU_timer_();
#ifdef _OPENMP
-#pragma omp parallel
+/* #pragma omp parallel */ /* Sherry -- parallel done inside pzgstrs2 */
#endif
{
PZGSTRS2 (kk0, kk, Glu_persist, grid, Llu,
@@ -1226,7 +1354,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
/* stat->time2 += SuperLU_timer_()-tt1; */
} /* end if myrow == krow */
- } /* end if flag0 ... */
+ } /* end if flag0 & flag1 ... */
} /* end if factoredU[] ... */
} /* end for kk0 ... */
@@ -1248,13 +1376,21 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (mycol == kcol) {
lk = LBj (k, grid); /* Local block number in L */
+#if ( PROFlevel>=1 )
+ TIC(t1);
+#endif
for (pj = 0; pj < Pc; ++pj) {
- /* Wait for Isend to complete before using lsub/lusup buffer */
+ /* Wait for Isend to complete before using lsub/lusup buffer. */
if (ToSendR[lk][pj] != EMPTY) {
MPI_Wait (&send_req[pj], &status);
MPI_Wait (&send_req[pj + Pc], &status);
}
}
+#if ( PROFlevel>=1 )
+ TOC(t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
} else {
@@ -1265,8 +1401,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
/* ============================================= *
* Waiting for L(:,kk) for outer-product uptate *
* if iam in U(kk,:), then the diagonal block *
- * did not reach in time for panel factorization *
- * of U(k,:) *
+ * did not reach in time for panel factorization *
+ * of U(k,:). *
* ============================================= */
#if ( PROFlevel>=1 )
TIC (t1);
@@ -1298,6 +1434,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
#endif
#if ( DEBUGlevel>=2 )
printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n",
@@ -1315,7 +1452,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
lsub = Lsub_buf_2[look_id];
lusup = Lval_buf_2[look_id];
- } /* if mycol = Pc(k) */
+ } /* else if mycol = Pc(k) */
/* stat->time1 += SuperLU_timer_()-tt1; */
scp = &grid->cscp; /* The scope of process column. */
@@ -1331,7 +1468,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
U(k,j) = L(k,k) \ A(k,j). */
double ttt2 = SuperLU_timer_();
#ifdef _OPENMP
-#pragma omp parallel
+/* #pragma omp parallel */ /* Sherry -- parallel done inside pzgstrs2 */
#endif
{
PZGSTRS2 (k0, k, Glu_persist, grid, Llu, stat);
@@ -1350,7 +1487,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if (ToSendD[lk] == YES) {
for (pi = 0; pi < Pr; ++pi) {
- if (pi != myrow) {
+ if (pi != myrow) { /* Matching recv was pre-posted before */
#if ( PROFlevel>=1 )
TIC (t1);
#endif
@@ -1363,6 +1500,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
msg_cnt += 2;
msg_vol += msgcnt[2] * iword + msgcnt[3] * dword;
#endif
@@ -1373,20 +1511,28 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
} /* for pi ... */
} /* if ToSendD ... */
- } else { /* Panel U(k,:) already factorized */
+ } else { /* Panel U(k,:) already factorized from previous look-ahead */
/* ================================================ *
- * Wait for downward sending of U(k,:) to complete *
- * for outer-product update *
- * =============================================== */
+ * Wait for downward sending of U(k,:) to complete *
+ * for outer-product update. *
+ * ================================================ */
if (ToSendD[lk] == YES) {
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
for (pi = 0; pi < Pr; ++pi) {
if (pi != myrow) {
MPI_Wait (&send_reqs_u[look_id][pi], &status);
MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status);
}
}
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
+#endif
}
msgcnt[2] = msgcntsU[look_id][2];
msgcnt[3] = msgcntsU[look_id][3];
@@ -1395,9 +1541,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
} else { /* myrow != krow */
- /* ========================================= *
- * wait for U(k,:) for outer-product updates *
- * ========================================= */
+ /* ========================================== *
+ * Wait for U(k,:) for outer-product updates. *
+ * ========================================== */
if (ToRecv[k] == 2) { /* Recv block row U(k,:). */
#if ( PROFlevel>=1 )
@@ -1411,6 +1557,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
+ stat->utime[COMM_DOWN] += t2;
#endif
usub = Usub_buf;
uval = Uval_buf;
@@ -1484,8 +1631,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
j = jj0 = 0;
/************************************************************************/
+#if 0
+ for (jj = 0; jj < nub; ++jj) assert(perm_u[jj] == jj); /* Sherry */
+#endif
double ttx =SuperLU_timer_();
+//#include "zlook_ahead_update_v4.c"
#include "zlook_ahead_update.c"
lookaheadupdatetimer += SuperLU_timer_() - ttx;
@@ -1512,6 +1663,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
look_id = kk0 % (1 + num_look_aheads);
recv_req = recv_reqs[look_id];
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0],
mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
scp->comm, &recv_req[0]);
@@ -1519,6 +1673,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SuperLU_MPI_DOUBLE_COMPLEX, kcol,
SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
scp->comm, &recv_req[1]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+#endif
}
} else {
lk = LBj (kk, grid); /* Local block number. */
@@ -1551,15 +1710,24 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
scp = &grid->rscp; /* The scope of process row. */
for (pj = 0; pj < Pc; ++pj) {
if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */
scp->comm, &send_req[pj]);
MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */
scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_RIGHT] += t2;
+ ++prof_sendR[lk];
+#endif
}
- }
- } /* for pj ... */
+ } /* end for pj ... */
+ } /* if factored[kk] ... */
}
}
}
@@ -1575,6 +1743,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
#else
/*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/
+//#include "zSchCompUdt-2Ddynamic_v6.c"
+
#include "zSchCompUdt-2Ddynamic.c"
#endif
@@ -1584,7 +1754,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
NetSchurUpTimer += SuperLU_timer_() - tsch;
- } /* for k0 = 0, ... */
+ } /* MAIN LOOP for k0 = 0, ... */
/* ##################################################################
** END MAIN LOOP: for k0 = ...
@@ -1592,12 +1762,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer;
- /* updating total flops */
#if ( PRNTlevel>=1 )
+ /* Print detailed statistics */
+ /* Updating total flops */
+ double allflops;
+ MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM,
+ 0, grid->comm);
if ( iam==0 ) {
printf("\nInitialization time\t%8.2lf seconds\n"
"\t Serial: compute static schedule, allocate storage\n", InitTimer);
- printf("\n---- Time breakdown in factorization ----\n");
+ printf("\n==== Time breakdown in factorization (rank 0) ====\n");
+ printf("Panel factorization \t %8.2lf seconds\n",
+ pdgstrf2_timer + pdgstrs2_timer);
+ printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer);
+ printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer);
printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer);
printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer);
printf(".. Time to Gather L buffer\t %8.2lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer);
@@ -1606,21 +1784,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
printf(".. Time in GEMM %8.2lf \n",
LookAheadGEMMTimer + RemainGEMMTimer);
printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer);
- printf("\t* Remain\t %8.2lf \n", RemainGEMMTimer);
-
+ printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n",
+ RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9);
printf(".. Time to Scatter %8.2lf \n",
LookAheadScatterTimer + RemainScatterTimer);
printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer);
printf("\t* Remain\t %8.2lf \n", RemainScatterTimer);
- printf("Total Time in Factorization \t: %8.2lf seconds, \n", pxgstrfTimer);
- printf("Total time in Schur update with offload\t %8.2lf seconds,\n",CPUOffloadTimer );
+ printf("Total factorization time \t: %8.2lf seconds, \n", pxgstrfTimer);
printf("--------\n");
printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n);
}
#endif
-#if ( DEBUGlevel>=2 )
+#if ( DEBUGlevel>=3 )
for (i = 0; i < Pr * Pc; ++i) {
if (iam == i) {
zPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
@@ -1632,8 +1809,6 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
}
#endif
- // printf("Debug : MPI buffers 1\n");
-
/********************************************************
* Free memory *
********************************************************/
@@ -1673,7 +1848,6 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SUPERLU_FREE (factored);
log_memory(-(6 * nsupers * iword), stat);
-
for (i = 0; i <= num_look_aheads; i++) {
SUPERLU_FREE (msgcnts[i]);
SUPERLU_FREE (msgcntsU[i]);
@@ -1693,8 +1867,6 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SUPERLU_FREE (recv_reqs);
SUPERLU_FREE (send_reqs);
- // printf("Debug : MPI buffers 3\n");
-
#ifdef GPU_ACC
checkCuda (cudaFreeHost (bigV));
checkCuda (cudaFreeHost (bigU));
@@ -1705,15 +1877,19 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SUPERLU_FREE( streams );
SUPERLU_FREE( stream_end_col );
#else
+ #ifdef __INTEL_COMPILER
+ _mm_free (bigU);
+ _mm_free (bigV);
+ #else
SUPERLU_FREE (bigV);
SUPERLU_FREE (bigU);
-#endif
-
+ #endif
+ /* Decrement freed memory from memory stat. */
log_memory(-(bigv_size + bigu_size) * dword, stat);
- // printf("Debug : MPI buffers 5\n");
+#endif
SUPERLU_FREE (Llu->ujrow);
- SUPERLU_FREE (tempv2d);
+ // SUPERLU_FREE (tempv2d);/* Sherry */
SUPERLU_FREE (indirect);
SUPERLU_FREE (indirect2); /* Sherry added */
SUPERLU_FREE (iuip);
@@ -1727,7 +1903,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
SUPERLU_FREE(omp_loop_time);
SUPERLU_FREE(full_u_cols);
SUPERLU_FREE(blk_ldu);
+#if ( PRNTlevel>=1 )
log_memory(-2 * ncb * dword, stat);
+#endif
SUPERLU_FREE(lookAheadFullRow);
SUPERLU_FREE(lookAheadStRow);
@@ -1761,8 +1939,6 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
if ( iinfo == n + 1 ) *info = 0;
else *info = iinfo;
- // printf("test out\n");
-
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
@@ -1777,13 +1953,29 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
MPI_Reduce (&msg_vol, &msg_vol_max,
1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
- if (!iam) {
+ if ( iam==0 ) {
printf ("\tPZGSTRF comm stat:"
"\tAvg\tMax\t\tAvg\tMax\n"
"\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
msg_cnt_sum / Pr / Pc, msg_cnt_max,
msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
+ printf("\t\tcomm time on task 0: %8.2lf\n"
+ "\t\t\tcomm down DIAG block %8.2lf\n"
+ "\t\t\tcomm right L panel %8.2lf\n"
+ "\t\t\tcomm down U panel %8.2lf\n",
+ stat->utime[COMM], stat->utime[COMM_DIAG],
+ stat->utime[COMM_RIGHT], stat->utime[COMM_DOWN]);
+ //#include <float.h>
+ //int Digs = DECIMAL_DIG;
+ printf("gemm_count %d\n", gemm_count);
+ for (i = 0; i < gemm_count; ++i)
+ fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n,
+ gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]);
+
+ fclose(fgemm);
}
+ SUPERLU_FREE(gemm_stats);
+ SUPERLU_FREE(prof_sendR);
}
#endif
@@ -1796,7 +1988,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
printf (".. # total msg\t%d\n", iinfo);
#endif
-#if ( DEBUGlevel>=2 )
+#if ( DEBUGlevel>=3 )
for (i = 0; i < Pr * Pc; ++i) {
if (iam == i) {
zPrintLblocks (iam, nsupers, grid, Glu_persist, Llu);
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
index 3f63915..b4e3aca 100644
--- a/SRC/pzgstrf2.c
+++ b/SRC/pzgstrf2.c
@@ -13,10 +13,13 @@ at the top-level directory.
* \brief Performs panel LU factorization.
*
* <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* August 15, 2014
*
+ * Modified:
+ * September 30, 2017
+ *
* <pre>
* Purpose
* =======
@@ -96,6 +99,7 @@ pzgstrf2_trsm
int_t Pr;
MPI_Status status;
MPI_Comm comm = (grid->cscp).comm;
+ double t1, t2;
/* Initialization. */
iam = grid->iam;
@@ -127,16 +131,25 @@ pzgstrf2_trsm
if ( U_diag_blk_send_req &&
U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
/* There are pending sends - wait for all Isend to complete */
- for (pr = 0; pr < Pr; ++pr)
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
+ for (pr = 0; pr < Pr; ++pr) {
if (pr != myrow) {
MPI_Wait (U_diag_blk_send_req + pr, &status);
}
-
+ }
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DIAG] += t2;
+#endif
/* flag no more outstanding send request. */
U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL;
}
if (iam == pkk) { /* diagonal process */
+ /* ++++ First step compute diagonal block ++++++++++ */
for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */
/* Diagonal pivot */
i = luptr;
@@ -197,13 +210,16 @@ pzgstrf2_trsm
} /* for column j ... first loop */
- /* ++++++++++second step ====== */
+ /* ++++ Second step compute off-diagonal block with communication ++*/
ublk_ptr = ujrow = Llu->ujrow;
- if (U_diag_blk_send_req && iam == pkk) { /* Send the U block */
+ if (U_diag_blk_send_req && iam == pkk) { /* Send the U block downward */
/** ALWAYS SEND TO ALL OTHERS - TO FIX **/
- for (pr = 0; pr < Pr; ++pr)
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
+ for (pr = 0; pr < Pr; ++pr) {
if (pr != krow) {
/* tag = ((k0<<2)+2) % tag_ub; */
/* tag = (4*(nsupers+k0)+2) % tag_ub; */
@@ -212,6 +228,12 @@ pzgstrf2_trsm
comm, U_diag_blk_send_req + pr);
}
+ }
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DIAG] += t2;
+#endif
/* flag outstanding Isend */
U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */
@@ -219,8 +241,6 @@ pzgstrf2_trsm
/* pragma below would be changed by an MKL call */
- char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
-
l = nsupr - nsupc;
// n = nsupc;
doublecomplex alpha = {1.0, 0.0};
@@ -230,32 +250,36 @@ pzgstrf2_trsm
#endif
#if defined (USE_VENDOR_BLAS)
- ztrsm_ (&side, &uplo, &transa, &diag,
- &l, &nsupc,
+ ztrsm_ ("R", "U", "N", "N", &l, &nsupc,
&alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr,
1, 1, 1, 1);
#else
- ztrsm_ (&side, &uplo, &transa, &diag,
- &l, &nsupc,
+ ztrsm_ ("R", "U", "N", "N", &l, &nsupc,
&alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
#endif
-
+ stat->ops[FACT] += 4.0 * ((flops_t) nsupc * (nsupc+1) * l);
} else { /* non-diagonal process */
- /* ================================================ *
- * Receive the diagonal block of U *
- * for panel factorization of L(:,k) *
- * note: we block for panel factorization of L(:,k) *
- * but panel factorization of U(:,k) don't *
- * ================================================ */
+ /* ================================================================== *
+ * Receive the diagonal block of U for panel factorization of L(:,k). *
+ * Note: we block for panel factorization of L(:,k), but panel *
+ * factorization of U(:,k) do not block *
+ * ================================================================== */
/* tag = ((k0<<2)+2) % tag_ub; */
/* tag = (4*(nsupers+k0)+2) % tag_ub; */
// printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0));
+#if ( PROFlevel>=1 )
+ TIC (t1);
+#endif
MPI_Recv (ublk_ptr, (nsupc * nsupc), SuperLU_MPI_DOUBLE_COMPLEX, krow,
SLU_MPI_TAG (4, k0) /* tag */ ,
comm, &status);
+#if ( PROFlevel>=1 )
+ TOC (t2, t1);
+ stat->utime[COMM] += t2;
+ stat->utime[COMM_DIAG] += t2;
+#endif
if (nsupr > 0) {
- char uplo = 'u', side = 'r', transa = 'n', diag = 'n';
doublecomplex alpha = {1.0, 0.0};
#ifdef PI_DEBUG
@@ -264,17 +288,16 @@ pzgstrf2_trsm
printf (" Rank :%d \t Empty block column occured :\n", iam);
#endif
#if defined (USE_VENDOR_BLAS)
- ztrsm_ (&side, &uplo, &transa, &diag,
- &nsupr, &nsupc,
+ ztrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
&alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1);
#else
- ztrsm_ (&side, &uplo, &transa, &diag,
- &nsupr, &nsupc,
+ ztrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
&alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr);
#endif
+ stat->ops[FACT] += 4.0 * ((flops_t) nsupc * (nsupc+1) * nsupr);
}
- } /* end if pkk ... */
+ } /* end if pkk ... */
/* printf("exiting pzgstrf2 %d \n", grid->iam); */
@@ -301,12 +324,10 @@ void pzgstrs2_omp
int_t *usub;
doublecomplex *lusup, *uval;
-#ifdef _OPENMP
- int thread_id = omp_get_thread_num ();
- int num_thread = omp_get_num_threads ();
-#else
- int thread_id = 0;
- int num_thread = 1;
+#if 0
+ //#ifdef USE_VTUNE
+ __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+ __itt_resume(); // start VTune, again use 2 underscores
#endif
/* Quick return. */
@@ -316,15 +337,12 @@ void pzgstrs2_omp
/* Initialization. */
iam = grid->iam;
pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
- int k_row_cycle = k / grid->nprow; /* for which cycle k exist (to assign rowwise thread blocking) */
- int gb_col_cycle; /* cycle through block columns */
+ //int k_row_cycle = k / grid->nprow; /* for which cycle k exist (to assign rowwise thread blocking) */
+ //int gb_col_cycle; /* cycle through block columns */
klst = FstBlockC (k + 1);
knsupc = SuperSize (k);
usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
uval = Llu->Unzval_br_ptr[lk];
- nb = usub[0];
- iukp = BR_HEADER;
- rukp = 0;
if (iam == pkk) {
lk = LBj (k, grid);
nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
@@ -334,28 +352,45 @@ void pzgstrs2_omp
lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)];
}
- /* Loop through all the row blocks. */
- for (b = 0; b < nb; ++b) {
- /* assuming column cyclic distribution of data among threads */
- gb = usub[iukp];
- gb_col_cycle = gb / grid->npcol;
- nsupc = SuperSize (gb);
- iukp += UB_DESCRIPTOR;
+ /////////////////////new-test//////////////////////////
+ /* !! Taken from Carl/SuperLU_DIST_5.1.0/EXAMPLE/pdgstrf2_v3.c !! */
+
+ /* Master thread: set up pointers to each block in the row */
+ nb = usub[0];
+ iukp = BR_HEADER;
+ rukp = 0;
+
+ int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int));
+ int* blocks_value_pointers = blocks_index_pointers + nb;
+ int* nsupc_temp = blocks_value_pointers + nb;
+ for (b = 0; b < nb; b++) { /* set up pointers to each block */
+ blocks_index_pointers[b] = iukp + UB_DESCRIPTOR;
+ blocks_value_pointers[b] = rukp;
+ gb = usub[iukp];
+ rukp += usub[iukp+1];
+ nsupc = SuperSize( gb );
+ nsupc_temp[b] = nsupc;
+ iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */
+ }
+
+ // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
+ // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#pragma omp parallel for schedule(static) default(shared) \
+ private(b,j,iukp,rukp,segsize)
+ /* Loop through all the blocks in the row. */
+ for (b = 0; b < nb; ++b) {
+ iukp = blocks_index_pointers[b];
+ rukp = blocks_value_pointers[b];
/* Loop through all the segments in the block. */
- for (j = 0; j < nsupc; ++j) {
-#ifdef PI_DEBUG
- printf("segsize %d klst %d usub[%d] : %d",segsize,klst ,iukp,usub[iukp]);
-#endif
+ for (j = 0; j < nsupc_temp[b]; j++) {
segsize = klst - usub[iukp++];
- if (segsize) { /* Nonzero segment. */
- luptr = (knsupc - segsize) * (nsupr + 1);
+ if (segsize) {
+#pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30)
+ { /* Nonzero segment. */
+ int_t luptr = (knsupc - segsize) * (nsupr + 1);
+ //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr);
- /* if gb belongs to present thread then do the factorize */
- if ((gb_col_cycle + k_row_cycle + 1) % num_thread == thread_id) {
-#ifdef PI_DEBUG
- printf ("dtrsv param 4 %d param 6 %d\n", segsize, nsupr);
-#endif
#if defined (USE_VENDOR_BLAS)
ztrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
&uval[rukp], &incx, 1, 1, 1);
@@ -363,14 +398,22 @@ void pzgstrs2_omp
ztrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
&uval[rukp], &incx);
#endif
- }
+ } /* end task */
+ rukp += segsize;
+ stat->ops[FACT] += segsize * (segsize + 1);
+ } /* end if segsize > 0 */
+ } /* end for j in parallel ... */
+/* #pragma omp taskwait */
+ } /* end for b ... */
- if (thread_id == 0)
- stat->ops[FACT] += segsize * (segsize + 1); // master thread updated the stats
- rukp += segsize;
- }
- }
- } /* for b ... */
+ /* Deallocate memory */
+ SUPERLU_FREE(blocks_index_pointers);
+
+#if 0
+ //#ifdef USE_VTUNE
+ __itt_pause(); // stop VTune
+ __SSC_MARK(0x222); // stop SDE tracing
+#endif
} /* PZGSTRS2_omp */
diff --git a/SRC/sp_colorder.c b/SRC/sp_colorder.c
index 27cbf93..94db174 100644
--- a/SRC/sp_colorder.c
+++ b/SRC/sp_colorder.c
@@ -125,10 +125,9 @@ sp_colorder(superlu_dist_options_t *options, SuperMatrix *A, int_t *perm_c,
}
if ( options->Fact == DOFACT
- || options->Fact == SamePattern )
+ || options->Fact == SamePattern ) {
/* In this case, perm_r[] may be changed, etree(Pr*A + (Pr*A)')
may be changed, so need to recompute etree. */
- {
/* Factor A "from scratch" -- we also compute the etree, and
* make perm_c consistent with the postorder of the etree.
*/
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index 24386cb..08d1e8f 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -103,7 +103,10 @@ sp_ienv_dist(int_t ispec)
return 128;
#endif
- case 6: return (5);
+ case 6:
+ ttemp = getenv("FILL");
+ if ( ttemp ) return(atoi(ttemp));
+ else return (5);
case 7:
ttemp = getenv ("N_GEMM");
if (ttemp) return atoi (ttemp);
diff --git a/SRC/static_schedule.c b/SRC/static_schedule.c
index b653047..bc1933b 100644
--- a/SRC/static_schedule.c
+++ b/SRC/static_schedule.c
@@ -45,6 +45,14 @@ static_schedule(superlu_dist_options_t * options, int m, int n,
LUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat,
int_t *perm_c_supno, int_t *iperm_c_supno, int *info)
{
+/*
+ * Arguments
+ *
+ * perm_c_supno (output)
+ * perm_c_supno[k] = j means at the k-th step of elimination, the j-th
+ * panel is chosen.
+ *
+ */
int_t *xsup;
int_t i, ib, jb, lb, nlb, il, iu;
int_t Pc, Pr;
@@ -961,6 +969,8 @@ static_schedule(superlu_dist_options_t * options, int m, int n,
#if ( DEBUGlevel >= 1 )
print_memorylog(stat, "after static schedule");
+ check_perm_dist("perm_c_supno", nsupers, perm_c_supno);
+ check_perm_dist("iperm_c_supno", nsupers, iperm_c_supno);
#endif
return 0;
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 007fbe3..27b3487 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -161,7 +161,7 @@ typedef struct {
indices of A are translated into the relative
positions in the gathered x-vector.
This is re-used in repeated calls to pdgsmv() */
- /*int_t *xrow_to_proc; Xiaoye: can be removed */
+ int_t *xrow_to_proc; /* used by PDSLin */
} SOLVEstruct_t;
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 27c1bdf..b52b537 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -12,7 +12,7 @@ at the top-level directory.
* \brief Definitions which are precision-neutral
*
* <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* November 1, 2007
*
@@ -43,6 +43,12 @@ at the top-level directory.
#include <limits.h>
#include <string.h>
+/* Following is for vtune */
+#if 0
+#include <ittnotify.h>
+#define USE_VTUNE
+#endif
+
/*************************************************************************
* Constants
**************************************************************************/
@@ -57,9 +63,11 @@ at the top-level directory.
* Versions 4.x and earlier do not include a #define'd version numbers.
*/
#define SUPERLU_DIST_MAJOR_VERSION 5
-#define SUPERLU_DIST_MINOR_VERSION 1
-#define SUPERLU_DIST_PATCH_VERSION 3
+#define SUPERLU_DIST_MINOR_VERSION 2
+#define SUPERLU_DIST_PATCH_VERSION 2
+#define SUPERLU_DIST_RELEASE_DATE "October 24, 2017"
+#include "superlu_dist_config.h"
/* Define my integer size int_t */
#ifdef _CRAY
typedef short int_t;
@@ -703,6 +711,7 @@ extern void PStatFree(SuperLUStat_t *);
extern void PStatPrint(superlu_dist_options_t *, SuperLUStat_t *, gridinfo_t *);
extern void log_memory(long long, SuperLUStat_t *);
extern void print_memorylog(SuperLUStat_t *, char *);
+extern int superlu_dist_GetVersionNumber(int *, int *, int *);
/* Prototypes for parallel symbolic factorization */
extern float symbfact_dist
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
new file mode 100644
index 0000000..7cda561
--- /dev/null
+++ b/SRC/superlu_dist_config.h
@@ -0,0 +1,4 @@
+/* #define XSDK_INDEX_SIZE 64 */
+#if (XSDK_INDEX_SIZE == 64)
+#define _LONGINT 1
+#endif
diff --git a/SRC/superlu_dist_config.h.in b/SRC/superlu_dist_config.h.in
new file mode 100644
index 0000000..3fa100f
--- /dev/null
+++ b/SRC/superlu_dist_config.h.in
@@ -0,0 +1,9 @@
+/* superlu_dist_config.h.in */
+
+/* enable 64bit index mode */
+#cmakedefine XSDK_INDEX_SIZE @XSDK_INDEX_SIZE@
+
+#if (XSDK_INDEX_SIZE == 64)
+#define _LONGINT 1
+#endif
+
diff --git a/SRC/superlu_dist_version.c b/SRC/superlu_dist_version.c
new file mode 100644
index 0000000..c6c8759
--- /dev/null
+++ b/SRC/superlu_dist_version.c
@@ -0,0 +1,30 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/** @file superlu_dist_version.c
+ * \brief Gets the SuperLU_DIST's version information from the library.
+ *
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * October 13, 2017
+ *
+ */
+
+#include "superlu_defs.h"
+
+int superlu_dist_GetVersionNumber(int *major, int *minor, int *bugfix)
+{
+ if (major) *major = SUPERLU_DIST_MAJOR_VERSION;
+ if (minor) *minor = SUPERLU_DIST_MINOR_VERSION;
+ if (bugfix) *bugfix = SUPERLU_DIST_PATCH_VERSION;
+ return 0;
+}
+
+
diff --git a/SRC/superlu_enum_consts.h b/SRC/superlu_enum_consts.h
index 07fb1a4..cd592a5 100644
--- a/SRC/superlu_enum_consts.h
+++ b/SRC/superlu_enum_consts.h
@@ -67,6 +67,9 @@ typedef enum {
DIST, /* distribute matrix. */
FACT, /* perform LU factorization */
COMM, /* communication for factorization */
+ COMM_DIAG, /* Bcast diagonal block to process column */
+ COMM_RIGHT, /* communicate L panel */
+ COMM_DOWN, /* communicate U panel */
SOL_COMM,/* communication for solve */
RCOND, /* estimate reciprocal condition number */
SOLVE, /* forward and back solves */
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index dc918b2..10999c4 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -161,7 +161,7 @@ typedef struct {
indices of A are translated into the relative
positions in the gathered x-vector.
This is re-used in repeated calls to pzgsmv() */
- /*int_t *xrow_to_proc; Xiaoye: can be removed */
+ int_t *xrow_to_proc; /* used by PDSLin */
} SOLVEstruct_t;
diff --git a/SRC/util.c b/SRC/util.c
index 2ae3ccd..75911a4 100644
--- a/SRC/util.c
+++ b/SRC/util.c
@@ -323,7 +323,7 @@ void set_default_options_dist(superlu_dist_options_t *options)
options->ParSymbFact = NO;
options->ColPerm = METIS_AT_PLUS_A;
options->RowPerm = LargeDiag;
- options->ReplaceTinyPivot = YES;
+ options->ReplaceTinyPivot = NO;
options->IterRefine = SLU_DOUBLE;
options->Trans = NOTRANS;
options->SolveInitialized = NO;
@@ -364,9 +364,10 @@ void print_sp_ienv_dist(superlu_dist_options_t *options)
printf("**************************************************\n");
printf(".. blocking parameters from sp_ienv():\n");
- printf("** relaxation : " IFMT "\n", sp_ienv_dist(2));
- printf("** max supernode : " IFMT "\n", sp_ienv_dist(3));
- printf("** estimated fill ratio : " IFMT "\n", sp_ienv_dist(6));
+ printf("** relaxation : " IFMT "\n", sp_ienv_dist(2));
+ printf("** max supernode : " IFMT "\n", sp_ienv_dist(3));
+ printf("** estimated fill ratio : " IFMT "\n", sp_ienv_dist(6));
+ printf("** min GEMM dimension for GPU : " IFMT "\n", sp_ienv_dist(7));
printf("**************************************************\n");
}
@@ -882,23 +883,23 @@ void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2)
int_t TEMP;
IGAP = N / 2;
while (IGAP > 0) {
- for (I = IGAP; I < N; I++) {
- J = I - IGAP;
- while (J >= 0) {
- if (ARRAY1[J] > ARRAY1[J + IGAP]) {
- TEMP = ARRAY1[J];
- ARRAY1[J] = ARRAY1[J + IGAP];
- ARRAY1[J + IGAP] = TEMP;
- TEMP = ARRAY2[J];
- ARRAY2[J] = ARRAY2[J + IGAP];
- ARRAY2[J + IGAP] = TEMP;
- J = J - IGAP;
- } else {
- break;
+ for (I = IGAP; I < N; I++) {
+ J = I - IGAP;
+ while (J >= 0) {
+ if (ARRAY1[J] > ARRAY1[J + IGAP]) {
+ TEMP = ARRAY1[J];
+ ARRAY1[J] = ARRAY1[J + IGAP];
+ ARRAY1[J + IGAP] = TEMP;
+ TEMP = ARRAY2[J];
+ ARRAY2[J] = ARRAY2[J + IGAP];
+ ARRAY2[J + IGAP] = TEMP;
+ J = J - IGAP;
+ } else {
+ break;
+ }
+ }
}
- }
- }
- IGAP = IGAP / 2;
+ IGAP = IGAP / 2;
}
}
@@ -908,40 +909,36 @@ void isort1(int_t N, int_t *ARRAY)
/*
* Purpose
* =======
- * Use quick sort algorithm to sort ARRAY1 and ARRAY2 in the increasing
- * order of ARRAY1.
+ * Use quick sort algorithm to sort ARRAY in increasing order.
*
* Arguments
* =========
* N (input) INTEGER
* On entry, specifies the size of the arrays.
*
- * ARRAY1 (input/output) DOUBLE PRECISION ARRAY of LENGTH N
+ * ARRAY (input/output) DOUBLE PRECISION ARRAY of LENGTH N
* On entry, contains the array to be sorted.
* On exit, contains the sorted array.
*
- * ARRAY2 (input/output) DOUBLE PRECISION ARRAY of LENGTH N
- * On entry, contains the array to be sorted.
- * On exit, contains the sorted array.
*/
int_t IGAP, I, J;
int_t TEMP;
IGAP = N / 2;
while (IGAP > 0) {
- for (I = IGAP; I < N; I++) {
- J = I - IGAP;
- while (J >= 0) {
- if (ARRAY[J] > ARRAY[J + IGAP]) {
- TEMP = ARRAY[J];
- ARRAY[J] = ARRAY[J + IGAP];
- ARRAY[J + IGAP] = TEMP;
- J = J - IGAP;
- } else {
- break;
+ for (I = IGAP; I < N; I++) {
+ J = I - IGAP;
+ while (J >= 0) {
+ if (ARRAY[J] > ARRAY[J + IGAP]) {
+ TEMP = ARRAY[J];
+ ARRAY[J] = ARRAY[J + IGAP];
+ ARRAY[J + IGAP] = TEMP;
+ J = J - IGAP;
+ } else {
+ break;
+ }
+ }
}
- }
- }
- IGAP = IGAP / 2;
+ IGAP = IGAP / 2;
}
}
@@ -1060,7 +1057,7 @@ arrive_at_ublock (int_t j, /* j-th block in a U panel */
int_t * nsupc,/* supernode size of destination block */
int_t iukp0, /* input : search starting point */
int_t rukp0,
- int_t * usub, /* usub scripts */
+ int_t * usub, /* U subscripts */
int_t * perm_u, /* permutation vector from static schedule */
int_t * xsup, /* for SuperSize and LBj */
gridinfo_t * grid)
@@ -1069,19 +1066,26 @@ arrive_at_ublock (int_t j, /* j-th block in a U panel */
*iukp = iukp0; /* point to the first block in index[] */
*rukp = rukp0; /* point to the start of nzval[] */
+ /* Sherry -- why always starts from 0 ?? Can continue at
+ the column left from last search. */
+ /* Caveat: There is a permutation perm_u involved for j. That's why
+ the search need to restart from 0. */
#ifdef ISORT
for (jj = 0; jj < perm_u[j]; jj++) /* perm_u[j] == j */
#else
for (jj = 0; jj < perm_u[2 * j + 1]; jj++) /* == j */
#endif
{
+ /* Reinitialize the pointers to the beginning of the
+ * k-th column/row of L/U factors.
+ * usub[] - index array for panel U(k,:)
+ */
// printf("iukp %d \n",*iukp );
*jb = usub[*iukp]; /* Global block number of block U(k,j). */
// printf("jb %d \n",*jb );
*nsupc = SuperSize (*jb);
// printf("nsupc %d \n",*nsupc );
*iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
-
*rukp += usub[*iukp - 1]; /* Jump # of nonzeros in block U(k,jj);
Move to block U(k,jj+1) in nzval[] */
*iukp += *nsupc;
diff --git a/SRC/zSchCompUdt-2Ddynamic.c b/SRC/zSchCompUdt-2Ddynamic.c
index 46fa613..0468f5f 100644
--- a/SRC/zSchCompUdt-2Ddynamic.c
+++ b/SRC/zSchCompUdt-2Ddynamic.c
@@ -15,29 +15,46 @@ at the top-level directory.
* Uses 2D partitioning for the scatter phase.
*
* <pre>
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 1, 2014
*
+ * Modified: September 14, 2017
+ * - First gather U-panel, then depending on "ldu" (excluding leading zeros),
+ * gather only trailing columns of the L-panel corresponding to the nonzero
+ * of U-rows.
+ * - Padding zeros for nice dimensions of GEMM.
+ *
*/
#define SCHEDULE_STRATEGY guided
-double tt_start;
-double tt_end;
+
+/*
+ * Buffers:
+ * [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
+ * (A matrix in C := A*B )
+ * bigU : stores the U-panel (B matrix in C := A*B)
+ * bigV : stores the block GEMM result (C matrix in C := A*B)
+ */
if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
int temp_nbrow; /* nonzero rows in current block L(i,k) */
lptr = lptr0;
luptr = luptr0;
- /**
+ int Lnbrow, Rnbrow; /* number of nonzero rows in look-ahead window,
+ and remaining part. */
+
+ /*******************************************************************
* Seperating L blocks into the top part within look-ahead window
* and the remaining ones.
- */
+ *******************************************************************/
+
int lookAheadBlk=0, RemainBlk=0;
tt_start = SuperLU_timer_();
+ /* Sherry -- can this loop be threaded?? */
/* Loop through all blocks in L(:,k) to set up pointers to the start
* of each block in the data arrays.
* - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
@@ -46,36 +63,36 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
* - (ditto Remain_Info[i])
*/
for (int i = 0; i < nlb; ++i) {
- ib = lsub[lptr]; /* block number of L(i,k). */
+ ib = lsub[lptr]; /* Block number of L(i,k). */
temp_nbrow = lsub[lptr+1]; /* Number of full rows. */
int look_up_flag = 1; /* assume ib is outside look-up window */
- for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers ); ++j)
- {
- if(ib == perm_c_supno[j]) {
- look_up_flag=0; /* flag ib is within look-up window */
- break; /* Sherry -- can exit the loop?? */
+ for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
+ ++j) {
+ if ( ib == perm_c_supno[j] ) {
+ look_up_flag = 0; /* flag ib within look-up window */
+ break; /* Sherry -- can exit the loop?? */
}
- }
+ }
- if( look_up_flag == 0 ) { /* ib is within look up window */
+ if ( look_up_flag == 0 ) { /* ib is within look-up window */
if (lookAheadBlk==0) {
lookAheadFullRow[lookAheadBlk] = temp_nbrow;
} else {
- lookAheadFullRow[lookAheadBlk] = temp_nbrow+lookAheadFullRow[lookAheadBlk-1];
+ lookAheadFullRow[lookAheadBlk] =
+ temp_nbrow + lookAheadFullRow[lookAheadBlk-1];
}
lookAheadStRow[lookAheadBlk] = cum_nrow;
lookAhead_lptr[lookAheadBlk] = lptr;
lookAhead_ib[lookAheadBlk] = ib;
lookAheadBlk++;
- } else { /* ib is not in look up window */
-
- if (RemainBlk==0) {
+ } else { /* ib is not in look-up window */
+ if ( RemainBlk==0 ) {
Remain_info[RemainBlk].FullRow = temp_nbrow;
} else {
- Remain_info[RemainBlk].FullRow = temp_nbrow+Remain_info[RemainBlk-1].FullRow;
+ Remain_info[RemainBlk].FullRow =
+ temp_nbrow + Remain_info[RemainBlk-1].FullRow;
}
-
RemainStRow[RemainBlk] = cum_nrow;
// Remain_lptr[RemainBlk] = lptr;
Remain_info[RemainBlk].lptr = lptr;
@@ -84,139 +101,105 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
RemainBlk++;
}
- cum_nrow +=temp_nbrow;
+ cum_nrow += temp_nbrow;
lptr += LB_DESCRIPTOR; /* Skip descriptor. */
lptr += temp_nbrow; /* Move to next block */
luptr += temp_nbrow;
- } /* for i ... all blocks in L(:,k) */
+ } /* for i ... set up pointers for all blocks in L(:,k) */
lptr = lptr0;
luptr = luptr0;
- /* leading dimension of L buffer */
-#if 0
- int LDlookAhead_LBuff = lookAheadFullRow[lookAheadBlk-1]; /* may go negative.*/
-#else /* Piyush fix */
- int LDlookAhead_LBuff = lookAheadBlk==0? 0 :lookAheadFullRow[lookAheadBlk-1];
-#endif
-
- /* Loop through the look-ahead blocks to copy Lval into the buffer */
-#ifdef __OPENMP
- /* #pragma omp parallel for -- why not?? Sherry */
-#endif
- for (int i = 0; i < lookAheadBlk; ++i) {
- int StRowDest = 0;
- int temp_nbrow;
- if (i==0) {
- temp_nbrow = lookAheadFullRow[0];
- } else {
- StRowDest = lookAheadFullRow[i-1];
- temp_nbrow = lookAheadFullRow[i]-lookAheadFullRow[i-1];
- }
-
- int StRowSource=lookAheadStRow[i];
-
- /* Now copying the matrix*/
- // #pragma omp parallel for (gives slow down)
- for (int j = 0; j < knsupc; ++j) {
- memcpy(&lookAhead_L_buff[StRowDest+j*LDlookAhead_LBuff],
- &lusup[luptr+j*nsupr+StRowSource],
- temp_nbrow * sizeof(doublecomplex) );
- }
- }
-
- int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
-
- /* Loop through the remaining blocks to copy Lval into the buffer */
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
- for (int i = 0; i < RemainBlk; ++i) {
- int StRowDest = 0;
- int temp_nbrow;
- if (i==0) {
- temp_nbrow = Remain_info[0].FullRow;
- } else {
- StRowDest = Remain_info[i-1].FullRow;
- temp_nbrow = Remain_info[i].FullRow-Remain_info[i-1].FullRow;
- }
-
- int StRowSource=RemainStRow[i];
-
- /* Now copying the matrix*/
- // #pragma omp parallel for (gives slow down)
- for (int j = 0; j < knsupc; ++j) {
- // printf("StRowDest %d LDRemain_LBuff %d StRowSource %d \n", StRowDest ,LDRemain_LBuff ,StRowSource );
- memcpy(&Remain_L_buff[StRowDest+j*LDRemain_LBuff],
- &lusup[luptr+j*nsupr+StRowSource],
- temp_nbrow * sizeof(doublecomplex) );
- }
- } /* parallel for i ... */
-
-#if ( PRNTlevel>=1 )
- tt_end = SuperLU_timer_();
- GatherLTimer += tt_end - tt_start;
-#endif
-#if 0
- LookAheadRowSepMOP += 2*knsupc*(lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow );
-#else
- int_t lnbrow, rnbrow; /* number of nonzero rows in look-ahead window
- or remaining part. */
- lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
- rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
- nbrow = lnbrow + rnbrow; /* total number of rows in L */
+ /* leading dimension of L look-ahead buffer, same as Lnbrow */
+ //int LDlookAhead_LBuff = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+ Lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
+ /* leading dimension of L remaining buffer, same as Rnbrow */
+ //int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+ Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+ /* assert( cum_nrow == (LDlookAhead_LBuff + LDRemain_LBuff) );*/
+ /* Piyush fix */
+ //int LDlookAhead_LBuff = lookAheadBlk==0? 0 : lookAheadFullRow[lookAheadBlk-1];
+
+ nbrow = Lnbrow + Rnbrow; /* total number of rows in L */
LookAheadRowSepMOP += 2*knsupc*(nbrow);
-#endif
-
- /**********************
- * Gather U blocks *
- **********************/
+ /***********************************************
+ * Gather U blocks (AFTER LOOK-AHEAD WINDOW) *
+ ***********************************************/
tt_start = SuperLU_timer_();
-#if 0
- nbrow = lookAheadFullRow[lookAheadBlk-1]+Remain_info[RemainBlk-1].FullRow;
-#endif
if ( nbrow > 0 ) { /* L(:,k) is not empty */
/*
* Counting U blocks
*/
- ncols = 0; /* total number of nonzero columns in U(k,:) */
- ldu = 0;
- full = 1; /* flag the U block is indeed 'full', containing segments
- of same length. No need padding 0 */
- int temp_ncols=0;
+ ldu = 0; /* Calculate ldu for U(k,:) after look-ahead window. */
+ ncols = 0; /* Total number of nonzero columns in U(k,:) */
+ int temp_ncols = 0;
- /* Loop through all blocks in U(k,:) to set up pointers to the start
+#if 0
+ /* jj0 contains the look-ahead window that was updated in
+ dlook_ahead_update.c. Now the search can continue from that point,
+ not to start from block 0. */
+ iukp = iukp0; /* point to the first block in index[] */
+ rukp = rukp0; /* point to the start of nzval[] */
+#else
+ /* Save pointers at location right after look-ahead window
+ for later restart. */
+ iukp0 = iukp;
+ rukp0 = rukp;
+#endif
+
+ /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
+
+ /*
+ * Loop through all blocks in U(k,:) to set up pointers to the start
* of each block in the data arrays, store them in Ublock_info[j]
* for block U(k,j).
*/
- for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+ for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
temp_ncols = 0;
+#if 0
+ /* Sherry - can remove following call, since perm_u == Identity */
arrive_at_ublock(
j, &iukp, &rukp, &jb, &ljb, &nsupc,
iukp0, rukp0, usub, perm_u, xsup, grid
);
+#else
+ jb = usub[iukp];
+ /* ljb = LBj (jb, grid); Local block number of U(k,j). */
+ nsupc = SuperSize(jb);
+ iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
Ublock_info[j].iukp = iukp;
Ublock_info[j].rukp = rukp;
Ublock_info[j].jb = jb;
-
+
+ /* if ( iam==0 )
+ printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
+ "Ublock_info[j].jb %d, nsupc %d\n",
+ j, Ublock_info[j].iukp, Ublock_info[j].rukp,
+ Ublock_info[j].jb, nsupc); */
+
/* Prepare to call GEMM. */
jj = iukp;
-
for (; jj < iukp+nsupc; ++jj) {
segsize = klst - usub[jj];
if ( segsize ) {
++temp_ncols;
- if ( segsize != ldu ) full = 0; /* need padding 0 */
if ( segsize > ldu ) ldu = segsize;
}
}
Ublock_info[j].full_u_cols = temp_ncols;
ncols += temp_ncols;
- }
+#if 1
+ /* Jump number of nonzeros in block U(k,jj);
+ Move to block U(k,j+1) in nzval[] array. */
+ rukp += usub[iukp - 1];
+ iukp += nsupc;
+#endif
+ } /* end for j ... compute ldu & ncols */
/* Now doing prefix sum on full_u_cols.
* After this, full_u_cols is the number of nonzero columns
@@ -226,101 +209,239 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
}
+ /* Padding zeros to make {m,n,k} multiple of vector length. */
+ jj = 8; //n;
+ if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
+ gemm_m_pad = Rnbrow + (Rnbrow % GEMM_PADLEN);
+ gemm_n_pad = ncols + (ncols % GEMM_PADLEN);
+ //gemm_n_pad = ncols;
+ //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
+ gemm_k_pad = ldu;
+
+ for (i = Rnbrow; i < gemm_m_pad; ++i) // padding A matrix
+ for (j = 0; j < gemm_k_pad; ++j)
+ Remain_L_buff[i + j*gemm_m_pad] = zero;
+ for (i = 0; i < Rnbrow; ++i)
+ for (j = ldu; j < gemm_k_pad; ++j)
+ Remain_L_buff[i + j*gemm_m_pad] = zero;
+ for (i = ldu; i < gemm_k_pad; ++i) // padding B matrix
+ for (j = 0; j < gemm_n_pad; ++j)
+ bigU[i + j*gemm_k_pad] = zero;
+ for (i = 0; i < ldu; ++i)
+ for (j = ncols; j < gemm_n_pad; ++j)
+ bigU[i + j*gemm_k_pad] = zero;
+ } else {
+ gemm_m_pad = Rnbrow;
+ gemm_n_pad = ncols;
+ gemm_k_pad = ldu;
+ }
+
tempu = bigU; /* buffer the entire row block U(k,:) */
/* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
-#ifdef _OPENMP
-#pragma omp parallel for private(j,iukp,rukp,tempu, jb, nsupc,ljb,segsize,\
- lead_zero, jj, i) \
- default (shared) schedule(SCHEDULE_STRATEGY)
+#ifdef _OPENMP
+#pragma omp parallel for firstprivate(iukp, rukp) \
+ private(j,tempu, jb, nsupc,ljb,segsize, lead_zero, jj, i) \
+ default (shared) schedule(SCHEDULE_STRATEGY)
#endif
- for (j = jj0; j < nub; ++j) { /* jj0 was set to 0 */
+ for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
- if(j==jj0) tempu = bigU;
- else tempu = bigU + ldu*Ublock_info[j-1].full_u_cols;
+ if (j==jj0) tempu = bigU;
+ //else tempu = bigU + ldu * Ublock_info[j-1].full_u_cols;
+ else tempu = bigU + gemm_k_pad * Ublock_info[j-1].full_u_cols;
- /* == processing each of the remaining columns == */
+ /* == processing each of the remaining columns in parallel == */
+#if 0
+ /* Sherry - can remove following call, since perm_u == Identity */
arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
iukp0, rukp0, usub,perm_u, xsup, grid);
-
- /* Copy from U(k,:) to tempu[], padding zeros. */
+#else
+ iukp = Ublock_info[j].iukp;
+ rukp = Ublock_info[j].rukp;
+ jb = Ublock_info[j].jb;
+ nsupc = SuperSize (jb );
+#endif
+ /* Copy from U(k,j) to tempu[], padding zeros. */
for (jj = iukp; jj < iukp+nsupc; ++jj) {
segsize = klst - usub[jj];
if ( segsize ) {
lead_zero = ldu - segsize;
for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
- tempu += lead_zero;
- for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
+ //tempu += lead_zero;
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (i=0; i<segsize; ++i) tempu[i+lead_zero] = uval[rukp+i];
+
rukp += segsize;
- tempu += segsize;
+#if 0
+ tempu += segsize;
+#else
+ tempu += gemm_k_pad;
+#endif
}
- }
+ }
+#if 0
+ rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+#endif
+ } /* parallel for j = jj0 .. nub */
+
+#if 0
+ if (ldu==0) printf("[%d] .. k0 %d, before updating: ldu %d, Lnbrow %d, Rnbrow %d, ncols %d\n",iam,k0,ldu,Lnbrow,Rnbrow, ncols);
+ fflush(stdout);
+#endif
+ } /* end if (nbrow>0), end gather U blocks */
+
+ GatherUTimer += SuperLU_timer_() - tt_start;
+ GatherMOP += 2*ldu*ncols;
+ int jj_cpu = nub; /* limit between CPU and GPU */
+ int thread_id;
+ /*tempv = bigV;*/
- rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
- } /* parallel for j:jjj_st..jjj */
+ /**********************
+ * Gather L blocks *
+ **********************/
+ tt_start = SuperLU_timer_();
- tempu = bigU; /* setting to the start of padded U(k,:) */
+ /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(j,jj,tempu,tempv) default (shared)
+#endif
+ for (int i = 0; i < lookAheadBlk; ++i) {
+ int StRowDest, temp_nbrow;
+ if ( i==0 ) {
+ StRowDest = 0;
+ temp_nbrow = lookAheadFullRow[0];
+ } else {
+ StRowDest = lookAheadFullRow[i-1];
+ temp_nbrow = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+ }
+
+ int StRowSource = lookAheadStRow[i];
+
+ /* Now copying one block into L lookahead buffer */
+ /* #pragma omp parallel for (gives slow down) */
+ // for (int j = 0; j < knsupc; ++j) {
+ for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
+ corresponding to zero U rows */
+#if 1
+ /* Better let compiler generate memcpy or vectorized code. */
+ //tempu = &lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff];
+ //tempu = &lookAhead_L_buff[StRowDest + j * Lnbrow];
+ tempu = &lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow];
+ tempv = &lusup[luptr+j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+ //memcpy(&lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff],
+ memcpy(&lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow],
+ &lusup[luptr+j*nsupr + StRowSource],
+ temp_nbrow * sizeof(doublecomplex) );
+#endif
+ } /* end for j ... */
+ } /* parallel for i ... gather Lval blocks from lookahead window */
+
+ /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(i,j,jj,tempu,tempv) default (shared) \
+ schedule(SCHEDULE_STRATEGY)
+#endif
+ for (int i = 0; i < RemainBlk; ++i) {
+ int StRowDest, temp_nbrow;
+ if ( i==0 ) {
+ StRowDest = 0;
+ temp_nbrow = Remain_info[0].FullRow;
+ } else {
+ StRowDest = Remain_info[i-1].FullRow;
+ temp_nbrow = Remain_info[i].FullRow - Remain_info[i-1].FullRow;
+ }
- } /* end if (nbrow>0) */
+ int StRowSource = RemainStRow[i];
-#if ( PRNTlevel>=1 )
- GatherUTimer += SuperLU_timer_() - tt_start;
+ /* Now copying a block into L remaining buffer */
+ // #pragma omp parallel for (gives slow down)
+ // for (int j = 0; j < knsupc; ++j) {
+ for (int j = knsupc-ldu; j < knsupc; ++j) {
+ // printf("StRowDest %d Rnbrow %d StRowSource %d \n", StRowDest,Rnbrow ,StRowSource);
+#if 1
+ /* Better let compiler generate memcpy or vectorized code. */
+ //tempu = &Remain_L_buff[StRowDest + j*LDRemain_LBuff];
+ //tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * Rnbrow];
+ tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad];
+ tempv = &lusup[luptr + j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
#endif
- GatherMOP += 2*ldu*ncols;
+ for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+ //memcpy(&Remain_L_buff[StRowDest + j*LDRemain_LBuff],
+ memcpy(&Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad],
+ &lusup[luptr+j*nsupr + StRowSource],
+ temp_nbrow * sizeof(doublecomplex) );
+#endif
+ } /* end for j ... */
+ } /* parallel for i ... copy Lval into the remaining buffer */
- int Lnbrow = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
- int Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
- int jj_cpu=nub; /*limit between CPU and GPU */
- int thread_id;
- tempv = bigV;
+ tt_end = SuperLU_timer_();
+ GatherLTimer += tt_end - tt_start;
- /**************************************
- * Perform GEMM followed by Scatter *
- **************************************/
- if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
- /* Perform a large GEMM call */
- ncols = Ublock_info[nub-1].full_u_cols;
- schur_flop_counter += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
- stat->ops[FACT] += 2 * (double)Lnbrow * (double)ldu * (double)ncols;
+ /*************************************************************************
+ * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
+ *************************************************************************/
+ tempu = bigU; /* setting to the start of padded U(k,:) */
+
+ if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+ /***************************************************************
+ * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
+ ***************************************************************/
+
+ /* Count flops for total GEMM calls */
+ ncols = Ublock_info[nub-1].full_u_cols;
+ flops_t flps = 8.0 * (flops_t)Lnbrow * ldu * ncols;
+ LookAheadScatterMOP += 3 * Lnbrow * ncols; /* scatter-add */
+ schur_flop_counter += flps;
+ stat->ops[FACT] += flps;
+ LookAheadGEMMFlOp += flps;
- /***************************************************************
- * Updating look-ahead blocks in both L and U look-ahead windows.
- ***************************************************************/
#ifdef _OPENMP
-#pragma omp parallel default (shared) private(thread_id,tt_start,tt_end)
- {
- thread_id = omp_get_thread_num();
+#pragma omp parallel default (shared) private(thread_id)
+ {
+ thread_id = omp_get_thread_num();
- /* Ideally, should organize the loop as:
- for (j = 0; j < nub; ++j) {
- for (lb = 0; lb < lookAheadBlk; ++lb) {
- L(lb,k) X U(k,j) -> tempv[]
- }
- }
- But now, we use collapsed loop to achieve more parallelism.
- Total number of block updates is:
- (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
- */
+ /* Ideally, should organize the loop as:
+ for (j = 0; j < nub; ++j) {
+ for (lb = 0; lb < lookAheadBlk; ++lb) {
+ L(lb,k) X U(k,j) -> tempv[]
+ }
+ }
+ But now, we use collapsed loop to achieve more parallelism.
+ Total number of block updates is:
+ (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+ */
+
+ int i = sizeof(int);
+ int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+ int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
#pragma omp for \
- private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow) \
+ private (nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow) \
schedule(dynamic)
#else /* not use _OPENMP */
- thread_id = 0;
+ thread_id = 0;
+ int* indirect_thread = indirect;
+ int* indirect2_thread = indirect2;
#endif
- /* Each thread is assigned one loop index ij, responsible for
- block update L(lb,k) * U(k,j) -> tempv[]. */
- for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
- if ( thread_id == 0 ) tt_start = SuperLU_timer_();
-
- int j = ij/lookAheadBlk + jj0; /* jj0 was set to 0 */
+ /* Each thread is assigned one loop index ij, responsible for
+ block update L(lb,k) * U(k,j) -> tempv[]. */
+ for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+ /* jj0 starts after look-ahead window. */
+ int j = ij/lookAheadBlk + jj0;
int lb = ij%lookAheadBlk;
- int* indirect_thread = indirect + ldt*thread_id;
- int* indirect2_thread = indirect2 + ldt*thread_id;
- doublecomplex* tempv1 = bigV + thread_id*ldt*ldt;
-
/* Getting U block U(k,j) information */
/* unsigned long long ut_start, ut_end; */
int_t rukp = Ublock_info[j].rukp;
@@ -329,8 +450,8 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
int nsupc = SuperSize(jb);
int ljb = LBj (jb, grid); /* destination column block */
int st_col;
- int ncols;
- if ( j>jj0 ) { /* jj0 was set to 0 */
+ int ncols; /* Local variable counts only columns in the block */
+ if ( j > jj0 ) { /* jj0 starts after look-ahead window. */
ncols = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
st_col = Ublock_info[j-1].full_u_cols;
} else {
@@ -345,7 +466,16 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
lptr += LB_DESCRIPTOR;
int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
+ /* Block-by-block GEMM in look-ahead window */
+#if 0
+ i = sizeof(doublecomplex);
+ doublecomplex* tempv1 = bigV + thread_id * (ldt*ldt + CACHELINE/i);
+#else
+ doublecomplex* tempv1 = bigV + thread_id * (ldt*ldt);
+#endif
+
#if ( PRNTlevel>= 1)
+ if (thread_id == 0) tt_start = SuperLU_timer_();
gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
@@ -353,14 +483,17 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
#if defined (USE_VENDOR_BLAS)
zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
- &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
- &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+ //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+ &lookAhead_L_buff[cum_nrow], &Lnbrow,
+ &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
#else
zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
- &lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
- &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+ //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+ &lookAhead_L_buff[cum_nrow], &Lnbrow,
+ &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
#endif
-#if ( PRNTlevel>=1 )
+
+#if (PRNTlevel>=1 )
if (thread_id == 0) {
tt_end = SuperLU_timer_();
LookAheadGEMMTimer += tt_end - tt_start;
@@ -378,6 +511,11 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
grid
);
} else {
+#if 0
+ //#ifdef USE_VTUNE
+ __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+ __itt_resume(); // start VTune, again use 2 underscores
+#endif
zscatter_l (
ib, ljb,
nsupc, iukp, xsup,
@@ -388,137 +526,187 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
Lrowind_bc_ptr, Lnzval_bc_ptr,
grid
);
+#if 0
+ //#ifdef USE_VTUNE
+ __itt_pause(); // stop VTune
+ __SSC_MARK(0x222); // stop SDE tracing
+#endif
}
#if ( PRNTlevel>=1 )
- if (thread_id == 0)
+ if (thread_id == 0)
LookAheadScatterTimer += SuperLU_timer_() - tt_start;
#endif
- } /* end omp for ij = ... */
+ } /* end omp for ij = ... */
+
#ifdef _OPENMP
- } /* end omp parallel */
+ } /* end omp parallel */
#endif
- LookAheadGEMMFlOp += 2*(double)Lnbrow * (double)ldu * (double)ncols;
- stat->ops[FACT] += 2*(double)Lnbrow * (double)ldu * (double)ncols;
- LookAheadScatterMOP += 3*Lnbrow*ncols;
- } /* end if Lnbrow < ... */
-
+ } /* end if Lnbrow>0 ... look-ahead GEMM and scatter */
+
/***************************************************************
* Updating remaining rows and columns on CPU.
***************************************************************/
- Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
- ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+ ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+ if ( Rnbrow>0 && ldu>0 ) { /* There are still blocks remaining ... */
+ double flps = 8.0 * (double)Rnbrow * ldu * ncols;
+ schur_flop_counter += flps;
+ stat->ops[FACT] += flps;
- schur_flop_counter += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
- stat->ops[FACT] += 2 * (double)Rnbrow * (double)ldu * (double)ncols;
+#if ( PRNTlevel>=1 )
+ RemainGEMM_flops += flps;
+ gemm_max_m = SUPERLU_MAX(gemm_max_m, Rnbrow);
+ gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+ gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+ tt_start = SuperLU_timer_();
+ /* printf("[%d] .. k0 %d, before large GEMM: %d-%d-%d, RemainBlk %d\n",
+ iam, k0,Rnbrow,ldu,ncols,RemainBlk); fflush(stdout);
+ assert( Rnbrow*ncols < bigv_size ); */
+#endif
+ /* calling aggregated large GEMM, result stored in bigV[]. */
+#if defined (USE_VENDOR_BLAS)
+ //zgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+ zgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+ //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+ &Remain_L_buff[0], &gemm_m_pad,
+ &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad, 1, 1);
+#else
+ //zgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+ zgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+ //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+ &Remain_L_buff[0], &gemm_m_pad,
+ &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad);
+#endif
+#if ( PRNTlevel>=1 )
+ tt_end = SuperLU_timer_();
+ RemainGEMMTimer += tt_end - tt_start;
+#if ( PROFlevel>=1 )
+ //fprintf(fgemm, "%8d%8d%8d %16.8e\n", Rnbrow, ncols, ldu,
+ // (tt_end - tt_start)*1e6); // time in microsecond
+ //fflush(fgemm);
+ gemm_stats[gemm_count].m = Rnbrow;
+ gemm_stats[gemm_count].n = ncols;
+ gemm_stats[gemm_count].k = ldu;
+ gemm_stats[gemm_count++].microseconds = (tt_end - tt_start) * 1e6;
+#endif
+ tt_start = SuperLU_timer_();
+#endif
+
+#ifdef USE_VTUNE
+ __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+ __itt_resume(); // start VTune, again use 2 underscores
+#endif
+
+ /* Scatter into destination block-by-block. */
#ifdef _OPENMP
-#pragma omp parallel default(shared) private(thread_id,tt_start,tt_end)
- {
- thread_id = omp_get_thread_num();
+#pragma omp parallel default(shared) private(thread_id)
+ {
+ thread_id = omp_get_thread_num();
- /* Ideally, should organize the loop as:
+ /* Ideally, should organize the loop as:
for (j = 0; j < jj_cpu; ++j) {
- for (lb = 0; lb < RemainBlk; ++lb) {
+ for (lb = 0; lb < RemainBlk; ++lb) {
L(lb,k) X U(k,j) -> tempv[]
}
}
- But now, we use collapsed loop to achieve more parallelism.
- Total number of block updates is:
- (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
- */
+ But now, we use collapsed loop to achieve more parallelism.
+ Total number of block updates is:
+ (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+ */
+
+ int i = sizeof(int);
+ int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+ int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
#pragma omp for \
- private (j,i,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow) \
+ private (j,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow) \
schedule(dynamic)
#else /* not use _OPENMP */
- thread_id = 0;
-#endif
- /* Each thread is assigned one loop index ij, responsible for
- block update L(lb,k) * U(k,j) -> tempv[]. */
- for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) { /* jj_cpu := nub */
- int j = ij / RemainBlk + jj0;
- int lb = ij % RemainBlk;
-
- int* indirect_thread = indirect + ldt*thread_id;
- int* indirect2_thread = indirect2 + ldt*thread_id;
- doublecomplex* tempv1 = bigV + thread_id*ldt*ldt;
-
- /* Getting U block U(k,j) information */
- /* unsigned long long ut_start, ut_end; */
- int_t rukp = Ublock_info[j].rukp;
- int_t iukp = Ublock_info[j].iukp;
- int jb = Ublock_info[j].jb;
- int nsupc = SuperSize(jb);
- int ljb = LBj (jb, grid);
- int st_col;
- int ncols;
- if ( j>jj0 ) {
- ncols = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
- st_col = Ublock_info[j-1].full_u_cols;
- } else {
- ncols = Ublock_info[j].full_u_cols;
- st_col = 0;
- }
-
- /* Getting L block L(i,k) information */
- int_t lptr = Remain_info[lb].lptr;
- int ib = Remain_info[lb].ib;
- int temp_nbrow = lsub[lptr+1];
- lptr += LB_DESCRIPTOR;
- int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
-
+ thread_id = 0;
+ int* indirect_thread = indirect;
+ int* indirect2_thread = indirect2;
+#endif
+ /* Each thread is assigned one loop index ij, responsible for
+ block update L(lb,k) * U(k,j) -> tempv[]. */
+ for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
+ /* jj_cpu := nub, jj0 starts after look-ahead window. */
+ int j = ij / RemainBlk + jj0; /* j-th block in U panel */
+ int lb = ij % RemainBlk; /* lb-th block in L panel */
+
+ /* Getting U block U(k,j) information */
+ /* unsigned long long ut_start, ut_end; */
+ int_t rukp = Ublock_info[j].rukp;
+ int_t iukp = Ublock_info[j].iukp;
+ int jb = Ublock_info[j].jb;
+ int nsupc = SuperSize(jb);
+ int ljb = LBj (jb, grid);
+ int st_col;
+ int ncols;
+ if ( j>jj0 ) {
+ ncols = Ublock_info[j].full_u_cols - Ublock_info[j-1].full_u_cols;
+ st_col = Ublock_info[j-1].full_u_cols;
+ } else {
+ ncols = Ublock_info[j].full_u_cols;
+ st_col = 0;
+ }
+
+ /* Getting L block L(i,k) information */
+ int_t lptr = Remain_info[lb].lptr;
+ int ib = Remain_info[lb].ib;
+ int temp_nbrow = lsub[lptr+1];
+ lptr += LB_DESCRIPTOR;
+ int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+
+ /* tempv1 points to block(i,j) in bigV : LDA == Rnbrow */
+ //double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry
+ doublecomplex* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
+
+ // printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
+
+ /* Now scattering the block */
+
+ if ( ib < jb ) {
+ zscatter_u (
+ ib, jb,
+ nsupc, iukp, xsup,
+ //klst, Rnbrow, /*** klst, temp_nbrow, Sherry */
+ klst, gemm_m_pad, /*** klst, temp_nbrow, Sherry */
+ lptr, temp_nbrow, /* row dimension of the block */
+ lsub, usub, tempv1,
+ Ufstnz_br_ptr, Unzval_br_ptr,
+ grid
+ );
+ } else {
+ zscatter_l(
+ ib, ljb,
+ nsupc, iukp, xsup,
+ //klst, temp_nbrow, Sherry
+ klst, gemm_m_pad, /*** temp_nbrow, Sherry */
+ lptr, temp_nbrow, /* row dimension of the block */
+ usub, lsub, tempv1,
+ indirect_thread, indirect2_thread,
+ Lrowind_bc_ptr,Lnzval_bc_ptr,
+ grid
+ );
+ }
+
+ } /* end omp for (int ij =...) */
+
+#ifdef _OPENMP
+ } /* end omp parallel region */
+#endif
+
#if ( PRNTlevel>=1 )
- if ( thread_id==0 ) tt_start = SuperLU_timer_();
+ RemainScatterTimer += SuperLU_timer_() - tt_start;
#endif
- /* calling GEMM */
-#if defined (USE_VENDOR_BLAS)
- zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
- &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
- &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
-#else
- zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
- &Remain_L_buff[(knsupc-ldu)*Rnbrow+cum_nrow], &Rnbrow,
- &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#ifdef USE_VTUNE
+ __itt_pause(); // stop VTune
+ __SSC_MARK(0x222); // stop SDE tracing
#endif
-#if ( PRNTlevel>=1 )
- if (thread_id==0) {
- tt_end = SuperLU_timer_();
- RemainGEMMTimer += tt_end - tt_start;
- tt_start = tt_end;
- }
-#endif
-
- /* Now scattering the block */
- if ( ib<jb ) {
- zscatter_u(
- ib, jb,
- nsupc, iukp, xsup,
- klst, temp_nbrow,
- lptr, temp_nbrow,lsub,
- usub, tempv1,
- Ufstnz_br_ptr, Unzval_br_ptr,
- grid
- );
- } else {
- zscatter_l(
- ib, ljb,
- nsupc, iukp, xsup,
- klst, temp_nbrow,
- lptr, temp_nbrow,
- usub, lsub, tempv1,
- indirect_thread, indirect2_thread,
- Lrowind_bc_ptr,Lnzval_bc_ptr,
- grid
- );
- }
+ } /* end if Rnbrow>0 ... update remaining block */
-#if ( PRNTlevel>=1 )
- if (thread_id==0) RemainScatterTimer += SuperLU_timer_() - tt_start;
-#endif
- } /* end omp for (int ij =...) */
-#ifdef _OPENMP
- } /* end omp parallel region */
-#endif
} /* end if L(:,k) and U(k,:) are not empty */
diff --git a/SRC/zbinary_io.c b/SRC/zbinary_io.c
new file mode 100644
index 0000000..dc86959
--- /dev/null
+++ b/SRC/zbinary_io.c
@@ -0,0 +1,40 @@
+#include "superlu_zdefs.h"
+
+int
+zread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz,
+ doublecomplex **nzval, int_t **rowind, int_t **colptr)
+{
+ size_t isize = sizeof(int_t), dsize = sizeof(double);
+ int nnz_read;
+ fread(n, isize, 1, fp);
+ fread(nnz, isize, 1, fp);
+ printf("fread n %d\tnnz %d\n", *n, *nnz);
+ *m = *n;
+ *colptr = intMalloc_dist(*n+1);
+ *rowind = intMalloc_dist(*nnz);
+ *nzval = doublecomplexMalloc_dist(*nnz);
+ fread(*colptr, isize, (size_t) (*n + 1), fp);
+ fread(*rowind, isize, (size_t) *nnz, fp);
+ nnz_read = fread(*nzval, dsize, (size_t) (2 * (*nnz)), fp);
+ printf("# of doubles fread: %d\n", nnz_read);
+ fclose(fp);
+}
+
+int
+zwrite_binary(int_t n, int_t nnz,
+ doublecomplex *values, int_t *rowind, int_t *colptr)
+{
+ FILE *fp1;
+ int nnz_written;
+ size_t isize = sizeof(int_t), dsize = sizeof(double);
+ fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb");
+ fwrite(&n, isize, 1, fp1);
+ fwrite(&nnz, isize, 1, fp1);
+ fwrite(colptr, isize, n+1, fp1);
+ fwrite(rowind, isize, nnz, fp1);
+ nnz_written = fwrite(values, dsize, 2*nnz, fp1);
+ printf("n %d, # of doublecomplex: %d\n", n, nnz);
+ printf("dump binary file ... # of doubles fwrite: %d\n", nnz_written);
+ assert(nnz_written == 2*nnz);
+ fclose(fp1);
+}
diff --git a/SRC/zlook_ahead_update.c b/SRC/zlook_ahead_update.c
index 683f0af..0fe20bf 100644
--- a/SRC/zlook_ahead_update.c
+++ b/SRC/zlook_ahead_update.c
@@ -14,11 +14,17 @@ at the top-level directory.
* \brief Look-ahead update of the Schur complement.
*
* <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 1, 2014
*
+ * Modified: September 18, 2017
+ *
*/
+
+iukp = iukp0; /* point to the first block in index[] */
+rukp = rukp0; /* point to the start of nzval[] */
+
#ifdef ISORT
while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
#else
@@ -27,6 +33,8 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
{
doublecomplex zero = {0.0, 0.0};
+#if 0 // Sherry: no need to search
+ /* Caveat: There is a permutation perm_u involved for j */
/* Search along the row for the pointers {iukp, rukp} pointing to
* block U(k,j).
* j -- current block in look-ahead window, initialized to 0 on entry
@@ -38,6 +46,13 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
j, &iukp, &rukp, &jb, &ljb, &nsupc,
iukp0, rukp0, usub, perm_u, xsup, grid
);
+#else
+ jb = usub[iukp];
+ ljb = LBj (jb, grid); /* Local block number of U(k,j). */
+ nsupc = SuperSize(jb);
+ iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+
j++;
jj0++;
jj = iukp;
@@ -46,48 +61,47 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
ldu = klst - usub[jj++];
ncols = 1;
- full = 1; /* flag the U block is indeed 'full', containing segments
- of same length. No need padding 0. */
+
+ /* This loop computes ldu. */
for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
segsize = klst - usub[jj];
if (segsize) {
++ncols;
- if (segsize != ldu) full = 0; /* need padding 0 */
if (segsize > ldu) ldu = segsize;
}
}
#if ( DEBUGlevel>=3 )
++num_update;
#endif
- if (0) {
- tempu = &uval[rukp];
- }
- else { /* Copy block U(k,j) into tempU2d, padding zeros. */
+
#if ( DEBUGlevel>=3 )
- printf ("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
- iam, full, k, jb, ldu, ncols, nsupc);
- ++num_copy;
+ printf ("(%d) k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+ iam, k, jb, ldu, ncols, nsupc);
+ ++num_copy;
#endif
- tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
- for (jj = iukp; jj < iukp + nsupc; ++jj) {
- segsize = klst - usub[jj];
- if (segsize) {
- lead_zero = ldu - segsize;
- for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
- tempu += lead_zero;
- for (i = 0; i < segsize; ++i) {
- tempu[i] = uval[rukp + i];
- }
- rukp += segsize;
- tempu += segsize;
+
+ /* Now copy one block U(k,j) to bigU for GEMM, padding zeros up to ldu. */
+ tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+ for (jj = iukp; jj < iukp + nsupc; ++jj) {
+ segsize = klst - usub[jj];
+ if (segsize) {
+ lead_zero = ldu - segsize;
+ for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+ tempu += lead_zero;
+ for (i = 0; i < segsize; ++i) {
+ tempu[i] = uval[rukp + i];
}
+ rukp += segsize;
+ tempu += segsize;
}
- tempu = bigU;
- rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
- } /* if full ... */
+ }
+ tempu = bigU; /* set back to the beginning of the buffer */
+#if 0
+ rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
+#endif
nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
- if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows */
+ if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows. */
// double ttx =SuperLU_timer_();
int current_b = 0; /* Each thread starts searching from first block.
@@ -98,9 +112,9 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
#ifdef _OPENMP
/* Sherry -- examine all the shared variables ??
'firstprivate' ensures that the private variables are initialized
- to the values before entering the loop */
+ to the values before entering the loop. */
#pragma omp parallel for \
- firstprivate(lptr,luptr,ib,tempv,current_b) private(lb) \
+ firstprivate(lptr,luptr,ib,current_b) private(lb) \
default(shared) schedule(dynamic)
#endif
for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
@@ -133,7 +147,10 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+ /*if (thread_id == 0) tt_start = SuperLU_timer_();*/
+
/* calling gemm */
+ stat->ops[FACT] += 8.0 * (flops_t)temp_nbrow * ldu * ncols;
#if defined (USE_VENDOR_BLAS)
zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
&lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
@@ -144,7 +161,14 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
tempu, &ldu, &beta, tempv, &temp_nbrow );
#endif
- /* Now scattering the output*/
+#if 0
+ if (thread_id == 0) {
+ tt_end = SuperLU_timer_();
+ LookAheadGEMMTimer += tt_end - tt_start;
+ tt_start = tt_end;
+ }
+#endif
+ /* Now scattering the output. */
if (ib < jb) { /* A(i,j) is in U. */
zscatter_u (ib, jb,
nsupc, iukp, xsup,
@@ -158,14 +182,22 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
}
- ++current_b; /* move to next block */
+ ++current_b; /* Move to next block. */
lptr += temp_nbrow;
luptr += temp_nbrow;
+#if 0
+ if (thread_id == 0) {
+ tt_end = SuperLU_timer_();
+ LookAheadScatterTimer += tt_end - tt_start;
+ }
+#endif
} /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
- rukp += usub[iukp - 1]; /* Move to next U block, U(k,j+1) */
- iukp += nsupc;
+#if 0
+ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
+#endif
+ iukp += nsupc; /* Mov to block U(k,j+1) */
/* =========================================== *
* == factorize L(:,j) and send if possible == *
@@ -186,17 +218,14 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
/* Factor diagonal and subdiagonal blocks and test for exact
singularity. */
factored[kk] = 0;
- /* double ttt1 = SuperLU_timer_(); */
-#if ( VAMPIR>=1 )
- VT_begin (5);
-#endif
+
+ double tt1 = SuperLU_timer_();
PZGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
U_diag_blk_send_req, tag_ub, stat, info);
-#if ( VAMPIR>=1 )
- VT_end (5);
-#endif
+ pdgstrf2_timer += SuperLU_timer_() - tt1;
+
/* stat->time7 += SuperLU_timer_() - ttt1; */
/* Multicasts numeric values of L(:,kk) to process rows. */
@@ -220,18 +249,12 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
#if ( PROFlevel>=1 )
TIC (t1);
#endif
-#if ( VAMPIR>=1 )
- VT_begin (1);
-#endif
MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
scp->comm, &send_req[pj]);
MPI_Isend (lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
scp->comm, &send_req[pj + Pc]);
-#if ( VAMPIR>=1 )
- VT_end (1);
-#endif
#if ( PROFlevel>=1 )
TOC (t2, t1);
stat->utime[COMM] += t2;
diff --git a/SRC/zmemory_dist.c b/SRC/zmemory_dist.c
index 896c06d..bbaa3aa 100644
--- a/SRC/zmemory_dist.c
+++ b/SRC/zmemory_dist.c
@@ -128,10 +128,13 @@ int_t zQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
mem_usage->total += (float)(2 * k * iword);
#else
/*mem_usage->total += stat->current_buffer;*/
- printf(".. zQuery_Space: peak_buffer %.2f (MB)\n", stat->peak_buffer * 1.0e-6);
mem_usage->total += stat->peak_buffer;
-#endif
+#if ( PRNTlevel>=1 )
+ if (iam==0) printf(".. zQuerySpace: peak_buffer %.2f (MB)\n",
+ stat->peak_buffer * 1.0e-6);
+#endif
+#endif
return 0;
} /* zQuerySpace_dist */
diff --git a/SRC/zreadMM.c b/SRC/zreadMM.c
index 668a995..f9573a8 100644
--- a/SRC/zreadMM.c
+++ b/SRC/zreadMM.c
@@ -16,6 +16,7 @@ at the top-level directory.
*
*/
#include <ctype.h>
+#include <stdio.h>
#include "superlu_zdefs.h"
#undef EXPAND_SYM
@@ -42,6 +43,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
int_t zero_base = 0;
char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64];
int expand;
+ char *cs;
/* File format:
* %%MatrixMarket matrix coordinate real general/symmetric/...
@@ -53,7 +55,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
*/
/* 1/ read header */
- fgets(line,512,fp);
+ cs = fgets(line,512,fp);
for (p=line; *p!='\0'; *p=tolower(*p),p++);
if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
@@ -76,9 +78,9 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
exit(-1);
}
- if(strcmp(arith,"real")) {
- if(!strcmp(arith,"complex")) {
- printf("Complex matrix; use zreadMM instead!\n");
+ if(strcmp(arith,"complex")) {
+ if(!strcmp(arith,"real")) {
+ printf("Complex matrix; use dreadMM instead!\n");
exit(-1);
}
else if(!strcmp(arith, "pattern")) {
@@ -99,7 +101,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
/* 2/ Skip comments */
while(banner[0]=='%') {
- fgets(line,512,fp);
+ cs = fgets(line,512,fp);
sscanf(line,"%s",banner);
}
@@ -122,16 +124,17 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
*m = *n;
printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+ fflush(stdout);
zallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
a = *nzval;
asub = *rowind;
xa = *colptr;
- if ( !(val = (doublecomplex *) SUPERLU_MALLOC(new_nonz * sizeof(double))) )
+ if ( !(val = doublecomplexMalloc_dist(new_nonz)) )
ABORT("Malloc fails for val[]");
- if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+ if ( !(row = (int_t *) intMalloc_dist(new_nonz)) )
ABORT("Malloc fails for row[]");
- if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+ if ( !(col = (int_t *) intMalloc_dist(new_nonz)) )
ABORT("Malloc fails for col[]");
for (j = 0; j < *n; ++j) xa[j] = 0;
@@ -139,17 +142,19 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
/* 4/ Read triplets of values */
for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
#ifdef _LONGINT
- fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+ j = fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
#else
- fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+ j = fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
#endif
- if ( nnz == 0 ) /* first nonzero */
+ if ( nnz == 0 ) /* first nonzero */ {
if ( row[0] == 0 || col[0] == 0 ) {
zero_base = 1;
printf("triplet file: row/col indices are zero-based.\n");
} else
printf("triplet file: row/col indices are one-based.\n");
+ fflush(stdout);
+ }
if ( !zero_base ) {
/* Change to 0-based indexing. */
@@ -180,6 +185,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
*nonz = nz;
if(expand) {
printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
+ fflush(stdout);
}
@@ -233,8 +239,6 @@ static void zreadrhs(int m, doublecomplex *b)
exit(-1);
}
for (i = 0; i < m; ++i)
- fscanf(fp, "%lf%lf\n", &b[i].r, &b[i].i);
+ i = fscanf(fp, "%lf%lf\n", &b[i].r, &b[i].i);
fclose(fp);
}
-
-
diff --git a/SRC/zscatter.c b/SRC/zscatter.c
index 069d3b1..f14870e 100644
--- a/SRC/zscatter.c
+++ b/SRC/zscatter.c
@@ -13,10 +13,13 @@ at the top-level directory.
* \brief Scatter the computed blocks into LU destination.
*
* <pre>
- * -- Distributed SuperLU routine (version 4.0) --
+ * -- Distributed SuperLU routine (version 5.2) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 1, 2014
*
+ * Modified:
+ * September 18, 2017, enable SIMD vectorized scatter operation.
+ *
*/
#include <math.h>
#include "superlu_zdefs.h"
@@ -112,9 +115,9 @@ zscatter_l (
int_t iukp, /* point to destination supernode's index[] */
int_t* xsup,
int klst,
- int nbrow,
+ int nbrow, /* LDA of the block in tempv[] */
int_t lptr, /* Input, point to index[] location of block L(i,k) */
- int temp_nbrow, /* number of rows in block L(i,k) */
+ int temp_nbrow, /* number of rows of source block L(i,k) */
int_t* usub,
int_t* lsub,
doublecomplex *tempv,
@@ -126,7 +129,7 @@ zscatter_l (
int_t rel, i, segsize, jj;
doublecomplex *nzval;
int_t *index = Lrowind_bc_ptr[ljb];
- int_t ldv = index[1]; /* LDA of the dest lusup. */
+ int_t ldv = index[1]; /* LDA of the destination lusup. */
int_t lptrj = BC_HEADER;
int_t luptrj = 0;
int_t ijb = index[lptrj];
@@ -139,36 +142,43 @@ zscatter_l (
}
/*
- * Build indirect table. This is needed because the
- * indices are not sorted for the L blocks.
+ * Build indirect table. This is needed because the indices are not sorted
+ * in the L blocks.
*/
int_t fnz = FstBlockC (ib);
int_t dest_nbrow;
lptrj += LB_DESCRIPTOR;
dest_nbrow=index[lptrj - 1];
- for (i = 0; i < dest_nbrow; ++i)
- {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (i = 0; i < dest_nbrow; ++i) {
rel = index[lptrj + i] - fnz;
indirect_thread[rel] = i;
}
- /* can be precalculated */
- for (i = 0; i < temp_nbrow; ++i)
- {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ /* can be precalculated? */
+ for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
rel = lsub[lptr + i] - fnz;
indirect2[i] =indirect_thread[rel];
}
- nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Dest. block L(i,j) */
- for (jj = 0; jj < nsupc; ++jj)
- {
+ nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
+#ifdef __INTEL_COMPILER
+#pragma ivdep
+#endif
+ for (jj = 0; jj < nsupc; ++jj) {
segsize = klst - usub[iukp + jj];
- if (segsize)
- {
- for (i = 0; i < temp_nbrow; ++i)
- {
+ if (segsize) {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (i = 0; i < temp_nbrow; ++i) {
z_sub(&nzval[indirect2[i]], &nzval[indirect2[i]], &tempv[i]);
}
tempv += nbrow;
@@ -186,9 +196,9 @@ zscatter_u (int ib,
int_t iukp,
int_t * xsup,
int klst,
- int nbrow,
- int_t lptr,
- int temp_nbrow,
+ int nbrow, /* LDA of the block in tempv[] */
+ int_t lptr, /* point to index location of block L(i,k) */
+ int temp_nbrow, /* number of rows of source block L(i,k) */
int_t* lsub,
int_t* usub,
doublecomplex* tempv,
@@ -208,8 +218,8 @@ zscatter_u (int ib,
int_t lib = LBi (ib, grid);
int_t *index = Ufstnz_br_ptr[lib];
- /* Reinitilize the pointers to the begining of the
- * k-th column/row of L/U factors.
+ /* Reinitilize the pointers to the begining of the k-th column/row of
+ * L/U factors.
* usub[] - index array for panel U(k,:)
*/
int_t iuip_lib, ruip_lib;
@@ -217,38 +227,32 @@ zscatter_u (int ib,
ruip_lib = 0;
int_t ijb = index[iuip_lib];
- while (ijb < jb) /* Search for dest block. */
- {
+ while (ijb < jb) { /* Search for destination block. */
ruip_lib += index[iuip_lib + 1];
// printf("supersize[%ld] \t:%ld \n",ijb,SuperSize( ijb ) );
iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
ijb = index[iuip_lib];
}
- /* Skip descriptor. Now point to fstnz index of
- block U(i,j). */
+ /* Skip descriptor. Now point to fstnz index of block U(i,j). */
iuip_lib += UB_DESCRIPTOR;
// tempv = bigV + (cum_nrow + cum_ncol*nbrow);
- for (jj = 0; jj < nsupc; ++jj)
- {
+ for (jj = 0; jj < nsupc; ++jj) {
segsize = klst - usub[iukp + jj];
fnz = index[iuip_lib++];
- if (segsize) /* Nonzero segment in U(k.j). */
- {
+ if (segsize) { /* Nonzero segment in U(k,j). */
ucol = &Unzval_br_ptr[lib][ruip_lib];
// printf("========Entering loop=========\n");
- for (i = 0; i < temp_nbrow; ++i)
- {
-
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+ for (i = 0; i < temp_nbrow; ++i) {
rel = lsub[lptr + i] - fnz;
// printf("%d %d %d %d %d \n",lptr,i,fnz,temp_nbrow,nbrow );
// printf("hello ucol[%d] %d %d : \n",rel,lsub[lptr + i],fnz);
-
z_sub(&ucol[rel], &ucol[rel], &tempv[i]);
- // printf("hello\n");
-
#ifdef PI_DEBUG
double zz = 0.0;
if (!(*(long *) &zz == *(long *) &tempv[i]))
@@ -256,15 +260,16 @@ zscatter_u (int ib,
ucol[rel]);
//printing triplets (location??, old value, new value ) if none of them is zero
#endif
- } /* for i=0..temp_nbropw */
- tempv += nbrow;
+ } /* for i = 0:temp_nbropw */
+ tempv += nbrow; /* Jump LDA to next column */
#ifdef PI_DEBUG
// printf("\n");
#endif
- } /*ig segsize */
+ } /* if segsize */
+
ruip_lib += ilst - fnz;
- } /*for jj=0:nsupc */
+ } /* for jj = 0:nsupc */
#ifdef PI_DEBUG
// printf("\n");
#endif
diff --git a/TEST/#pztest.c# b/TEST/#pztest.c#
new file mode 100644
index 0000000..17fda5c
--- /dev/null
+++ b/TEST/#pztest.c#
@@ -0,0 +1,517 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Driver program for testing PZGSSVX.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ * </pre>
+ */
+/*
+ * File name: pztest.c
+ * Purpose: MAIN test program
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include "superlu_zdefs.h"
+
+#define NTESTS 1 /*5*/ /* Number of test types */
+#define NTYPES 11 /* Number of matrix types */
+#define NTRAN 2
+#define THRESH 20.0
+#define FMT1 "%10s:n=%d, test(%d)=%12.5g\n"
+#define FMT2 "%10s:fact=%4d, DiagScale=%d, n=%d, imat=%d, test(%d)=%12.5g, berr=%12.5g\n"
+#define FMT3 "%10s:info=%d, izero=%d, n=%d, nrhs=%d, imat=%d, nfail=%d\n"
+
+
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+ char *matrix_type, int *n, int *relax, int *maxsuper,
+ int *fill_ratio, int *min_gemm_gpu_offload,
+ int *nrhs, FILE **fp);
+
+extern int
+pzcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+ doublecomplex *x, int ldx, doublecomplex *b, int ldb,
+ gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid);
+
+/*! \brief Copy matrix A into matrix B, in distributed compressed row format. */
+void
+zCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+ NRformat_loc *Astore;
+ NRformat_loc *Bstore;
+ int_t i, nnz_loc, m_loc;
+
+ B->Stype = A->Stype;
+ B->Dtype = A->Dtype;
+ B->Mtype = A->Mtype;
+ B->nrow = A->nrow;;
+ B->ncol = A->ncol;
+ Astore = (NRformat_loc *) A->Store;
+ Bstore = (NRformat_loc *) B->Store;
+ Bstore->nnz_loc = Astore->nnz_loc;
+ nnz_loc = Astore->nnz_loc;
+ Bstore->m_loc = Astore->m_loc;
+ m_loc = Astore->m_loc;
+ Bstore->fst_row = Astore->fst_row;
+ memcpy(Bstore->nzval, Astore->nzval, nnz_loc * sizeof(doublecomplex));
+ memcpy(Bstore->colind, Astore->colind, nnz_loc * sizeof(int_t));
+ memcpy(Bstore->rowptr, Astore->rowptr, (m_loc+1) * sizeof(int_t));
+}
+
+/*! \brief Print a summary of the testing results. */
+void
+PrintSumm(char *type, int nfail, int nrun, int nerrs)
+{
+ if ( nfail > 0 )
+ printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
+ type, nfail, nrun);
+ else
+ printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
+
+ if ( nerrs > 0 )
+ printf("%6d error messages recorded\n", nerrs);
+}
+
+int main(int argc, char *argv[])
+{
+/*
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZTEST is the main test program for the DOUBLE COMPLEX linear
+ * equation driver routines PZGSSVX.
+ *
+ * The program is invoked by a shell script file -- dtest.csh.
+ * The output from the tests are written into a file -- dtest.out.
+ */
+ superlu_dist_options_t options;
+ SuperLUStat_t stat;
+ SuperMatrix A, Asave;
+ NRformat_loc *Astore;
+ ScalePermstruct_t ScalePermstruct;
+ LUstruct_t LUstruct;
+ SOLVEstruct_t SOLVEstruct;
+ gridinfo_t grid;
+ doublecomplex *nzval_save;
+ int_t *colind_save, *rowptr_save;
+ double *berr, *R, *C;
+ doublecomplex *b, *bsave, *xtrue, *solx;
+ int i, j, m, n, izero = 0;
+ int nprow, npcol;
+ int iam, info, ldb, ldx, nrhs;
+ char **cpp, c;
+ FILE *fp, *fopen();
+ char matrix_type[8], equed[1];
+ int relax, maxsuper=sp_ienv_dist(3), fill_ratio=sp_ienv_dist(6),
+ min_gemm_gpu_offload=0;
+ int equil, ifact, nfact, iequil, iequed, prefact, notfactored;
+ int nt, nrun=0, nfail=0, nerrs=0, imat, fimat=0, nimat=1;
+ fact_t fact;
+ double rowcnd, colcnd, amax;
+ double result[NTESTS];
+
+ /* Fixed set of parameters */
+ int iseed[] = {1988, 1989, 1990, 1991};
+ char equeds[] = {'N', 'R', 'C', 'B'};
+ DiagScale_t equils[] = {NOEQUIL, ROW, COL, BOTH};
+ fact_t facts[] = {FACTORED, DOFACT, SamePattern, SamePattern_SameRowPerm};
+ trans_t transs[] = {NOTRANS, TRANS, CONJ};
+
+ nprow = 1; /* Default process rows. */
+ npcol = 1; /* Default process columns. */
+ nrhs = 1; /* Number of right-hand side. */
+ for (i = 0; i < NTESTS; ++i) result[i] = 0.0;
+
+ /* Parse command line argv[]. */
+ parse_command_line(argc, argv, &nprow, &npcol, matrix_type, &n,
+ &relax, &maxsuper,
+ &fill_ratio, &min_gemm_gpu_offload, &nrhs, &fp);
+
+ /* ------------------------------------------------------------
+ INITIALIZE MPI ENVIRONMENT.
+ ------------------------------------------------------------*/
+ MPI_Init( &argc, &argv );
+
+ /* ------------------------------------------------------------
+ INITIALIZE THE SUPERLU PROCESS GRID.
+ ------------------------------------------------------------*/
+ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+ /* Bail out if I do not belong in the grid. */
+ iam = grid.iam;
+ if ( iam >= nprow * npcol ) goto out;
+ if ( 0 ) {
+ printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
+ }
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+ /* Set the default input options. */
+ set_default_options_dist(&options);
+ options.PrintStat = NO;
+
+ if (!iam) {
+ print_sp_ienv_dist(&options);
+ print_options_dist(&options);
+ fflush(stdout);
+ }
+
+ if ( !(berr = doubleMalloc_dist(nrhs)) )
+ ABORT("Malloc fails for berr[].");
+
+ /* Loop through all the input options. */
+ for (imat = fimat; imat < nimat; ++imat) { /* All matrix types */
+ //if (!iam) printf("imat loop ... %d\n", imat);
+ /* ------------------------------------------------------------
+ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+ ------------------------------------------------------------*/
+ zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+ m = A.nrow;
+ n = A.ncol;
+
+ if ( !(bsave = doublecomplexMalloc_dist(ldb * nrhs)) )
+ ABORT("Malloc fails for bsave[]");
+ for (j = 0; j < nrhs; ++j)
+ for (i = 0; i < ldb; ++i) bsave[i+j*ldb] = b[i+j*ldb];
+
+ /* Save a copy of matrix A in Asave. */
+ Astore = (NRformat_loc *) A.Store;
+ int_t nnz_loc = Astore->nnz_loc;
+ int_t m_loc = Astore->m_loc;
+ nzval_save = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc);
+ colind_save = (int_t *) intMalloc_dist(nnz_loc);
+ rowptr_save = (int_t *) intMalloc_dist(m_loc + 1);
+ zCreate_CompRowLoc_Matrix_dist(&Asave, m, n, nnz_loc, m_loc, Astore->fst_row,
+ nzval_save, colind_save, rowptr_save,
+ SLU_NR_loc, SLU_D, SLU_GE);
+ zCopy_CompRowLoc_Matrix_dist(&A, &Asave);
+
+ for (iequed = 0; iequed < 4; ++iequed) {
+ int what_equil = equils[iequed];
+ if (iequed == 0) nfact = 4;
+ else { /* Only test factored, pre-equilibrated matrix */
+ nfact = 1;
+ options.RowPerm = NOROWPERM; /* Turn off MC64 */
+ }
+ //if (!iam) printf("iequed loop ... %d\n", iequed);
+
+ for (ifact = 0; ifact < nfact; ++ifact) {
+ fact = facts[ifact];
+ options.Fact = fact;
+ //if (!iam) printf("ifact loop ... %d\n", ifact);
+
+ for (equil = 0; equil < 2; ++equil) {
+
+ //if (!iam) printf("equil loop ... %d\n", equil);
+
+ options.Equil = equil;
+ /* Need a first factor */
+ prefact = ( options.Fact == FACTORED ||
+ options.Fact == SamePattern ||
+ options.Fact == SamePattern_SameRowPerm );
+
+ /* Restore the matrix A. */
+ zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+ /* Initialize ScalePermstruct and LUstruct. */
+ ScalePermstructInit(m, n, &ScalePermstruct);
+ LUstructInit(n, &LUstruct);
+
+ //if ( options.Fact == FACTORED ||
+ // options.Fact == SamePattern_SameRowPerm ) {
+
+ if ( prefact ) {
+
+ R = (double *) SUPERLU_MALLOC(m*sizeof(double));
+ C = (double *) SUPERLU_MALLOC(n*sizeof(double));
+
+ /* Later call to PZGSSVX only needs to solve. */
+ if ( equil || iequed ) {
+ /* Compute row and column scale factors to
+ equilibrate matrix A. */
+ pzgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &info,
+ &grid);
+
+ /* Force equilibration. */
+ if ( info==0 && n > 0 ) {
+ if ( what_equil == ROW ) {
+ rowcnd = 0.;
+ colcnd = 1.;
+ ScalePermstruct.DiagScale = ROW;
+ ScalePermstruct.R = R;
+ } else if ( what_equil == COL ) {
+ rowcnd = 1.;
+ colcnd = 0.;
+ ScalePermstruct.DiagScale = COL;
+ ScalePermstruct.C = C;
+ } else if ( what_equil == BOTH ) {
+ rowcnd = 0.;
+ colcnd = 0.;
+ ScalePermstruct.DiagScale = BOTH;
+ ScalePermstruct.R = R;
+ ScalePermstruct.C = C;
+ }
+ }
+
+ /* Equilibrate the matrix. */
+ pzlaqgs(&A, R, C, rowcnd, colcnd, amax, equed);
+ // printf("after pdlaqgs: *equed %c\n", *equed);
+
+ /* Not equilibrate anymore when calling PDGSSVX,.
+ * so, no malloc/free {R,C} inside PDGSSVX. */
+ options.Equil = NO;
+ } /* end if (equil || iequed) */
+ } /* end if prefact */
+
+ if ( prefact ) { /* Need a first factor */
+
+ /* Save Fact option. */
+ fact = options.Fact;
+ options.Fact = DOFACT;
+
+ /* Initialize the statistics variables. */
+ PStatInit(&stat);
+
+ int nrhs1 = 0; /* Only performs factorization */
+ pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs1,
+ &grid, &LUstruct, &SOLVEstruct,
+ berr, &stat, &info);
+
+ if ( info ) {
+ printf("** First factor: nrun %d: fact %d, info %d, "
+ "equil %d, what_equil %d, DiagScale %d \n",
+ nrun, fact, info, equil, what_equil,
+ ScalePermstruct.DiagScale);
+ }
+
+ PStatFree(&stat);
+
+ /* Restore Fact option. */
+ options.Fact = fact;
+ if ( fact == SamePattern ) {
+ // {L,U} not re-used in subsequent call to PDGSSVX.
+ Destroy_LU(n, &grid, &LUstruct);
+ }
+
+ } /* end if .. first time factor */
+
+ /*----------------
+ * Test pzgssvx
+ *----------------*/
+
+ if ( options.Fact != FACTORED ) {
+ /* Restore the matrix A. */
+ zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+ }
+
+ /* Set the right-hand side. */
+ zCopy_Dense_Matrix_dist(m_loc, nrhs, bsave, ldb, b, ldb);
+
+ PStatInit(&stat);
+
+ /*if ( !iam ) printf("\ttest pdgssvx: nrun %d, iequed %d, equil %d, fact %d\n",
+ nrun, iequed, equil, options.Fact);*/
+ /* Testing PDGSSVX: solve and compute the error bounds. */
+ pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ &grid, &LUstruct, &SOLVEstruct,
+ berr, &stat, &info);
+
+ PStatFree(&stat);
+#if 0
+ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+ nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+ /* if ( info && info != izero ) {*/
+ if ( info ) {
+ printf(FMT3, "pzgssvx",info,izero,n,nrhs,imat,nfail);
+ } else {
+ /* Restore the matrix A. */
+ zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+ /* Compute residual of the computed solution.*/
+ solx = b;
+ pzcompute_resid(m, n, nrhs, &A, solx, ldx, bsave, ldb,
+ &grid, &SOLVEstruct, &result[0]);
+
+#if 0 /* how to get RCOND? */
+ /* Check solution accuracy from generated exact solution. */
+ dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
+ &result[2]);
+ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+ nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+
+ /* Print information about the tests that did
+ not pass the threshold. */
+ int k1 = 0;
+ for (i = k1; i < NTESTS; ++i) {
+ if ( result[i] >= THRESH ) {
+ printf(FMT2, "pzgssvx", options.Fact,
+ ScalePermstruct.DiagScale,
+ n, imat, i, result[i], berr[0]);
+ ++nfail;
+ }
+ }
+ nrun += NTESTS;
+ } /* end else .. info == 0 */
+
+ /* -----------------------------------------------------
+ Deallocate storage associated with {L,U}.
+ ----------------------------------------------------- */
+ if ( prefact ) {
+ SUPERLU_FREE(R);
+ SUPERLU_FREE(C);
+ ScalePermstruct.DiagScale = NOEQUIL; /* Avoid free R/C again. */
+ }
+ ScalePermstructFree(&ScalePermstruct);
+ Destroy_LU(n, &grid, &LUstruct);
+ LUstructFree(&LUstruct);
+ if ( options.SolveInitialized ) {
+ zSolveFinalize(&options, &SOLVEstruct);
+ }
+
+ } /* end for equil ... */
+
+ } /* end for ifact ... */
+
+ } /* end for iequed ... */
+
+ /* ------------------------------------------------------------
+ DEALLOCATE STORAGE.
+ ------------------------------------------------------------*/
+ Destroy_CompRowLoc_Matrix_dist(&A);
+ Destroy_CompRowLoc_Matrix_dist(&Asave);
+ // ScalePermstructFree(&ScalePermstruct);
+ SUPERLU_FREE(b);
+ SUPERLU_FREE(bsave);
+ SUPERLU_FREE(xtrue);
+
+ } /* end for imat ... */
+
+ /* Print a summary of the testing results. */
+ if ( iam==0 ) PrintSumm("DGS", nfail, nrun, nerrs);
+
+ SUPERLU_FREE(berr);
+
+ /* ------------------------------------------------------------
+ RELEASE THE SUPERLU PROCESS GRID.
+ ------------------------------------------------------------*/
+out:
+ superlu_gridexit(&grid);
+
+ /* ------------------------------------------------------------
+ TERMINATES THE MPI EXECUTION ENVIRONMENT.
+ ------------------------------------------------------------*/
+ MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+/*
+ * Parse command line options to get various input parameters.
+ */
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+ char *matrix_type, int *n, int *relax, int *maxsuper,
+ int *fill_ratio, int *min_gemm_gpu_offload,
+ int *nrhs, FILE **fp)
+{
+ int c;
+ extern char *optarg;
+ char str[20];
+
+ while ( (c = getopt(argc, argv, "hr:c:t:n:x:m:b:g:s:f:")) != EOF ) {
+ switch (c) {
+ case 'h':
+ printf("Options:\n");
+ printf("\t-r <int> - process rows\n");
+ printf("\t-c <int> - process columns\n");
+ printf("\t-n <int> - matrix dimension\n");
+ printf("\t-x <int> - granularity of relaxed supernodes\n");
+ printf("\t-m <int> - maximum size of supernode\n");
+ printf("\t-b <int> - estimated fill ratio to allocate storage\n");
+ printf("\t-g <int> - minimum size of GEMM to offload to GPU\n");
+ printf("\t-s <int> - number of right-hand sides\n");
+ printf("\t-f <char[]> - file name storing a sparse matrix\n");
+ exit(1);
+ break;
+ case 'r': *nprow = atoi(optarg);
+ break;
+ case 'c': *npcol = atoi(optarg);
+ break;
+ case 'n': *n = atoi(optarg);
+ break;
+ case 'x': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("NREL", str, 1);
+ //printf("Reset relax env. variable to %d\n", c);
+ break;
+ case 'm': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("NSUP", str, 1);
+ //printf("Reset maxsuper env. variable to %d\n", c);
+ break;
+ case 'b': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("FILL", str, 1);
+ //printf("Reset fill_ratio env. variable to %d\n", c);
+ break;
+ case 'g': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("N_GEMM", str, 1);
+ //printf("Reset min_gemm_gpu_offload env. variable to %d\n", c);
+ break;
+ case 's': *nrhs = atoi(optarg);
+ break;
+ case 'f':
+ if ( !(*fp = fopen(optarg, "r")) ) {
+ ABORT("File does not exist");
+ }
+ //printf(".. test sparse matrix in file: %s\n", optarg);
+ break;
+ }
+ }
+}
+
+int cpp_defs()
+{
+ printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+ printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+ printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+ printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+ printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+ printf("....\n");
+ return 0;
+}
diff --git a/TEST/CMakeLists.txt b/TEST/CMakeLists.txt
new file mode 100644
index 0000000..460824a
--- /dev/null
+++ b/TEST/CMakeLists.txt
@@ -0,0 +1,79 @@
+include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC)
+
+# Libs linked to all of the tests
+set(all_link_libs superlu_dist ${BLAS_LIB} m)
+
+set(MATRICES ../EXAMPLE/g20.rua) # sample sparse matrix from a file
+set(NPROWS 1 2) # process rows
+set(NPCOLS 1 3) # process columns
+set(NVAL 9 19) # generated matrix dimensions
+set(NRHS 1 3) # number of RHS
+# set(FILLRATIO 2 10) # estimated fill ratio
+set(FILLRATIO 2) # estimated fill ratio
+# following are blocking parameters, see sp_ienv.c
+set(RELAX 8) # relaxed supernode size: 4 8
+set(SUPERSIZE 20) # maximum supernode size: 10 20
+set(MINGEMM 10000) # minimum GEMM size for GPU offload
+
+function(cat IN_FILE OUT_FILE)
+ file(READ ${IN_FILE} CONTENTS)
+ file(APPEND ${OUT_FILE} "${CONTENTS}")
+endfunction()
+
+# Function to perform test
+# call API: add_superlu_dist_tests(pddrive big.rua)
+function(add_superlu_dist_tests target input)
+ set(TEST_INPUT "${SuperLU_DIST_SOURCE_DIR}/EXAMPLE/${input}")
+ set(TEST_OUTPUT "${SuperLU_DIST_BINARY_DIR}/TEST/${target}.out")
+
+ # Prepare a temporary file to "cat" to:
+ # file(WRITE ${TEST_OUTPUT} "")
+
+## get_target_property(TEST_LOC ${target} LOCATION)
+ set(TEST_LOC ${CMAKE_CURRENT_BINARY_DIR})
+
+ foreach (r ${NPROWS})
+ foreach (c ${NPCOLS})
+ MATH( EXPR np "${r}*${c}" )
+ foreach (s ${NRHS})
+ foreach (b ${FILLRATIO})
+ foreach (x ${RELAX})
+ foreach (m ${SUPERSIZE})
+ set(testName "${target}_${r}x${c}_${s}_${b}_${x}_${m}")
+ set(SINGLE_OUTPUT ${SuperLU_DIST_BINARY_DIR}/TEST/${testName}.out)
+ add_test( ${testName}_SP
+ ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${np}
+ ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS}
+ -r ${r} -c ${c} -s ${s} -b ${b} -x ${x} -m ${m} -f ${TEST_INPUT}
+ )
+# add_test( ${testName}_SP "${CMAKE_COMMAND}"
+# -DTEST=${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${np}
+# ${MPIEXEC_PREFLAGS} ${target} ${MPIEXEC_POSTFLAGS} -r ${r} -c ${c} -s ${s} -b ${b} -x ${x} -m ${m} -f ${TEST_INPUT}
+# -DOUTPUT=${SINGLE_OUTPUT}
+# -DALL_OUTPUT=${TEST_OUTPUT}
+# -DHEADING=Sparse\ matrix\ ${TEST_INPUT}\ --\ r=${r},\ c=${c},\ s=${s},\ x=${x},\ m=${m}
+# -P "${SuperLU_DIST_SOURCE_DIR}/TEST/runtest.cmake"
+# )
+ endforeach()
+ endforeach()
+ endforeach()
+ endforeach()
+ endforeach()
+ endforeach()
+
+# MPI variables:
+# ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} PROCS
+# ${MPIEXEC_PREFLAGS} EXECUTABLE ${MPIEXEC_POSTFLAGS} ARGS)
+
+endfunction(add_superlu_dist_tests)
+
+if(enable_double)
+ set(DTEST pdtest.c dcreate_matrix.c pdcompute_resid.c)
+ add_executable(pdtest ${DTEST})
+ target_link_libraries(pdtest ${all_link_libs})
+ add_superlu_dist_tests(pdtest g20.rua)
+endif()
+
+#if(enable_complex16)
+# set(ZTEST pztest.c zcreate_matrix.c pzcompute_resid.c)
+#endif()
diff --git a/TEST/Makefile b/TEST/Makefile
new file mode 100644
index 0000000..c219d70
--- /dev/null
+++ b/TEST/Makefile
@@ -0,0 +1,56 @@
+#######################################################################
+# This makefile creates the test programs for the linear equation
+# routines in SuperLU_DIST. The test files are grouped as follows:
+#
+# DLINTST -- Double precision real test routines
+# ZLINTST -- Double precision complex test routines
+#
+# Test programs can be generated for all or some of the two different
+# precisions. Enter make followed by one or more of the data types
+# desired. Some examples:
+# make complex16
+# Alternatively, the command
+# make
+# without any arguments creates all two test programs.
+# The executable files are called
+# pdtest
+# pztest
+#
+# To remove the object files after the executable files have been
+# created, enter
+# make clean
+# On some systems, you can force the source files to be recompiled by
+# entering (for example)
+# make double FRC=FRC
+#
+# Creation date: March 16, 2017
+# Modified:
+#######################################################################
+
+include ../make.inc
+HEADER = ../SRC
+
+DLINTST = pdtest.o dcreate_matrix.o pdcompute_resid.o
+
+ZLINTST = pztest.o zcreate_matrix.o pzcompute_resid.o
+
+all: double complex16
+
+testmat:
+ (cd MATGEN; $(MAKE))
+
+./pdtest: $(DLINTST) $(DSUPERLULIB) $(TMGLIB)
+ $(LOADER) $(LOADOPTS) $(DLINTST) $(TMGLIB) $(LIBS) -lm -o $@
+
+./pztest: $(ZLINTST) $(DSUPERLULIB) $(TMGLIB)
+ $(LOADER) $(LOADOPTS) $(ZLINTST) $(TMGLIB) $(LIBS) -lm -o $@
+
+double: ./pdtest
+complex16: ./pztest
+
+.c.o:
+ $(CC) $(CFLAGS) $(CDEFS) -I$(HEADER) -c $< $(VERBOSE)
+
+clean:
+ rm -f *.o *test *.out
+
diff --git a/TEST/README b/TEST/README
new file mode 100644
index 0000000..d40eaab
--- /dev/null
+++ b/TEST/README
@@ -0,0 +1,12 @@
+ SuperLU_DIST TEST
+ =================
+
+This directory contains testing programs to test various functions
+provded in SuperLU_DIST.
+
+1. To run the tests (pdtest for real, pztest for complex), you may type:
+ $ mpiexec -n <np> pdtest -r <process row> -c <process columns> -f ../EXAMPLE/g20.rua
+ $ mpiexec -n <np> pztest -r <process row> -c <process columns> -f ../EXAMPLE/cg20.cua
+
+2. bash scripts to run tests:
+ - pdtest.sh / pztest.sh : invoke many runs varying several input parameters.
diff --git a/EXAMPLE/dcreate_matrix.c b/TEST/dcreate_matrix.c
similarity index 97%
copy from EXAMPLE/dcreate_matrix.c
copy to TEST/dcreate_matrix.c
index 77292d7..a622463 100644
--- a/EXAMPLE/dcreate_matrix.c
+++ b/TEST/dcreate_matrix.c
@@ -89,9 +89,14 @@ int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs,
#endif
if ( !iam ) {
+ double t = SuperLU_timer_();
+
/* Read the matrix stored on disk in Harwell-Boeing format. */
dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+ printf("Time to read and distribute matrix %.2f\n",
+ SuperLU_timer_() - t); fflush(stdout);
+
/* Broadcast matrix A to the other PEs. */
MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm );
MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm );
diff --git a/TEST/pdcompute_resid.c b/TEST/pdcompute_resid.c
new file mode 100644
index 0000000..120a087
--- /dev/null
+++ b/TEST/pdcompute_resid.c
@@ -0,0 +1,155 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Test for small residual.
+ *
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ *
+ */
+#include "superlu_ddefs.h"
+
+int pdcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+ double *x, int ldx, double *b, int ldb,
+ gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid)
+{
+/*
+ Purpose
+ =======
+
+ PDCOMPUTE_RESID computes the residual for a solution of a system of linear
+ equations A*x = b or A'*x = b:
+ RESID = norm(B - A*X) / ( norm(A) * norm(X) * EPS ),
+ where EPS is the machine epsilon.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of columns of B, the matrix of right hand sides.
+ NRHS >= 0.
+
+ A (input/output) SuperMatrix*
+ The original M x N sparse matrix A.
+ On exit, the column indices are modified due to SPMV setup.
+
+ X (input) DOUBLE PRECISION array, dimension (LDX,NRHS)
+ The computed solution vectors for the system of linear
+ equations.
+
+ LDX (input) INTEGER
+ The leading dimension of the array X. If TRANS = NOTRANS,
+ LDX >= max(1,N); if TRANS = TRANS or CONJ, LDX >= max(1,M).
+
+ B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
+ On entry, the right hand side vectors for the system of
+ linear equations.
+ On exit, B is overwritten with the difference B - A*X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. IF TRANS = NOTRANS,
+ LDB >= max(1,M); if TRANS = TRANS or CONJ, LDB >= max(1,N).
+
+ SOLVEstruct (input) SOLVEstruct_t*
+
+ GRID (input) gridinfo_t*
+
+ RESID (output) double PRECISION
+ The maximum over the number of right-hand sides of
+ norm(B - A*X) / ( norm(A) * norm(X) * EPS ).
+
+ =====================================================================
+*/
+
+ /* Table of constant values */
+ int inc = 1;
+
+ /* Local variables */
+ int i, j;
+ double anorm, rnorm, rnorm_g;
+ double xnorm, xnorm_g;
+ double eps;
+ char transc[1];
+ double *ax, *R;
+ pdgsmv_comm_t gsmv_comm;
+ int m_loc = ((NRformat_loc*) A->Store)->m_loc;
+
+ /* Function prototypes */
+ extern double dasum_(int *, double *, int *);
+
+ /* Function Body */
+ if ( m <= 0 || n <= 0 || nrhs == 0) {
+ *resid = 0.;
+ return 0;
+ }
+
+ /* Exit with RESID = 1/EPS if ANORM = 0. */
+ eps = dmach_dist("Epsilon");
+ anorm = pdlangs("1", A, grid);
+ if (anorm <= 0.) {
+ *resid = 1. / eps;
+ return 0;
+ }
+
+ if ( !(ax = doubleMalloc_dist(m_loc)) ) ABORT("Malloc fails for work[]");
+ R = ax;
+
+ /* A is modified with colind[] permuted to [internal, external]. */
+ pdgsmv_init(A, SOLVEstruct->row_to_proc, grid, &gsmv_comm);
+
+ /* Compute the maximum over the number of right-hand sides of
+ norm(B - A*X) / ( norm(A) * norm(X) * EPS ) . */
+ *resid = 0.;
+ for (j = 0; j < nrhs; ++j) {
+ double *B_col = &b[j*ldb];
+ double *X_col = &x[j*ldx];
+
+ /* Compute residual R = B - op(A) * X,
+ where op(A) = A, A**T, or A**H, depending on TRANS. */
+ /* Matrix-vector multiply. */
+ pdgsmv(0, A, grid, &gsmv_comm, X_col, ax);
+
+ /* Compute residual, stored in R[]. */
+ for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i];
+
+ rnorm = dasum_(&m_loc, R, &inc);
+ xnorm = dasum_(&m_loc, X_col, &inc);
+
+ /* */
+ MPI_Allreduce( &rnorm, &rnorm_g, 1, MPI_DOUBLE, MPI_SUM, grid->comm );
+ MPI_Allreduce( &xnorm, &xnorm_g, 1, MPI_DOUBLE, MPI_SUM, grid->comm );
+
+ if (xnorm_g <= 0.) {
+ *resid = 1. / eps;
+ } else {
+ /* Computing MAX */
+ double d1, d2;
+ d1 = *resid;
+ d2 = rnorm_g / anorm / xnorm_g / eps;
+ *resid = SUPERLU_MAX(d1, d2);
+ }
+ } /* end for j ... */
+
+ pdgsmv_finalize(&gsmv_comm);
+ SUPERLU_FREE(ax);
+
+ return 0;
+
+} /* pdcompute_redid */
diff --git a/TEST/pdtest.c b/TEST/pdtest.c
new file mode 100644
index 0000000..7666e5a
--- /dev/null
+++ b/TEST/pdtest.c
@@ -0,0 +1,519 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for testing PDGSSVX.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ * </pre>
+ */
+/*
+ * File name: pdtest.c
+ * Purpose: MAIN test program
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include "superlu_ddefs.h"
+
+#define NTESTS 1 /*5*/ /* Number of test types */
+#define NTYPES 11 /* Number of matrix types */
+#define NTRAN 2
+#define THRESH 20.0
+#define FMT1 "%10s:n=%d, test(%d)=%12.5g\n"
+#define FMT2 "%10s:fact=%4d, DiagScale=%d, n=%d, imat=%d, test(%d)=%12.5g, berr=%12.5g\n"
+#define FMT3 "%10s:info=%d, izero=%d, n=%d, nrhs=%d, imat=%d, nfail=%d\n"
+
+
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+ char *matrix_type, int *n, int *relax, int *maxsuper,
+ int *fill_ratio, int *min_gemm_gpu_offload,
+ int *nrhs, FILE **fp);
+
+extern int
+pdcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+ double *x, int ldx, double *b, int ldb,
+ gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid);
+
+/*! \brief Copy matrix A into matrix B, in distributed compressed row format. */
+void
+dCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+ NRformat_loc *Astore;
+ NRformat_loc *Bstore;
+ int_t i, nnz_loc, m_loc;
+
+ B->Stype = A->Stype;
+ B->Dtype = A->Dtype;
+ B->Mtype = A->Mtype;
+    B->nrow = A->nrow;
+ B->ncol = A->ncol;
+ Astore = (NRformat_loc *) A->Store;
+ Bstore = (NRformat_loc *) B->Store;
+ Bstore->nnz_loc = Astore->nnz_loc;
+ nnz_loc = Astore->nnz_loc;
+ Bstore->m_loc = Astore->m_loc;
+ m_loc = Astore->m_loc;
+ Bstore->fst_row = Astore->fst_row;
+ memcpy(Bstore->nzval, Astore->nzval, nnz_loc * sizeof(double));
+ memcpy(Bstore->colind, Astore->colind, nnz_loc * sizeof(int_t));
+ memcpy(Bstore->rowptr, Astore->rowptr, (m_loc+1) * sizeof(int_t));
+}
+
+/*! \brief Print a summary of the testing results. */
+void
+PrintSumm(char *type, int nfail, int nrun, int nerrs)
+{
+ if ( nfail > 0 )
+ printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
+ type, nfail, nrun);
+ else
+ printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
+
+ if ( nerrs > 0 )
+ printf("%6d error messages recorded\n", nerrs);
+}
+
+int main(int argc, char *argv[])
+{
+/*
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDTEST is the main test program for the DOUBLE linear
+ * equation driver routines PDGSSVX.
+ *
+ * The program is invoked by a shell script file -- dtest.csh.
+ * The output from the tests are written into a file -- dtest.out.
+ */
+ superlu_dist_options_t options;
+ SuperLUStat_t stat;
+ SuperMatrix A, Asave;
+ NRformat_loc *Astore;
+ ScalePermstruct_t ScalePermstruct;
+ LUstruct_t LUstruct;
+ SOLVEstruct_t SOLVEstruct;
+ gridinfo_t grid;
+ double *nzval_save;
+ int_t *colind_save, *rowptr_save;
+ double *berr, *R, *C;
+ double *b, *bsave, *xtrue, *solx;
+ int i, j, m, n, izero = 0;
+ int nprow, npcol;
+ int iam, info, ldb, ldx, nrhs;
+ int_t iinfo;
+ char **cpp, c;
+ FILE *fp, *fopen();
+ char matrix_type[8], equed[1];
+ int relax, maxsuper=sp_ienv_dist(3), fill_ratio=sp_ienv_dist(6),
+ min_gemm_gpu_offload=0;
+ int equil, ifact, nfact, iequil, iequed, prefact, notfactored;
+ int nt, nrun=0, nfail=0, nerrs=0, imat, fimat=0, nimat=1;
+ fact_t fact;
+ double rowcnd, colcnd, amax;
+ double result[NTESTS];
+
+ /* Fixed set of parameters */
+ int iseed[] = {1988, 1989, 1990, 1991};
+ char equeds[] = {'N', 'R', 'C', 'B'};
+ DiagScale_t equils[] = {NOEQUIL, ROW, COL, BOTH};
+ fact_t facts[] = {FACTORED, DOFACT, SamePattern, SamePattern_SameRowPerm};
+ trans_t transs[] = {NOTRANS, TRANS, CONJ};
+
+ nprow = 1; /* Default process rows. */
+ npcol = 1; /* Default process columns. */
+ nrhs = 1; /* Number of right-hand side. */
+ for (i = 0; i < NTESTS; ++i) result[i] = 0.0;
+
+ /* Parse command line argv[]. */
+ parse_command_line(argc, argv, &nprow, &npcol, matrix_type, &n,
+ &relax, &maxsuper,
+ &fill_ratio, &min_gemm_gpu_offload, &nrhs, &fp);
+
+ /* ------------------------------------------------------------
+ INITIALIZE MPI ENVIRONMENT.
+ ------------------------------------------------------------*/
+ MPI_Init( &argc, &argv );
+
+ /* ------------------------------------------------------------
+ INITIALIZE THE SUPERLU PROCESS GRID.
+ ------------------------------------------------------------*/
+ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+ /* Bail out if I do not belong in the grid. */
+ iam = grid.iam;
+ if ( iam >= nprow * npcol ) goto out;
+ if ( 0 ) {
+ printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
+ }
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+ /* Set the default input options. */
+ set_default_options_dist(&options);
+ options.PrintStat = NO;
+
+ if (!iam) {
+ print_sp_ienv_dist(&options);
+ print_options_dist(&options);
+ fflush(stdout);
+ }
+
+ if ( !(berr = doubleMalloc_dist(nrhs)) )
+ ABORT("Malloc fails for berr[].");
+
+ /* Loop through all the input options. */
+ for (imat = fimat; imat < nimat; ++imat) { /* All matrix types */
+ //if (!iam) printf("imat loop ... %d\n", imat);
+ /* ------------------------------------------------------------
+ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+ ------------------------------------------------------------*/
+ dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+ m = A.nrow;
+ n = A.ncol;
+
+ if ( !(bsave = doubleMalloc_dist(ldb * nrhs)) )
+ ABORT("Malloc fails for bsave[]");
+ for (j = 0; j < nrhs; ++j)
+ for (i = 0; i < ldb; ++i) bsave[i+j*ldb] = b[i+j*ldb];
+
+ /* Save a copy of matrix A in Asave. */
+ Astore = (NRformat_loc *) A.Store;
+ int_t nnz_loc = Astore->nnz_loc;
+ int_t m_loc = Astore->m_loc;
+ nzval_save = (double *) doubleMalloc_dist(nnz_loc);
+ colind_save = (int_t *) intMalloc_dist(nnz_loc);
+ rowptr_save = (int_t *) intMalloc_dist(m_loc + 1);
+ dCreate_CompRowLoc_Matrix_dist(&Asave, m, n, nnz_loc, m_loc, Astore->fst_row,
+ nzval_save, colind_save, rowptr_save,
+ SLU_NR_loc, SLU_D, SLU_GE);
+ dCopy_CompRowLoc_Matrix_dist(&A, &Asave);
+
+ for (iequed = 0; iequed < 4; ++iequed) {
+ int what_equil = equils[iequed];
+ if (iequed == 0) nfact = 4;
+ else { /* Only test factored, pre-equilibrated matrix */
+ nfact = 1;
+ options.RowPerm = NOROWPERM; /* Turn off MC64 */
+ }
+ //if (!iam) printf("iequed loop ... %d\n", iequed);
+
+ for (ifact = 0; ifact < nfact; ++ifact) {
+ fact = facts[ifact];
+ options.Fact = fact;
+ //if (!iam) printf("ifact loop ... %d\n", ifact);
+
+ for (equil = 0; equil < 2; ++equil) {
+
+ //if (!iam) printf("equil loop ... %d\n", equil);
+
+ options.Equil = equil;
+ /* Need a first factor */
+ prefact = ( options.Fact == FACTORED ||
+ options.Fact == SamePattern ||
+ options.Fact == SamePattern_SameRowPerm );
+
+ /* Restore the matrix A. */
+ dCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+ /* Initialize ScalePermstruct and LUstruct. */
+ ScalePermstructInit(m, n, &ScalePermstruct);
+ LUstructInit(n, &LUstruct);
+
+ //if ( options.Fact == FACTORED ||
+ // options.Fact == SamePattern_SameRowPerm ) {
+
+ if ( prefact ) {
+
+ R = (double *) SUPERLU_MALLOC(m*sizeof(double));
+ C = (double *) SUPERLU_MALLOC(n*sizeof(double));
+
+ /* Later call to PDGSSVX only needs to solve. */
+ if ( equil || iequed ) {
+ /* Compute row and column scale factors to
+ equilibrate matrix A. */
+ pdgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &iinfo,
+ &grid);
+
+ /* Force equilibration. */
+ if ( iinfo==0 && n > 0 ) {
+ if ( what_equil == ROW ) {
+ rowcnd = 0.;
+ colcnd = 1.;
+ ScalePermstruct.DiagScale = ROW;
+ ScalePermstruct.R = R;
+ } else if ( what_equil == COL ) {
+ rowcnd = 1.;
+ colcnd = 0.;
+ ScalePermstruct.DiagScale = COL;
+ ScalePermstruct.C = C;
+ } else if ( what_equil == BOTH ) {
+ rowcnd = 0.;
+ colcnd = 0.;
+ ScalePermstruct.DiagScale = BOTH;
+ ScalePermstruct.R = R;
+ ScalePermstruct.C = C;
+ }
+ }
+
+ /* Equilibrate the matrix. */
+ pdlaqgs(&A, R, C, rowcnd, colcnd, amax, equed);
+ // printf("after pdlaqgs: *equed %c\n", *equed);
+
+		    /* Do not equilibrate again when calling PDGSSVX,
+		     * so there is no malloc/free of {R,C} inside PDGSSVX. */
+ options.Equil = NO;
+ } /* end if (equil || iequed) */
+ } /* end if prefact */
+
+ if ( prefact ) { /* Need a first factor */
+
+ /* Save Fact option. */
+ fact = options.Fact;
+ options.Fact = DOFACT;
+
+ /* Initialize the statistics variables. */
+ PStatInit(&stat);
+
+ int nrhs1 = 0; /* Only performs factorization */
+ pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs1,
+ &grid, &LUstruct, &SOLVEstruct,
+ berr, &stat, &info);
+
+ if ( info ) {
+ printf("** First factor: nrun %d: fact %d, info %d, "
+ "equil %d, what_equil %d, DiagScale %d \n",
+ nrun, fact, info, equil, what_equil,
+ ScalePermstruct.DiagScale);
+ }
+
+ PStatFree(&stat);
+
+ /* Restore Fact option. */
+ options.Fact = fact;
+ if ( fact == SamePattern ) {
+ // {L,U} not re-used in subsequent call to PDGSSVX.
+ Destroy_LU(n, &grid, &LUstruct);
+ }
+
+ } /* end if .. first time factor */
+
+ /*----------------
+ * Test pdgssvx
+ *----------------*/
+
+ if ( options.Fact != FACTORED ) {
+ /* Restore the matrix A. */
+ dCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+ }
+
+ /* Set the right-hand side. */
+ dCopy_Dense_Matrix_dist(m_loc, nrhs, bsave, ldb, b, ldb);
+
+ PStatInit(&stat);
+
+ /*if ( !iam ) printf("\ttest pdgssvx: nrun %d, iequed %d, equil %d, fact %d\n",
+ nrun, iequed, equil, options.Fact);*/
+ /* Testing PDGSSVX: solve and compute the error bounds. */
+ pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ &grid, &LUstruct, &SOLVEstruct,
+ berr, &stat, &info);
+
+ PStatFree(&stat);
+#if 0
+ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+ nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+ /* if ( info && info != izero ) {*/
+ if ( info ) {
+ printf(FMT3, "pdgssvx",info,izero,n,nrhs,imat,nfail);
+ } else {
+ /* Restore the matrix A. */
+ dCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+ /* Compute residual of the computed solution.*/
+ solx = b;
+ pdcompute_resid(m, n, nrhs, &A, solx, ldx, bsave, ldb,
+ &grid, &SOLVEstruct, &result[0]);
+
+#if 0 /* how to get RCOND? */
+ /* Check solution accuracy from generated exact solution. */
+ dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
+ &result[2]);
+ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+ nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+
+ /* Print information about the tests that did
+ not pass the threshold. */
+ int k1 = 0;
+ for (i = k1; i < NTESTS; ++i) {
+ if ( result[i] >= THRESH ) {
+ printf(FMT2, "pdgssvx", options.Fact,
+ ScalePermstruct.DiagScale,
+ n, imat, i, result[i], berr[0]);
+ ++nfail;
+ }
+ }
+ nrun += NTESTS;
+ } /* end else .. info == 0 */
+
+ /* -----------------------------------------------------
+ Deallocate storage associated with {L,U}.
+ ----------------------------------------------------- */
+ if ( prefact ) {
+ SUPERLU_FREE(R);
+ SUPERLU_FREE(C);
+ ScalePermstruct.DiagScale = NOEQUIL; /* Avoid free R/C again. */
+ }
+ ScalePermstructFree(&ScalePermstruct);
+ Destroy_LU(n, &grid, &LUstruct);
+ LUstructFree(&LUstruct);
+ if ( options.SolveInitialized ) {
+ dSolveFinalize(&options, &SOLVEstruct);
+ }
+
+ } /* end for equil ... */
+
+ } /* end for ifact ... */
+
+ } /* end for iequed ... */
+
+ /* ------------------------------------------------------------
+ DEALLOCATE STORAGE.
+ ------------------------------------------------------------*/
+ Destroy_CompRowLoc_Matrix_dist(&A);
+ Destroy_CompRowLoc_Matrix_dist(&Asave);
+ // ScalePermstructFree(&ScalePermstruct);
+ SUPERLU_FREE(b);
+ SUPERLU_FREE(bsave);
+ SUPERLU_FREE(xtrue);
+
+ } /* end for imat ... */
+
+ /* Print a summary of the testing results. */
+ if ( iam==0 ) PrintSumm("DGS", nfail, nrun, nerrs);
+
+ SUPERLU_FREE(berr);
+
+ /* ------------------------------------------------------------
+ RELEASE THE SUPERLU PROCESS GRID.
+ ------------------------------------------------------------*/
+out:
+ superlu_gridexit(&grid);
+
+ /* ------------------------------------------------------------
+ TERMINATES THE MPI EXECUTION ENVIRONMENT.
+ ------------------------------------------------------------*/
+ MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+/*
+ * Parse command line options to get various input parameters.
+ */
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+ char *matrix_type, int *n, int *relax, int *maxsuper,
+ int *fill_ratio, int *min_gemm_gpu_offload,
+ int *nrhs, FILE **fp)
+{
+ int c;
+ extern char *optarg;
+ char str[20];
+
+ while ( (c = getopt(argc, argv, "hr:c:t:n:x:m:b:g:s:f:")) != EOF ) {
+ switch (c) {
+ case 'h':
+ printf("Options:\n");
+ printf("\t-r <int> - process rows\n");
+ printf("\t-c <int> - process columns\n");
+ printf("\t-n <int> - matrix dimension\n");
+ printf("\t-x <int> - granularity of relaxed supernodes\n");
+ printf("\t-m <int> - maximum size of supernode\n");
+ printf("\t-b <int> - estimated fill ratio to allocate storage\n");
+ printf("\t-g <int> - minimum size of GEMM to offload to GPU\n");
+ printf("\t-s <int> - number of right-hand sides\n");
+ printf("\t-f <char[]> - file name storing a sparse matrix\n");
+ exit(1);
+ break;
+ case 'r': *nprow = atoi(optarg);
+ break;
+ case 'c': *npcol = atoi(optarg);
+ break;
+ case 'n': *n = atoi(optarg);
+ break;
+ case 'x': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("NREL", str, 1);
+ //printf("Reset relax env. variable to %d\n", c);
+ break;
+ case 'm': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("NSUP", str, 1);
+ //printf("Reset maxsuper env. variable to %d\n", c);
+ break;
+ case 'b': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("FILL", str, 1);
+ //printf("Reset fill_ratio env. variable to %d\n", c);
+ break;
+ case 'g': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("N_GEMM", str, 1);
+ //printf("Reset min_gemm_gpu_offload env. variable to %d\n", c);
+ break;
+ case 's': *nrhs = atoi(optarg);
+ break;
+ case 'f':
+ if ( !(*fp = fopen(optarg, "r")) ) {
+ ABORT("File does not exist");
+ }
+ //printf(".. test sparse matrix in file: %s\n", optarg);
+ break;
+ }
+ }
+}
+
+int cpp_defs()
+{
+ printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+ printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+ printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+ printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+ printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+ printf("....\n");
+ return 0;
+}
diff --git a/TEST/pdtest.sh b/TEST/pdtest.sh
new file mode 100755
index 0000000..8ca51a3
--- /dev/null
+++ b/TEST/pdtest.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# bash hint: == is for string comparisons, -eq is for numeric ones.
+
+ofile=pdtest.out # output file
+if [ -e $ofile ]; then
+ rm -f $ofile
+fi
+echo "Double-precision testing output" > $ofile
+
+MATRICES=(../EXAMPLE/g20.rua)
+NPROWS="1 2"
+NPCOLS="1 3"
+NVAL="9 19"
+NRHS="1 3"
+FILLRATIO="2 6"
+# following are blocking parameters, see sp_ienv.c
+RELAX="4 8"
+SUPERSIZE="10 20"
+MINGEMM="10000"
+
+##
+# Loop through all matrices ...
+#
+for mat in "${MATRICES[@]}"; do
+
+ #--------------------------------------------
+ # Test matrix types generated in LAPACK-style
+ #--------------------------------------------
+ if [ "$mat" == "LAPACK" ]; then
+ echo '== LAPACK test matrices' >> $ofile
+ for n in $NVAL ; do
+ for s in $NRHS ; do
+ echo '' >> $ofile
+ echo 'n='$n 'nrhs='$s >> $ofile
+ mpiexec -n 2 pdtest -r 1 -c 2 -x 4 -m 10 -b 5 -s 1 >> $ofile
+ done
+ done
+ #--------------------------------------------
+ # Test a specified sparse matrix
+ #--------------------------------------------
+ else
+ echo '' >> $ofile
+    echo '== sparse matrix:' $mat >> $ofile
+ for s in $NRHS; do
+ for r in $NPROWS; do
+ for c in $NPCOLS; do
+ np=$(($r*$c))
+ for b in $FILLRATIO; do
+ for x in $RELAX; do
+ for m in $SUPERSIZE; do
+ echo '' >> $ofile
+ echo "**-- nrhs = $s, process grid = $r X $c, fill $b, relax $x, max-super $m"
+ echo "**-- nrhs = $s, process grid = $r X $c, fill $b, relax $x, max-super $m" >> $ofile
+ mpiexec -n $np pdtest -r $r -c $c -x $x -m $m -b $b -s 1 -f $mat >> $ofile
+ done
+ done
+ done
+ done
+ done
+ done
+ fi
+done
+
diff --git a/TEST/pzcompute_resid.c b/TEST/pzcompute_resid.c
new file mode 100644
index 0000000..0c29fac
--- /dev/null
+++ b/TEST/pzcompute_resid.c
@@ -0,0 +1,154 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Test for small residual.
+ *
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ *
+ */
+#include "superlu_zdefs.h"
+
+int pzcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+ doublecomplex *x, int ldx, doublecomplex *b, int ldb,
+ gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid)
+{
+/*
+ Purpose
+ =======
+
+ PZCOMPUTE_RESID computes the residual for a solution of a system of linear
+ equations A*x = b or A'*x = b:
+ RESID = norm(B - A*X) / ( norm(A) * norm(X) * EPS ),
+ where EPS is the machine epsilon.
+
+ Arguments
+ =========
+
+ M (input) INTEGER
+ The number of rows of the matrix A. M >= 0.
+
+ N (input) INTEGER
+ The number of columns of the matrix A. N >= 0.
+
+ NRHS (input) INTEGER
+ The number of columns of B, the matrix of right hand sides.
+ NRHS >= 0.
+
+ A (input/output) SuperMatrix*
+ The original M x N sparse matrix A.
+ On exit, the column indices are modified due to SPMV setup.
+
+ X (input) DOUBLE COMPLEX PRECISION array, dimension (LDX,NRHS)
+ The computed solution vectors for the system of linear
+ equations.
+
+ LDX (input) INTEGER
+ The leading dimension of the array X. If TRANS = NOTRANS,
+ LDX >= max(1,N); if TRANS = TRANS or CONJ, LDX >= max(1,M).
+
+ B (input/output) DOUBLE COMPLEX PRECISION array, dimension (LDB,NRHS)
+ On entry, the right hand side vectors for the system of
+ linear equations.
+ On exit, B is overwritten with the difference B - A*X.
+
+ LDB (input) INTEGER
+ The leading dimension of the array B. IF TRANS = NOTRANS,
+ LDB >= max(1,M); if TRANS = TRANS or CONJ, LDB >= max(1,N).
+
+ SOLVEstruct (input) SOLVEstruct_t*
+
+ GRID (input) gridinfo_t*
+
+ RESID (output) double PRECISION
+ The maximum over the number of right-hand sides of
+ norm(B - A*X) / ( norm(A) * norm(X) * EPS ).
+
+ =====================================================================
+*/
+
+ /* Table of constant values */
+ int inc = 1;
+
+ /* Local variables */
+ int i, j;
+ double anorm, rnorm, rnorm_g;
+ double xnorm, xnorm_g;
+ double eps;
+ char transc[1];
+ doublecomplex *ax, *R;
+ pzgsmv_comm_t gsmv_comm;
+ int m_loc = ((NRformat_loc*) A->Store)->m_loc;
+
+ /* Function prototypes */
+ extern double dzasum_(int *, doublecomplex *, int *);
+
+ /* Function Body */
+ if ( m <= 0 || n <= 0 || nrhs == 0) {
+ *resid = 0.;
+ return 0;
+ }
+
+ /* Exit with RESID = 1/EPS if ANORM = 0. */
+ eps = dmach_dist("Epsilon");
+ anorm = pzlangs("1", A, grid);
+ if (anorm <= 0.) {
+ *resid = 1. / eps;
+ return 0;
+ }
+
+ if ( !(ax = doublecomplexMalloc_dist(m_loc)) ) ABORT("Malloc fails for work[]");
+ R = ax;
+
+ /* A is modified with colind[] permuted to [internal, external]. */
+ pzgsmv_init(A, SOLVEstruct->row_to_proc, grid, &gsmv_comm);
+
+ /* Compute the maximum over the number of right-hand sides of
+ norm(B - A*X) / ( norm(A) * norm(X) * EPS ) . */
+ *resid = 0.;
+ for (j = 0; j < nrhs; ++j) {
+ doublecomplex *B_col = &b[j*ldb];
+ doublecomplex *X_col = &x[j*ldx];
+
+ /* Compute residual R = B - op(A) * X,
+ where op(A) = A, A**T, or A**H, depending on TRANS. */
+ /* Matrix-vector multiply. */
+ pzgsmv(0, A, grid, &gsmv_comm, X_col, ax);
+
+ /* Compute residual, stored in R[]. */
+ for (i = 0; i < m_loc; ++i) z_sub(&R[i], &B_col[i], &ax[i]);
+
+ rnorm = dzasum_(&m_loc, R, &inc);
+ xnorm = dzasum_(&m_loc, X_col, &inc);
+
+ /* */
+ MPI_Allreduce( &rnorm, &rnorm_g, 1, MPI_DOUBLE, MPI_SUM, grid->comm );
+ MPI_Allreduce( &xnorm, &xnorm_g, 1, MPI_DOUBLE, MPI_SUM, grid->comm );
+
+ if (xnorm_g <= 0.) {
+ *resid = 1. / eps;
+ } else {
+ /* Computing MAX */
+ double d1, d2;
+ d1 = *resid;
+ d2 = rnorm_g / anorm / xnorm_g / eps;
+ *resid = SUPERLU_MAX(d1, d2);
+ }
+ } /* end for j ... */
+
+ pzgsmv_finalize(&gsmv_comm);
+ SUPERLU_FREE(ax);
+
+ return 0;
+
+} /* pzcompute_resid */
diff --git a/TEST/pztest.c b/TEST/pztest.c
new file mode 100644
index 0000000..dadb503
--- /dev/null
+++ b/TEST/pztest.c
@@ -0,0 +1,518 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Driver program for testing PZGSSVX.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.2) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 30, 2017
+ * </pre>
+ */
+/*
+ * File name: pztest.c
+ * Purpose: MAIN test program
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <math.h>
+#include "superlu_zdefs.h"
+
+#define NTESTS 1 /*5*/ /* Number of test types */
+#define NTYPES 11 /* Number of matrix types */
+#define NTRAN 2
+#define THRESH 20.0
+#define FMT1 "%10s:n=%d, test(%d)=%12.5g\n"
+#define FMT2 "%10s:fact=%4d, DiagScale=%d, n=%d, imat=%d, test(%d)=%12.5g, berr=%12.5g\n"
+#define FMT3 "%10s:info=%d, izero=%d, n=%d, nrhs=%d, imat=%d, nfail=%d\n"
+
+
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+ char *matrix_type, int *n, int *relax, int *maxsuper,
+ int *fill_ratio, int *min_gemm_gpu_offload,
+ int *nrhs, FILE **fp);
+
+extern int
+pzcompute_resid(int m, int n, int nrhs, SuperMatrix *A,
+ doublecomplex *x, int ldx, doublecomplex *b, int ldb,
+ gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct, double *resid);
+
+/*! \brief Copy matrix A into matrix B, in distributed compressed row format. */
+void
+zCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+ NRformat_loc *Astore;
+ NRformat_loc *Bstore;
+ int_t i, nnz_loc, m_loc;
+
+ B->Stype = A->Stype;
+ B->Dtype = A->Dtype;
+ B->Mtype = A->Mtype;
+    B->nrow = A->nrow;
+ B->ncol = A->ncol;
+ Astore = (NRformat_loc *) A->Store;
+ Bstore = (NRformat_loc *) B->Store;
+ Bstore->nnz_loc = Astore->nnz_loc;
+ nnz_loc = Astore->nnz_loc;
+ Bstore->m_loc = Astore->m_loc;
+ m_loc = Astore->m_loc;
+ Bstore->fst_row = Astore->fst_row;
+ memcpy(Bstore->nzval, Astore->nzval, nnz_loc * sizeof(doublecomplex));
+ memcpy(Bstore->colind, Astore->colind, nnz_loc * sizeof(int_t));
+ memcpy(Bstore->rowptr, Astore->rowptr, (m_loc+1) * sizeof(int_t));
+}
+
+/*! \brief Print a summary of the testing results. */
+void
+PrintSumm(char *type, int nfail, int nrun, int nerrs)
+{
+ if ( nfail > 0 )
+ printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
+ type, nfail, nrun);
+ else
+ printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
+
+ if ( nerrs > 0 )
+ printf("%6d error messages recorded\n", nerrs);
+}
+
+int main(int argc, char *argv[])
+{
+/*
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZTEST is the main test program for the DOUBLE COMPLEX linear
+ * equation driver routines PZGSSVX.
+ *
+ * The program is invoked by a shell script file -- ztest.csh.
+ * The output from the tests is written into a file -- ztest.out.
+ */
+ superlu_dist_options_t options;
+ SuperLUStat_t stat;
+ SuperMatrix A, Asave;
+ NRformat_loc *Astore;
+ ScalePermstruct_t ScalePermstruct;
+ LUstruct_t LUstruct;
+ SOLVEstruct_t SOLVEstruct;
+ gridinfo_t grid;
+ doublecomplex *nzval_save;
+ int_t *colind_save, *rowptr_save;
+ double *berr, *R, *C;
+ doublecomplex *b, *bsave, *xtrue, *solx;
+ int i, j, m, n, izero = 0;
+ int nprow, npcol;
+ int iam, info, ldb, ldx, nrhs;
+ int_t iinfo;
+ char **cpp, c;
+ FILE *fp, *fopen();
+ char matrix_type[8], equed[1];
+ int relax, maxsuper=sp_ienv_dist(3), fill_ratio=sp_ienv_dist(6),
+ min_gemm_gpu_offload=0;
+ int equil, ifact, nfact, iequil, iequed, prefact, notfactored;
+ int nt, nrun=0, nfail=0, nerrs=0, imat, fimat=0, nimat=1;
+ fact_t fact;
+ double rowcnd, colcnd, amax;
+ double result[NTESTS];
+
+ /* Fixed set of parameters */
+ int iseed[] = {1988, 1989, 1990, 1991};
+ char equeds[] = {'N', 'R', 'C', 'B'};
+ DiagScale_t equils[] = {NOEQUIL, ROW, COL, BOTH};
+ fact_t facts[] = {FACTORED, DOFACT, SamePattern, SamePattern_SameRowPerm};
+ trans_t transs[] = {NOTRANS, TRANS, CONJ};
+
+ nprow = 1; /* Default process rows. */
+ npcol = 1; /* Default process columns. */
+ nrhs = 1; /* Number of right-hand side. */
+ for (i = 0; i < NTESTS; ++i) result[i] = 0.0;
+
+ /* Parse command line argv[]. */
+ parse_command_line(argc, argv, &nprow, &npcol, matrix_type, &n,
+ &relax, &maxsuper,
+ &fill_ratio, &min_gemm_gpu_offload, &nrhs, &fp);
+
+ /* ------------------------------------------------------------
+ INITIALIZE MPI ENVIRONMENT.
+ ------------------------------------------------------------*/
+ MPI_Init( &argc, &argv );
+
+ /* ------------------------------------------------------------
+ INITIALIZE THE SUPERLU PROCESS GRID.
+ ------------------------------------------------------------*/
+ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+ /* Bail out if I do not belong in the grid. */
+ iam = grid.iam;
+ if ( iam >= nprow * npcol ) goto out;
+ if ( 0 ) {
+ printf("\tProcess grid\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+ fflush(stdout);
+ }
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+ /* Set the default input options. */
+ set_default_options_dist(&options);
+ options.PrintStat = NO;
+
+ if (!iam) {
+ print_sp_ienv_dist(&options);
+ print_options_dist(&options);
+ fflush(stdout);
+ }
+
+ if ( !(berr = doubleMalloc_dist(nrhs)) )
+ ABORT("Malloc fails for berr[].");
+
+ /* Loop through all the input options. */
+ for (imat = fimat; imat < nimat; ++imat) { /* All matrix types */
+ //if (!iam) printf("imat loop ... %d\n", imat);
+ /* ------------------------------------------------------------
+ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+ ------------------------------------------------------------*/
+ zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid);
+
+ m = A.nrow;
+ n = A.ncol;
+
+ if ( !(bsave = doublecomplexMalloc_dist(ldb * nrhs)) )
+ ABORT("Malloc fails for bsave[]");
+ for (j = 0; j < nrhs; ++j)
+ for (i = 0; i < ldb; ++i) bsave[i+j*ldb] = b[i+j*ldb];
+
+ /* Save a copy of matrix A in Asave. */
+ Astore = (NRformat_loc *) A.Store;
+ int_t nnz_loc = Astore->nnz_loc;
+ int_t m_loc = Astore->m_loc;
+ nzval_save = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc);
+ colind_save = (int_t *) intMalloc_dist(nnz_loc);
+ rowptr_save = (int_t *) intMalloc_dist(m_loc + 1);
+ zCreate_CompRowLoc_Matrix_dist(&Asave, m, n, nnz_loc, m_loc, Astore->fst_row,
+ nzval_save, colind_save, rowptr_save,
+				   SLU_NR_loc, SLU_Z, SLU_GE);
+ zCopy_CompRowLoc_Matrix_dist(&A, &Asave);
+
+ for (iequed = 0; iequed < 4; ++iequed) {
+ int what_equil = equils[iequed];
+ if (iequed == 0) nfact = 4;
+ else { /* Only test factored, pre-equilibrated matrix */
+ nfact = 1;
+ options.RowPerm = NOROWPERM; /* Turn off MC64 */
+ }
+ //if (!iam) printf("iequed loop ... %d\n", iequed);
+
+ for (ifact = 0; ifact < nfact; ++ifact) {
+ fact = facts[ifact];
+ options.Fact = fact;
+ //if (!iam) printf("ifact loop ... %d\n", ifact);
+
+ for (equil = 0; equil < 2; ++equil) {
+
+ //if (!iam) printf("equil loop ... %d\n", equil);
+
+ options.Equil = equil;
+ /* Need a first factor */
+ prefact = ( options.Fact == FACTORED ||
+ options.Fact == SamePattern ||
+ options.Fact == SamePattern_SameRowPerm );
+
+ /* Restore the matrix A. */
+ zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+ /* Initialize ScalePermstruct and LUstruct. */
+ ScalePermstructInit(m, n, &ScalePermstruct);
+ LUstructInit(n, &LUstruct);
+
+ //if ( options.Fact == FACTORED ||
+ // options.Fact == SamePattern_SameRowPerm ) {
+
+ if ( prefact ) {
+
+ R = (double *) SUPERLU_MALLOC(m*sizeof(double));
+ C = (double *) SUPERLU_MALLOC(n*sizeof(double));
+
+ /* Later call to PZGSSVX only needs to solve. */
+ if ( equil || iequed ) {
+ /* Compute row and column scale factors to
+ equilibrate matrix A. */
+ pzgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &iinfo,
+ &grid);
+
+ /* Force equilibration. */
+ if ( iinfo==0 && n > 0 ) {
+ if ( what_equil == ROW ) {
+ rowcnd = 0.;
+ colcnd = 1.;
+ ScalePermstruct.DiagScale = ROW;
+ ScalePermstruct.R = R;
+ } else if ( what_equil == COL ) {
+ rowcnd = 1.;
+ colcnd = 0.;
+ ScalePermstruct.DiagScale = COL;
+ ScalePermstruct.C = C;
+ } else if ( what_equil == BOTH ) {
+ rowcnd = 0.;
+ colcnd = 0.;
+ ScalePermstruct.DiagScale = BOTH;
+ ScalePermstruct.R = R;
+ ScalePermstruct.C = C;
+ }
+ }
+
+ /* Equilibrate the matrix. */
+ pzlaqgs(&A, R, C, rowcnd, colcnd, amax, equed);
+ // printf("after pdlaqgs: *equed %c\n", *equed);
+
+		    /* Do not equilibrate again when calling PZGSSVX,
+		     * so there is no malloc/free of {R,C} inside PZGSSVX. */
+ options.Equil = NO;
+ } /* end if (equil || iequed) */
+ } /* end if prefact */
+
+ if ( prefact ) { /* Need a first factor */
+
+ /* Save Fact option. */
+ fact = options.Fact;
+ options.Fact = DOFACT;
+
+ /* Initialize the statistics variables. */
+ PStatInit(&stat);
+
+ int nrhs1 = 0; /* Only performs factorization */
+ pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs1,
+ &grid, &LUstruct, &SOLVEstruct,
+ berr, &stat, &info);
+
+ if ( info ) {
+ printf("** First factor: nrun %d: fact %d, info %d, "
+ "equil %d, what_equil %d, DiagScale %d \n",
+ nrun, fact, info, equil, what_equil,
+ ScalePermstruct.DiagScale);
+ }
+
+ PStatFree(&stat);
+
+ /* Restore Fact option. */
+ options.Fact = fact;
+ if ( fact == SamePattern ) {
+			// {L,U} not re-used in subsequent call to PZGSSVX.
+ Destroy_LU(n, &grid, &LUstruct);
+ }
+
+ } /* end if .. first time factor */
+
+ /*----------------
+ * Test pzgssvx
+ *----------------*/
+
+ if ( options.Fact != FACTORED ) {
+ /* Restore the matrix A. */
+ zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+ }
+
+ /* Set the right-hand side. */
+ zCopy_Dense_Matrix_dist(m_loc, nrhs, bsave, ldb, b, ldb);
+
+ PStatInit(&stat);
+
+ /*if ( !iam ) printf("\ttest pdgssvx: nrun %d, iequed %d, equil %d, fact %d\n",
+ nrun, iequed, equil, options.Fact);*/
+		    /* Testing PZGSSVX: solve and compute the error bounds. */
+ pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ &grid, &LUstruct, &SOLVEstruct,
+ berr, &stat, &info);
+
+ PStatFree(&stat);
+#if 0
+ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+ nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+ /* if ( info && info != izero ) {*/
+ if ( info ) {
+ printf(FMT3, "pzgssvx",info,izero,n,nrhs,imat,nfail);
+ } else {
+ /* Restore the matrix A. */
+ zCopy_CompRowLoc_Matrix_dist(&Asave, &A);
+
+ /* Compute residual of the computed solution.*/
+ solx = b;
+ pzcompute_resid(m, n, nrhs, &A, solx, ldx, bsave, ldb,
+ &grid, &SOLVEstruct, &result[0]);
+
+#if 0 /* how to get RCOND? */
+ /* Check solution accuracy from generated exact solution. */
+ dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
+ &result[2]);
+ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+ nrhs, b, ldb, xtrue, ldx, &grid);
+#endif
+
+ /* Print information about the tests that did
+ not pass the threshold. */
+ int k1 = 0;
+ for (i = k1; i < NTESTS; ++i) {
+ if ( result[i] >= THRESH ) {
+ printf(FMT2, "pzgssvx", options.Fact,
+ ScalePermstruct.DiagScale,
+ n, imat, i, result[i], berr[0]);
+ ++nfail;
+ }
+ }
+ nrun += NTESTS;
+ } /* end else .. info == 0 */
+
+ /* -----------------------------------------------------
+ Deallocate storage associated with {L,U}.
+ ----------------------------------------------------- */
+ if ( prefact ) {
+ SUPERLU_FREE(R);
+ SUPERLU_FREE(C);
+ ScalePermstruct.DiagScale = NOEQUIL; /* Avoid free R/C again. */
+ }
+ ScalePermstructFree(&ScalePermstruct);
+ Destroy_LU(n, &grid, &LUstruct);
+ LUstructFree(&LUstruct);
+ if ( options.SolveInitialized ) {
+ zSolveFinalize(&options, &SOLVEstruct);
+ }
+
+ } /* end for equil ... */
+
+ } /* end for ifact ... */
+
+ } /* end for iequed ... */
+
+ /* ------------------------------------------------------------
+ DEALLOCATE STORAGE.
+ ------------------------------------------------------------*/
+ Destroy_CompRowLoc_Matrix_dist(&A);
+ Destroy_CompRowLoc_Matrix_dist(&Asave);
+ // ScalePermstructFree(&ScalePermstruct);
+ SUPERLU_FREE(b);
+ SUPERLU_FREE(bsave);
+ SUPERLU_FREE(xtrue);
+
+ } /* end for imat ... */
+
+ /* Print a summary of the testing results. */
+ if ( iam==0 ) PrintSumm("DGS", nfail, nrun, nerrs);
+
+ SUPERLU_FREE(berr);
+
+ /* ------------------------------------------------------------
+ RELEASE THE SUPERLU PROCESS GRID.
+ ------------------------------------------------------------*/
+out:
+ superlu_gridexit(&grid);
+
+ /* ------------------------------------------------------------
+ TERMINATES THE MPI EXECUTION ENVIRONMENT.
+ ------------------------------------------------------------*/
+ MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+/*
+ * Parse command line options to get various input parameters.
+ */
+static void
+parse_command_line(int argc, char *argv[], int *nprow, int *npcol,
+ char *matrix_type, int *n, int *relax, int *maxsuper,
+ int *fill_ratio, int *min_gemm_gpu_offload,
+ int *nrhs, FILE **fp)
+{
+ int c;
+ extern char *optarg;
+ char str[20];
+
+ while ( (c = getopt(argc, argv, "hr:c:t:n:x:m:b:g:s:f:")) != EOF ) {
+ switch (c) {
+ case 'h':
+ printf("Options:\n");
+ printf("\t-r <int> - process rows\n");
+ printf("\t-c <int> - process columns\n");
+ printf("\t-n <int> - matrix dimension\n");
+ printf("\t-x <int> - granularity of relaxed supernodes\n");
+ printf("\t-m <int> - maximum size of supernode\n");
+ printf("\t-b <int> - estimated fill ratio to allocate storage\n");
+ printf("\t-g <int> - minimum size of GEMM to offload to GPU\n");
+ printf("\t-s <int> - number of right-hand sides\n");
+ printf("\t-f <char[]> - file name storing a sparse matrix\n");
+ exit(1);
+ break;
+ case 'r': *nprow = atoi(optarg);
+ break;
+ case 'c': *npcol = atoi(optarg);
+ break;
+ case 'n': *n = atoi(optarg);
+ break;
+ case 'x': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("NREL", str, 1);
+ //printf("Reset relax env. variable to %d\n", c);
+ break;
+ case 'm': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("NSUP", str, 1);
+ //printf("Reset maxsuper env. variable to %d\n", c);
+ break;
+ case 'b': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("FILL", str, 1);
+ //printf("Reset fill_ratio env. variable to %d\n", c);
+ break;
+ case 'g': c = atoi(optarg);
+ sprintf(str, "%d", c);
+ setenv("N_GEMM", str, 1);
+ //printf("Reset min_gemm_gpu_offload env. variable to %d\n", c);
+ break;
+ case 's': *nrhs = atoi(optarg);
+ break;
+ case 'f':
+ if ( !(*fp = fopen(optarg, "r")) ) {
+ ABORT("File does not exist");
+ }
+ //printf(".. test sparse matrix in file: %s\n", optarg);
+ break;
+ }
+ }
+}
+
+int cpp_defs()
+{
+ printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+ printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+ printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+ printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+ printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+ printf("....\n");
+ return 0;
+}
diff --git a/TEST/pztest.sh b/TEST/pztest.sh
new file mode 100755
index 0000000..d7956aa
--- /dev/null
+++ b/TEST/pztest.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# bash hint: == is for string comparisons, -eq is for numeric ones.
+
+ofile=pztest.out # output file
+if [ -e $ofile ]; then
+ rm -f $ofile
+fi
+echo "Double-complex testing output" > $ofile
+
+MATRICES=(../EXAMPLE/cg20.cua)
+NPROWS="1 2"
+NPCOLS="1 3"
+NVAL="9 19"
+NRHS="1 3"
+FILLRATIO="2 6"
+# following are blocking parameters, see sp_ienv.c
+RELAX="4 8"
+SUPERSIZE="10 20"
+MINGEMM="10000"
+
+##
+# Loop through all matrices ...
+#
+for mat in "${MATRICES[@]}"; do
+
+ #--------------------------------------------
+ # Test matrix types generated in LAPACK-style
+ #--------------------------------------------
+ if [ "$mat" == "LAPACK" ]; then
+ echo '== LAPACK test matrices' >> $ofile
+ for n in $NVAL ; do
+ for s in $NRHS ; do
+ echo '' >> $ofile
+ echo 'n='$n 'nrhs='$s >> $ofile
+ mpiexec -n 2 pztest -r 1 -c 2 -x 4 -m 10 -b 5 -n $n -s $s >> $ofile
+ done
+ done
+ #--------------------------------------------
+ # Test a specified sparse matrix
+ #--------------------------------------------
+ else
+ echo '' >> $ofile
+ echo '== sparse matrix:' $mat >> $ofile
+ for s in $NRHS; do
+ for r in $NPROWS; do
+ for c in $NPCOLS; do
+ np=$(($r*$c))
+ for b in $FILLRATIO; do
+ for x in $RELAX; do
+ for m in $SUPERSIZE; do
+ echo '' >> $ofile
+ echo "**-- nrhs = $s, process grid = $r X $c, fill $b, relax $x, max-super $m"
+ echo "**-- nrhs = $s, process grid = $r X $c, fill $b, relax $x, max-super $m" >> $ofile
+ mpiexec -n $np pztest -r $r -c $c -x $x -m $m -b $b -s $s -f $mat >> $ofile
+ done
+ done
+ done
+ done
+ done
+ done
+ fi
+done
+
diff --git a/TEST/runtest.cmake b/TEST/runtest.cmake
new file mode 100644
index 0000000..93ae6a1
--- /dev/null
+++ b/TEST/runtest.cmake
@@ -0,0 +1,13 @@
+# execute the test command that was added earlier.
+execute_process( COMMAND ${TEST}
+ OUTPUT_FILE ${OUTPUT}
+ RESULT_VARIABLE RET )
+file(APPEND ${ALL_OUTPUT} ${HEADING})
+file(APPEND ${ALL_OUTPUT} "\n")
+file(READ ${OUTPUT} SINGLE_OUTPUT)
+file(APPEND ${ALL_OUTPUT} "${SINGLE_OUTPUT}")
+#file(REMOVE ${OUTPUT}) # remove the individual output file.
+
+if (NOT "${RET}" STREQUAL "0")
+ message (FATAL_ERROR "TEST FAILED!")
+endif()
diff --git a/EXAMPLE/zcreate_matrix.c b/TEST/zcreate_matrix.c
similarity index 97%
copy from EXAMPLE/zcreate_matrix.c
copy to TEST/zcreate_matrix.c
index 87774cf..8660143 100644
--- a/EXAMPLE/zcreate_matrix.c
+++ b/TEST/zcreate_matrix.c
@@ -88,9 +88,14 @@ int zcreate_matrix(SuperMatrix *A, int nrhs, doublecomplex **rhs,
#endif
if ( !iam ) {
+ double t = SuperLU_timer_();
+
/* Read the matrix stored on disk in Harwell-Boeing format. */
zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+ printf("Time to read and distribute matrix %.2f\n",
+ SuperLU_timer_() - t); fflush(stdout);
+
/* Broadcast matrix A to the other PEs. */
MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm );
MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm );
diff --git a/compile.out b/compile.out
new file mode 100644
index 0000000..590cd80
--- /dev/null
+++ b/compile.out
@@ -0,0 +1,62 @@
+( cd SRC; make )
+make[1]: Entering directory `/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_5.2.2/SRC'
+printf "/* #define XSDK_INDEX_SIZE 64 */\n" > superlu_dist_config.h
+printf "#if (XSDK_INDEX_SIZE == 64)\n#define _LONGINT 1\n#endif\n" >> superlu_dist_config.h
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dlangs_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dgsequ_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dlaqgs_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dutil_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dmemory_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dmyblas2_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dsp_blas2_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dsp_blas3_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgssvx.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgssvx_ABglobal.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dreadhb.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dreadrb.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dreadtriple.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dreadMM.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgsequ.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdlaqgs.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dldperm_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdlangs.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdutil.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdsymbfact_distdata.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c ddistribute.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pddistribute.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgstrf.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgstrf2.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdGetDiagU.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgstrs.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgstrs1.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgstrs_lsum.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgstrs_Bglobal.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgsrfs.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgsmv.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgsrfs_ABXglobal.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pdgsmv_AXglobal.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c sp_ienv.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c etree.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c sp_colorder.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c get_perm_c.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c colamd.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c mmd.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c comm.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c memory.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c util.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c superlu_grid.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c pxerr_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c superlu_timer.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c symbfact.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c psymbfact.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c psymbfact_util.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c get_perm_c_parmetis.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c mc64ad_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c static_schedule.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c xerr_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c smach_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c dmach_dist.c
+/home/xiaoye/mpich-install/bin/mpicc -O3 -DNDEBUG -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/64-bit/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64 -c superlu_dist_version.c
+/usr/bin/ar cr /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_5.2.2/ssg1-build/SRC/libsuperlu_dist.a \
+ dlangs_dist.o dgsequ_dist.o dlaqgs_dist.o dutil_dist.o dmemory_dist.o dmyblas2_dist.o dsp_blas2_dist.o dsp_blas3_dist.o pdgssvx.o pdgssvx_ABglobal.o dreadhb.o dreadrb.o dreadtriple.o dreadMM.o pdgsequ.o pdlaqgs.o dldperm_dist.o pdlangs.o pdutil.o pdsymbfact_distdata.o ddistribute.o pddistribute.o pdgstrf.o pdgstrf2.o pdGetDiagU.o pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o sp_ienv.o etree.o sp_colorder.o get_perm_c.o c [...]
+make[1]: Leaving directory `/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_5.2.2/SRC'
diff --git a/MAKE_INC/make.mpich b/make.inc
similarity index 51%
copy from MAKE_INC/make.mpich
copy to make.inc
index 559a086..4ca856f 100644
--- a/MAKE_INC/make.mpich
+++ b/make.inc
@@ -8,26 +8,17 @@
#
# Creation date: March 1, 2016 version 5.0.0
#
-# Modified:
+# Modified: October 13, 2017 version 5.2.1
#
#
############################################################################
#
# The name of the libraries to be created/linked to
#
-VERSION = 5.1.3
-SuperLUroot = /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}
-DSUPERLULIB = $(SuperLUroot)/lib/libsuperlu_dist.a
+SuperLUroot = /home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_5.2.2
+DSUPERLULIB = $(SuperLUroot)/SRC/libsuperlu_dist.a
-# BLASDEF = -DUSE_VENDOR_BLAS
-
-PARMETIS_DIR := ${HOME}/lib/static/parmetis-4.0.3
-I_PARMETIS := -I${PARMETIS_DIR}/include -I${PARMETIS_DIR}/metis/include
-METISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libmetis -lmetis
-PARMETISLIB := -L${PARMETIS_DIR}/build/Linux-x86_64/libparmetis -lparmetis
-
-LIBS = $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so \
- ${PARMETISLIB} ${METISLIB}
+LIBS = $(DSUPERLULIB) /usr/lib/libf77blas.so /usr/lib/libatlas.so /home/xiaoye/lib/static/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a /home/xiaoye/lib/static/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a
#
# The archiver and the flag(s) to use when building archive (library)
@@ -38,11 +29,12 @@ ARCHFLAGS = cr
RANLIB = /usr/bin/ranlib
CC = /home/xiaoye/mpich-install/bin/mpicc
-CFLAGS = -DNDEBUG -DUSE_VENDOR_BLAS -DAdd_ -DDEBUGlevel=0 -DPRNTlevel=0 -std=c99 -fPIC -g ${I_PARMETIS}
-# CFLAGS += -D_LONGINT
+CFLAGS = -O3 -DNDEBUG -I/home/xiaoye/lib/static/parmetis-4.0.3/metis/include -I/home/xiaoye/lib/static/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -fopenmp -std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0
+# CFLAGS += -DXSDK_INDEX_SIZE=64
# CFLAGS +=
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
NOOPTS = -O0
FORTRAN = /usr/bin/gfortran
LOADER = $(CC)
-LOADOPTS = -Wl,-rpath=/home/xiaoye/Dropbox/Codes/SuperLU/SuperLU_DIST_${VERSION}/lib -g # -Wl,-Bdynamic
+LOADOPTS = -Wl,-rpath, -Wl,-rpath -Wl,/home/xiaoye/mpich-install/lib -Wl,--enable-new-dtags -fopenmp
diff --git a/make.inc.in b/make.inc.in
index 15383ac..c4526cb 100644
--- a/make.inc.in
+++ b/make.inc.in
@@ -8,14 +8,14 @@
#
# Creation date: March 1, 2016 version 5.0.0
#
-# Modified:
+# Modified: October 13, 2017 version 5.2.1
#
#
############################################################################
#
# The name of the libraries to be created/linked to
#
-SuperLUroot = ${CMAKE_SOURCE_DIR}/build
+SuperLUroot = ${CMAKE_INSTALL_PREFIX}
DSUPERLULIB = $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT}
LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} ${PARMETIS_LIB_EXPORT}
@@ -32,8 +32,9 @@ CC = @CMAKE_C_COMPILER@
CFLAGS = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@
# CFLAGS += -D${DirDefs}
# CFLAGS += @COMPILE_DEFINITIONS@
+#XSDK_INDEX_SIZE = 64 ## 64-bit integer
NOOPTS = -O0
FORTRAN = @CMAKE_Fortran_COMPILER@
LOADER = $(CC)
-LOADOPTS = -Wl,-rpath,@CMAKE_INSTALL_RPATH@ @CMAKE_EXE_LINKER_FLAGS@
+LOADOPTS = -Wl,-rpath,@OpenMP_CXX_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@
diff --git a/run_cmake_build.csh b/run_cmake_build.csh
old mode 100644
new mode 100755
index 42b6482..c003716
--- a/run_cmake_build.csh
+++ b/run_cmake_build.csh
@@ -4,7 +4,7 @@ if ( ! $?NERSC_HOST ) then
echo "NERSC_HOST undefined"
else
if ( "$NERSC_HOST" == "edison" ) then
- setenv PARMETIS_ROOT ~/Edison/lib/parmetis-4.0.3
+ setenv PARMETIS_ROOT ~/Edison/lib/parmetis-4.0.3
# setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/shared-build
setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/static-build/Linux-x86_64
cmake .. \
diff --git a/run_cmake_build.csh b/run_cmake_build.sh
old mode 100644
new mode 100755
similarity index 57%
copy from run_cmake_build.csh
copy to run_cmake_build.sh
index 42b6482..9b1e422
--- a/run_cmake_build.csh
+++ b/run_cmake_build.sh
@@ -1,12 +1,13 @@
-#!/bin/csh
+#!/bin/bash
-if ( ! $?NERSC_HOST ) then
+if [ -z "$NERSC_HOST" ]
+then
echo "NERSC_HOST undefined"
-else
- if ( "$NERSC_HOST" == "edison" ) then
- setenv PARMETIS_ROOT ~/Edison/lib/parmetis-4.0.3
+elif [ "$NERSC_HOST" == "edison" ]
+then
+ export PARMETIS_ROOT=~/Edison/lib/parmetis-4.0.3
# setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/shared-build
- setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/static-build/Linux-x86_64
+ export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/static-build/Linux-x86_64
cmake .. \
-DUSE_XSDK_DEFAULTS=FALSE\
-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
@@ -17,40 +18,40 @@ else
-Denable_blaslib=OFF \
# -DTPL_BLAS_LIBRARIES=" " \
-DBUILD_SHARED_LIBS=OFF \
- -DCMAKE_INSTALL_PREFIX=..
- endif
-
- if ( "$NERSC_HOST" == "cori" ) then
- setenv PARMETIS_ROOT ~/Cori/lib/parmetis-4.0.3
- setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/shared-build
-# setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/static-build/Linux-x86_64
+ -DCMAKE_INSTALL_PREFIX=.
+elif [ "$NERSC_HOST" == "cori" ]
+then
+ export PARMETIS_ROOT=~/Cori/lib/parmetis-4.0.3
+# export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/shared-build
+ export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/static-build/Linux-x86_64
cmake .. \
-DUSE_XSDK_DEFAULTS=TRUE\
-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
- -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so" \
+ -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
-Denable_blaslib=OFF \
-DCMAKE_Fortran_COMPILER=ftn \
-DCMAKE_C_FLAGS="-std=c99 -fPIC" \
- -DCMAKE_EXE_LINKER_FLAGS="-shared" \
- -DCMAKE_INSTALL_PREFIX=..
- endif
-endif
+# -DCMAKE_EXE_LINKER_FLAGS="-shared" \
+ -DCMAKE_INSTALL_PREFIX=.
+fi
-set THISHOST=`hostname -s`
-#echo $THISHOST
-if ( "$THISHOST" == "ssg1" ) then
- setenv PARMETIS_ROOT ~/lib/static/parmetis-4.0.3
- setenv PARMETIS_BUILD_DIR ${PARMETIS_ROOT}/build/Linux-x86_64
- echo $PARMETIS_ROOT
+THISHOST=`hostname -s`
+echo "host: $THISHOST"
+if [ "$THISHOST" == "ssg1" ]
+then
+ rm -fr ssg1-build; mkdir ssg1-build; cd ssg1-build;
+ export PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3
+ export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
+ echo "ParMetis root: $PARMETIS_ROOT"
cmake .. \
-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
-DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
- -DCMAKE_C_FLAGS="-std=c99 -g" \
+ -DCMAKE_C_FLAGS="-std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DXSDK_INDEX_SIZE=64" \
-Denable_blaslib=OFF \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_C_COMPILER=mpicc \
- -DCMAKE_INSTALL_PREFIX=..
-endif
+ -DCMAKE_INSTALL_PREFIX=.
+fi
# make VERBOSE=1
# make test
diff --git a/superlu_dist.pc.in b/superlu_dist.pc.in
new file mode 100644
index 0000000..2de05e0
--- /dev/null
+++ b/superlu_dist.pc.in
@@ -0,0 +1,12 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@
+includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: @CMAKE_PROJECT_NAME@
+Description: Distributed-memory direct solution of sparse systems of linear equations
+Version: @PROJECT_VERSION@
+URL: http://crd-legacy.lbl.gov/~xiaoye/SuperLU/
+
+Libs: -L${libdir} -lsuperlu_dist
+Libs.private: @BLAS_LIB@ -lm
+Cflags: -I${includedir}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/superlu-dist.git
More information about the debian-science-commits
mailing list