[openblas] 01/07: Imported Upstream version 0.2.10
Sébastien Villemot
sebastien at debian.org
Tue Jul 29 19:17:32 UTC 2014
This is an automated email from the git hooks/post-receive script.
sebastien pushed a commit to branch master
in repository openblas.
commit 4df17b47de43d59f8f80dc9be7ee7e73b4732251
Author: Sébastien Villemot <sebastien at debian.org>
Date: Tue Jul 29 18:17:12 2014 +0200
Imported Upstream version 0.2.10
---
.gitignore | 4 +
.travis.yml | 2 +-
CONTRIBUTORS.md | 8 +-
Changelog.txt | 90 +-
GotoBLAS_01Readme.txt | 2 +-
GotoBLAS_02QuickInstall.txt | 6 +-
GotoBLAS_03FAQ.txt | 2 +-
GotoBLAS_05LargePage.txt | 2 +-
GotoBLAS_06WeirdPerformance.txt | 4 +-
LICENSE | 22 +-
Makefile | 29 +-
Makefile.alpha | 2 +-
Makefile.ia64 | 2 +-
Makefile.install | 10 +-
Makefile.power | 8 +-
Makefile.rule | 37 +-
Makefile.sparc | 2 +-
Makefile.system | 117 +-
Makefile.tail | 14 +-
Makefile.x86 | 4 +-
Makefile.x86_64 | 2 +-
README.md | 14 +-
benchmark/Makefile | 714 ++++++-
benchmark/cholesky.c | 32 +-
benchmark/gemm.c | 210 ++
benchmark/hemm.c | 192 ++
benchmark/her2k.c | 191 ++
benchmark/herk.c | 189 ++
benchmark/linpack.c | 32 +-
benchmark/symm.c | 203 ++
benchmark/syr2k.c | 203 ++
benchmark/syrk.c | 199 ++
benchmark/trmm.c | 202 ++
benchmark/trsm.c | 202 ++
c_check | 18 +-
cblas.h | 19 +-
cblas_noconst.h | 17 +-
common.h | 29 +-
common_arm.h | 26 +-
common_arm64.h | 26 +-
common_c.h | 23 +
common_d.h | 12 +
common_ia64.h | 4 +-
common_interface.h | 141 +-
common_level1.h | 25 +-
common_level2.h | 16 +-
common_level3.h | 39 +-
common_linux.h | 6 +-
common_macro.h | 36 +
common_mips64.h | 46 +-
common_param.h | 113 +-
common_power.h | 2 +-
common_reference.h | 24 +-
common_s.h | 16 +
common_sparc.h | 26 +-
common_thread.h | 6 +-
common_x86.h | 12 +-
common_x86_64.h | 12 +-
common_z.h | 23 +
cpuid.S | 4 +-
cpuid_alpha.c | 8 +-
cpuid_arm.c | 8 +-
cpuid_ia64.c | 6 +-
cpuid_mips.c | 22 +-
cpuid_power.c | 2 +-
cpuid_x86.c | 86 +-
ctest/Makefile | 16 +-
ctest/c_c2chke.c | 292 +--
ctest/c_c3chke.c | 12 +-
ctest/c_cblas1.c | 6 +-
ctest/c_cblas2.c | 40 +-
ctest/c_cblas3.c | 60 +-
ctest/c_cblat2.f | 14 +-
ctest/c_cblat3.f | 78 +-
ctest/c_d2chke.c | 290 +--
ctest/c_d3chke.c | 14 +-
ctest/c_dblas1.c | 2 +-
ctest/c_dblas2.c | 58 +-
ctest/c_dblas3.c | 36 +-
ctest/c_dblat1.f | 4 +-
ctest/c_dblat2.f | 14 +-
ctest/c_dblat3.f | 64 +-
ctest/c_s2chke.c | 290 +--
ctest/c_s3chke.c | 14 +-
ctest/c_sblas1.c | 4 +-
ctest/c_sblas2.c | 58 +-
ctest/c_sblas3.c | 36 +-
ctest/c_sblat2.f | 14 +-
ctest/c_sblat3.f | 66 +-
ctest/c_xerbla.c | 20 +-
ctest/c_z2chke.c | 292 +--
ctest/c_z3chke.c | 12 +-
ctest/c_zblas1.c | 6 +-
ctest/c_zblas2.c | 40 +-
ctest/c_zblas3.c | 60 +-
ctest/c_zblat2.f | 26 +-
ctest/c_zblat3.f | 80 +-
driver/level2/Makefile | 2130 ++++++++++----------
driver/level2/gbmv_k.c | 4 +-
driver/level2/gbmv_thread.c | 32 +-
driver/level2/gemv_thread.c | 22 +-
driver/level2/ger_thread.c | 28 +-
driver/level2/sbmv_k.c | 6 +-
driver/level2/sbmv_thread.c | 80 +-
driver/level2/spmv_k.c | 2 +-
driver/level2/spmv_thread.c | 78 +-
driver/level2/spr2_k.c | 2 +-
driver/level2/spr2_thread.c | 64 +-
driver/level2/spr_k.c | 2 +-
driver/level2/spr_thread.c | 56 +-
driver/level2/symv_thread.c | 76 +-
driver/level2/syr2_k.c | 2 +-
driver/level2/syr2_thread.c | 62 +-
driver/level2/syr_k.c | 2 +-
driver/level2/syr_thread.c | 56 +-
driver/level2/tbmv_L.c | 8 +-
driver/level2/tbmv_U.c | 6 +-
driver/level2/tbmv_thread.c | 96 +-
driver/level2/tbsv_L.c | 6 +-
driver/level2/tbsv_U.c | 8 +-
driver/level2/tpmv_L.c | 6 +-
driver/level2/tpmv_U.c | 6 +-
driver/level2/tpmv_thread.c | 88 +-
driver/level2/tpsv_L.c | 8 +-
driver/level2/tpsv_U.c | 6 +-
driver/level2/trmv_L.c | 10 +-
driver/level2/trmv_U.c | 8 +-
driver/level2/trmv_thread.c | 98 +-
driver/level2/trsv_L.c | 14 +-
driver/level2/trsv_U.c | 10 +-
driver/level2/zgbmv_k.c | 2 +-
driver/level2/zhbmv_k.c | 18 +-
driver/level2/zher2_k.c | 6 +-
driver/level2/zhpmv_k.c | 32 +-
driver/level2/zhpr2_k.c | 6 +-
driver/level2/zsbmv_k.c | 12 +-
driver/level2/zspmv_k.c | 16 +-
driver/level2/zspr2_k.c | 4 +-
driver/level2/zspr_k.c | 2 +-
driver/level2/zsyr2_k.c | 4 +-
driver/level2/zsyr_k.c | 2 +-
driver/level2/ztbmv_L.c | 6 +-
driver/level2/ztbmv_U.c | 2 +-
driver/level2/ztbsv_L.c | 10 +-
driver/level2/ztbsv_U.c | 12 +-
driver/level2/ztpmv_L.c | 8 +-
driver/level2/ztpmv_U.c | 4 +-
driver/level2/ztpsv_L.c | 12 +-
driver/level2/ztpsv_U.c | 14 +-
driver/level2/ztrmv_L.c | 2 +-
driver/level2/ztrmv_U.c | 6 +-
driver/level2/ztrsv_L.c | 8 +-
driver/level2/ztrsv_U.c | 6 +-
driver/level3/Makefile | 226 ++-
driver/level3/gemm3m_level3.c | 130 +-
driver/level3/gemm_thread_m.c | 8 +-
driver/level3/gemm_thread_mn.c | 12 +-
driver/level3/gemm_thread_n.c | 10 +-
driver/level3/gemm_thread_variable.c | 8 +-
driver/level3/level3.c | 48 +-
driver/level3/level3_gemm3m_thread.c | 280 +--
driver/level3/level3_syr2k.c | 114 +-
driver/level3/level3_syrk.c | 202 +-
driver/level3/level3_syrk_threaded.c | 142 +-
driver/level3/level3_thread.c | 132 +-
driver/level3/syr2k_k.c | 2 +-
driver/level3/syr2k_kernel.c | 42 +-
driver/level3/syrk_k.c | 2 +-
driver/level3/syrk_kernel.c | 24 +-
driver/level3/syrk_thread.c | 52 +-
driver/level3/trmm_L.c | 94 +-
driver/level3/trmm_R.c | 102 +-
driver/level3/trsm_L.c | 40 +-
driver/level3/trsm_R.c | 104 +-
driver/level3/zher2k_k.c | 2 +-
driver/level3/zher2k_kernel.c | 38 +-
driver/level3/zherk_k.c | 2 +-
driver/level3/zherk_kernel.c | 24 +-
driver/level3/zsyrk_beta.c | 2 +-
driver/mapper/mapper.c | 40 +-
driver/others/Makefile | 12 +-
driver/others/blas_l1_thread.c | 16 +-
driver/others/blas_server.c | 225 +--
driver/others/blas_server_omp.c | 36 +-
driver/others/blas_server_win32.c | 122 +-
driver/others/divtable.c | 36 +-
driver/others/dynamic.c | 123 +-
driver/others/init.c | 104 +-
driver/others/lamc3.c | 2 +-
driver/others/lamch.c | 2 +-
driver/others/memory.c | 236 +--
driver/others/memory_qalloc.c | 6 +-
driver/others/openblas_error_handle.c | 5 +-
driver/others/openblas_get_config.c | 47 +-
driver/others/openblas_get_parallel.c | 34 +-
driver/others/openblas_set_num_threads.c | 22 +-
driver/others/parameter.c | 50 +-
driver/others/profile.c | 12 +-
driver/others/xerbla.c | 4 +-
exports/Makefile | 8 +-
exports/dllinit.c | 2 +-
exports/gensymbol | 83 +-
f_check | 44 +-
ftest.f | 2 +-
ftest3.f | 2 +-
getarch.c | 39 +-
getarch_2nd.c | 6 +-
interface/Makefile | 134 +-
interface/asum.c | 4 +-
kernel/arm/swap.c => interface/axpby.c | 56 +-
interface/axpy.c | 14 +-
interface/copy.c | 6 +-
interface/dot.c | 4 +-
interface/dsdot.c | 8 +-
interface/gbmv.c | 10 +-
interface/gemm.c | 125 +-
interface/gemv.c | 20 +-
interface/ger.c | 21 +-
interface/imatcopy.c | 142 ++
interface/imax.c | 4 +-
interface/lapack/gesv.c | 8 +-
interface/lapack/getf2.c | 4 +-
interface/lapack/getrf.c | 4 +-
interface/lapack/getrs.c | 4 +-
interface/lapack/larf.c.obsolete | 4 +-
interface/lapack/laswp.c | 10 +-
interface/lapack/lauu2.c | 4 +-
interface/lapack/lauum.c | 6 +-
interface/lapack/potf2.c | 4 +-
interface/lapack/potrf.c | 4 +-
interface/lapack/potri.c | 8 +-
interface/lapack/trti2.c | 8 +-
interface/lapack/trtri.c | 10 +-
interface/lapack/zgetf2.c | 2 +-
interface/lapack/zgetrf.c | 2 +-
interface/lapack/zgetrs.c | 6 +-
interface/lapack/zlaswp.c | 4 +-
interface/lapack/zlauu2.c | 4 +-
interface/lapack/zlauum.c | 6 +-
interface/lapack/zpotf2.c | 4 +-
interface/lapack/zpotrf.c | 6 +-
interface/lapack/zpotri.c | 8 +-
interface/lapack/ztrti2.c | 8 +-
interface/lapack/ztrtri.c | 12 +-
interface/max.c | 4 +-
interface/nrm2.c | 4 +-
interface/omatcopy.c | 120 ++
interface/rot.c | 4 +-
interface/rotm.c | 6 +-
interface/rotmg.c | 14 +-
interface/sbmv.c | 26 +-
interface/scal.c | 15 +-
interface/sdsdot.c | 8 +-
interface/spmv.c | 6 +-
interface/spr.c | 8 +-
interface/spr2.c | 8 +-
interface/swap.c | 14 +-
interface/symm.c | 74 +-
interface/symv.c | 10 +-
interface/syr.c | 8 +-
interface/syr2.c | 8 +-
interface/syr2k.c | 44 +-
interface/syrk.c | 50 +-
interface/tbmv.c | 12 +-
interface/tbsv.c | 12 +-
interface/tpmv.c | 12 +-
interface/tpsv.c | 10 +-
interface/trmv.c | 16 +-
interface/trsm.c | 50 +-
interface/trsv.c | 10 +-
kernel/arm/zswap.c => interface/zaxpby.c | 66 +-
interface/zaxpy.c | 14 +-
interface/zgbmv.c | 6 +-
interface/zgemv.c | 22 +-
interface/zger.c | 21 +-
interface/zhbmv.c | 8 +-
interface/zhemv.c | 6 +-
interface/zher.c | 10 +-
interface/zher2.c | 8 +-
interface/zhpmv.c | 4 +-
interface/zhpr.c | 10 +-
interface/zhpr2.c | 10 +-
interface/zimatcopy.c | 185 ++
interface/zomatcopy.c | 154 ++
interface/zrot.c | 4 +-
interface/zsbmv.c | 24 +-
interface/zscal.c | 11 +-
interface/zspmv.c | 4 +-
interface/zspr.c | 10 +-
interface/zspr2.c | 12 +-
interface/zswap.c | 8 +-
interface/zsymv.c | 12 +-
interface/zsyr.c | 8 +-
interface/zsyr2.c | 6 +-
interface/ztbmv.c | 12 +-
interface/ztbsv.c | 12 +-
interface/ztpmv.c | 10 +-
interface/ztpsv.c | 10 +-
interface/ztrmv.c | 12 +-
interface/ztrsv.c | 12 +-
kernel/Makefile | 6 +-
kernel/Makefile.L1 | 240 ++-
kernel/Makefile.L2 | 142 +-
kernel/Makefile.L3 | 207 +-
kernel/alpha/KERNEL | 4 +-
kernel/alpha/cnrm2.S | 4 +-
kernel/alpha/dnrm2.S | 4 +-
kernel/alpha/gemm_kernel_4x4.S | 6 +-
kernel/alpha/gemv_n.S | 8 +-
kernel/alpha/iamax.S | 4 +-
kernel/alpha/imax.S | 2 +-
kernel/alpha/izamax.S | 2 +-
kernel/alpha/snrm2.S | 4 +-
kernel/alpha/trsm_kernel_4x4_LN.S | 120 +-
kernel/alpha/trsm_kernel_4x4_LT.S | 120 +-
kernel/alpha/trsm_kernel_4x4_RT.S | 120 +-
kernel/alpha/zamax.S | 2 +-
kernel/alpha/zaxpy.S | 30 +-
kernel/alpha/zgemm_kernel_2x2.S | 2 +-
kernel/alpha/znrm2.S | 4 +-
kernel/alpha/ztrsm_kernel_2x2_LN.S | 24 +-
kernel/alpha/ztrsm_kernel_2x2_LT.S | 24 +-
kernel/alpha/ztrsm_kernel_2x2_RT.S | 24 +-
kernel/arm/KERNEL.ARMV5 | 4 +-
kernel/arm/KERNEL.ARMV6 | 12 +-
kernel/arm/KERNEL.ARMV7 | 30 +-
kernel/arm/amax.c | 4 +-
kernel/arm/amin.c | 4 +-
kernel/arm/asum.c | 2 +-
kernel/arm/{axpy.c => axpby.c} | 66 +-
kernel/arm/axpy.c | 2 +-
kernel/arm/ccopy_vfp.S | 2 +-
kernel/arm/cdot_vfp.S | 10 +-
kernel/arm/cgemm_kernel_2x2_vfp.S | 28 +-
kernel/arm/cgemm_kernel_2x2_vfpv3.S | 28 +-
kernel/arm/cgemm_ncopy_2_vfp.S | 14 +-
kernel/arm/cgemv_n_vfp.S | 4 +-
kernel/arm/copy.c | 2 +-
kernel/arm/ctrmm_kernel_2x2_vfp.S | 28 +-
kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 28 +-
kernel/arm/dcopy_vfp.S | 2 +-
kernel/arm/ddot_vfp.S | 2 +-
kernel/arm/dgemm_kernel_4x2_vfp.S | 50 +-
kernel/arm/dgemm_kernel_4x4_vfpv3.S | 74 +-
kernel/arm/dgemm_ncopy_2_vfp.S | 14 +-
kernel/arm/dgemm_ncopy_4_vfp.S | 20 +-
kernel/arm/dgemm_tcopy_4_vfp.S | 16 +-
kernel/arm/dot.c | 2 +-
kernel/arm/dtrmm_kernel_4x2_vfp.S | 42 +-
kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 74 +-
kernel/arm/gemv_n.c | 4 +-
kernel/arm/gemv_n_vfp.S | 4 +-
kernel/arm/gemv_n_vfpv3.S | 4 +-
kernel/arm/gemv_t.c | 3 +-
kernel/arm/iamax.c | 4 +-
kernel/arm/iamax_vfp.S | 2 +-
kernel/arm/iamin.c | 4 +-
kernel/arm/imax.c | 4 +-
kernel/arm/imin.c | 8 +-
kernel/arm/izamax.c | 4 +-
kernel/arm/izamin.c | 4 +-
kernel/arm/max.c | 4 +-
kernel/arm/min.c | 4 +-
kernel/arm/nrm2.c | 6 +-
kernel/arm/nrm2_vfp.S | 62 +-
kernel/arm/nrm2_vfpv3.S | 56 +-
kernel/arm/{zswap.c => omatcopy_cn.c} | 78 +-
kernel/arm/{zswap.c => omatcopy_ct.c} | 77 +-
kernel/arm/{zswap.c => omatcopy_rn.c} | 78 +-
kernel/arm/{copy.c => omatcopy_rt.c} | 45 +-
kernel/arm/rot.c | 2 +-
kernel/arm/rot_vfp.S | 20 +-
kernel/arm/scal.c | 2 +-
kernel/arm/scal_vfp.S | 48 +-
kernel/arm/scopy_vfp.S | 2 +-
kernel/arm/sdot_vfp.S | 2 +-
kernel/arm/sgemm_kernel_4x2_vfp.S | 50 +-
kernel/arm/sgemm_kernel_4x4_vfpv3.S | 74 +-
kernel/arm/sgemm_ncopy_2_vfp.S | 14 +-
kernel/arm/sgemm_ncopy_4_vfp.S | 20 +-
kernel/arm/sgemm_tcopy_4_vfp.S | 18 +-
kernel/arm/strmm_kernel_4x2_vfp.S | 42 +-
kernel/arm/strmm_kernel_4x4_vfpv3.S | 68 +-
kernel/arm/swap.c | 2 +-
kernel/arm/zamax.c | 4 +-
kernel/arm/zamin.c | 4 +-
kernel/arm/zasum.c | 2 +-
kernel/arm/{zaxpy.c => zaxpby.c} | 89 +-
kernel/arm/zaxpy.c | 2 +-
kernel/arm/zcopy.c | 2 +-
kernel/arm/zcopy_vfp.S | 2 +-
kernel/arm/zdot.c | 4 +-
kernel/arm/zdot_vfp.S | 10 +-
kernel/arm/zgemm_kernel_2x2_vfp.S | 28 +-
kernel/arm/zgemm_kernel_2x2_vfpv3.S | 28 +-
kernel/arm/zgemm_ncopy_2_vfp.S | 14 +-
kernel/arm/zgemv_n.c | 4 +-
kernel/arm/zgemv_n_vfp.S | 4 +-
kernel/arm/zgemv_t.c | 2 +-
kernel/arm/znrm2.c | 10 +-
kernel/arm/{axpy.c => zomatcopy_cn.c} | 54 +-
kernel/arm/{axpy.c => zomatcopy_cnc.c} | 53 +-
kernel/arm/{axpy.c => zomatcopy_ct.c} | 55 +-
kernel/arm/{axpy.c => zomatcopy_ctc.c} | 55 +-
kernel/arm/{axpy.c => zomatcopy_rn.c} | 54 +-
kernel/arm/{axpy.c => zomatcopy_rnc.c} | 55 +-
kernel/arm/{axpy.c => zomatcopy_rt.c} | 56 +-
kernel/arm/{axpy.c => zomatcopy_rtc.c} | 56 +-
kernel/arm/zrot.c | 2 +-
kernel/arm/zscal.c | 2 +-
kernel/arm/zswap.c | 2 +-
kernel/arm/ztrmm_kernel_2x2_vfp.S | 28 +-
kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 28 +-
kernel/arm64/KERNEL.ARMV8 | 4 +-
kernel/{arm => generic}/dot.c | 60 +-
kernel/generic/gemm_beta.c | 4 +-
kernel/generic/gemm_ncopy_1.c | 4 +-
kernel/generic/gemm_ncopy_16.c | 84 +-
kernel/generic/gemm_ncopy_2.c | 8 +-
kernel/generic/gemm_ncopy_4.c | 56 +-
kernel/generic/gemm_ncopy_6.c | 56 +-
kernel/generic/gemm_ncopy_8.c | 68 +-
kernel/generic/gemm_tcopy_1.c | 2 +-
kernel/generic/gemm_tcopy_16.c | 50 +-
kernel/generic/gemm_tcopy_2.c | 4 +-
kernel/generic/gemm_tcopy_4.c | 68 +-
kernel/generic/gemm_tcopy_6.c | 68 +-
kernel/generic/gemm_tcopy_8.c | 138 +-
kernel/generic/gemmkernel_2x2.c | 24 +-
kernel/generic/ger.c | 4 +-
kernel/generic/laswp_ncopy_1.c | 40 +-
kernel/generic/laswp_ncopy_2.c | 72 +-
kernel/generic/laswp_ncopy_4.c | 86 +-
kernel/generic/laswp_ncopy_8.c | 44 +-
kernel/generic/neg_tcopy_1.c | 2 +-
kernel/generic/neg_tcopy_16.c | 50 +-
kernel/generic/neg_tcopy_2.c | 4 +-
kernel/generic/neg_tcopy_4.c | 68 +-
kernel/generic/neg_tcopy_8.c | 138 +-
kernel/generic/symm_lcopy_1.c | 4 +-
kernel/generic/symm_lcopy_16.c | 20 +-
kernel/generic/symm_lcopy_2.c | 8 +-
kernel/generic/symm_lcopy_4.c | 12 +-
kernel/generic/symm_lcopy_6.c | 12 +-
kernel/generic/symm_lcopy_8.c | 16 +-
kernel/generic/symm_ucopy_1.c | 4 +-
kernel/generic/symm_ucopy_16.c | 22 +-
kernel/generic/symm_ucopy_2.c | 8 +-
kernel/generic/symm_ucopy_4.c | 14 +-
kernel/generic/symm_ucopy_6.c | 14 +-
kernel/generic/symm_ucopy_8.c | 18 +-
kernel/generic/symv_k.c | 18 +-
kernel/generic/trmm_lncopy_1.c | 2 +-
kernel/generic/trmm_lncopy_16.c | 64 +-
kernel/generic/trmm_lncopy_2.c | 16 +-
kernel/generic/trmm_lncopy_4.c | 68 +-
kernel/generic/trmm_lncopy_6.c | 68 +-
kernel/generic/trmm_lncopy_8.c | 178 +-
kernel/generic/trmm_ltcopy_1.c | 4 +-
kernel/generic/trmm_ltcopy_16.c | 94 +-
kernel/generic/trmm_ltcopy_2.c | 16 +-
kernel/generic/trmm_ltcopy_4.c | 72 +-
kernel/generic/trmm_ltcopy_6.c | 72 +-
kernel/generic/trmm_ltcopy_8.c | 140 +-
kernel/generic/trmm_uncopy_1.c | 8 +-
kernel/generic/trmm_uncopy_16.c | 76 +-
kernel/generic/trmm_uncopy_2.c | 22 +-
kernel/generic/trmm_uncopy_4.c | 74 +-
kernel/generic/trmm_uncopy_6.c | 12 +-
kernel/generic/trmm_uncopy_8.c | 182 +-
kernel/generic/trmm_utcopy_1.c | 6 +-
kernel/generic/trmm_utcopy_16.c | 94 +-
kernel/generic/trmm_utcopy_2.c | 26 +-
kernel/generic/trmm_utcopy_4.c | 64 +-
kernel/generic/trmm_utcopy_6.c | 64 +-
kernel/generic/trmm_utcopy_8.c | 160 +-
kernel/generic/trmmkernel_16x2.c | 94 +-
kernel/generic/trmmkernel_2x2.c | 52 +-
kernel/generic/trmmkernel_8x2.c | 76 +-
kernel/generic/trsm_kernel_LN.c | 74 +-
kernel/generic/trsm_kernel_LT.c | 58 +-
kernel/generic/trsm_kernel_RN.c | 62 +-
kernel/generic/trsm_kernel_RT.c | 60 +-
kernel/generic/trsm_lncopy_1.c | 2 +-
kernel/generic/trsm_lncopy_16.c | 22 +-
kernel/generic/trsm_lncopy_2.c | 4 +-
kernel/generic/trsm_lncopy_4.c | 8 +-
kernel/generic/trsm_lncopy_6.c | 8 +-
kernel/generic/trsm_lncopy_8.c | 52 +-
kernel/generic/trsm_ltcopy_16.c | 22 +-
kernel/generic/trsm_ltcopy_2.c | 6 +-
kernel/generic/trsm_ltcopy_4.c | 18 +-
kernel/generic/trsm_ltcopy_6.c | 18 +-
kernel/generic/trsm_ltcopy_8.c | 14 +-
kernel/generic/trsm_uncopy_1.c | 2 +-
kernel/generic/trsm_uncopy_16.c | 22 +-
kernel/generic/trsm_uncopy_2.c | 4 +-
kernel/generic/trsm_uncopy_4.c | 10 +-
kernel/generic/trsm_uncopy_6.c | 10 +-
kernel/generic/trsm_uncopy_8.c | 34 +-
kernel/generic/trsm_utcopy_1.c | 2 +-
kernel/generic/trsm_utcopy_16.c | 20 +-
kernel/generic/trsm_utcopy_2.c | 4 +-
kernel/generic/trsm_utcopy_4.c | 8 +-
kernel/generic/trsm_utcopy_6.c | 8 +-
kernel/generic/trsm_utcopy_8.c | 14 +-
kernel/generic/zgemm3m_ncopy_1.c | 12 +-
kernel/generic/zgemm3m_ncopy_2.c | 20 +-
kernel/generic/zgemm3m_ncopy_4.c | 24 +-
kernel/generic/zgemm3m_ncopy_8.c | 32 +-
kernel/generic/zgemm3m_tcopy_1.c | 14 +-
kernel/generic/zgemm3m_tcopy_2.c | 18 +-
kernel/generic/zgemm3m_tcopy_4.c | 54 +-
kernel/generic/zgemm3m_tcopy_8.c | 212 +-
kernel/generic/zgemm_beta.c | 16 +-
kernel/generic/zgemm_ncopy_1.c | 16 +-
kernel/generic/zgemm_ncopy_2.c | 32 +-
kernel/generic/zgemm_ncopy_4.c | 84 +-
kernel/generic/zgemm_ncopy_4_sandy.c | 48 +-
kernel/generic/zgemm_ncopy_8.c | 26 +-
kernel/generic/zgemm_ncopy_8_sandy.c | 56 +-
kernel/generic/zgemm_tcopy_1.c | 28 +-
kernel/generic/zgemm_tcopy_2.c | 52 +-
kernel/generic/zgemm_tcopy_4.c | 74 +-
kernel/generic/zgemm_tcopy_4_sandy.c | 48 +-
kernel/generic/zgemm_tcopy_8.c | 44 +-
kernel/generic/zgemm_tcopy_8_sandy.c | 54 +-
kernel/generic/zgemmkernel_2x2.c | 22 +-
kernel/generic/zger.c | 8 +-
kernel/generic/zhemm3m_lcopy_1.c | 18 +-
kernel/generic/zhemm3m_lcopy_2.c | 4 +-
kernel/generic/zhemm3m_lcopy_4.c | 6 +-
kernel/generic/zhemm3m_lcopy_8.c | 8 +-
kernel/generic/zhemm3m_ucopy_1.c | 18 +-
kernel/generic/zhemm3m_ucopy_2.c | 10 +-
kernel/generic/zhemm3m_ucopy_4.c | 14 +-
kernel/generic/zhemm3m_ucopy_8.c | 8 +-
kernel/generic/zhemm_ltcopy_1.c | 4 +-
kernel/generic/zhemm_ltcopy_2.c | 8 +-
kernel/generic/zhemm_ltcopy_4.c | 12 +-
kernel/generic/zhemm_ltcopy_8.c | 16 +-
kernel/generic/zhemm_utcopy_1.c | 4 +-
kernel/generic/zhemm_utcopy_2.c | 8 +-
kernel/generic/zhemm_utcopy_4.c | 14 +-
kernel/generic/zhemm_utcopy_8.c | 22 +-
kernel/generic/zhemv_k.c | 12 +-
kernel/generic/zlaswp_ncopy_1.c | 40 +-
kernel/generic/zlaswp_ncopy_2.c | 76 +-
kernel/generic/zlaswp_ncopy_4.c | 106 +-
kernel/generic/zneg_tcopy_1.c | 28 +-
kernel/generic/zneg_tcopy_2.c | 52 +-
kernel/generic/zneg_tcopy_4.c | 74 +-
kernel/generic/zneg_tcopy_8.c | 44 +-
kernel/generic/zsymm3m_lcopy_1.c | 20 +-
kernel/generic/zsymm3m_lcopy_2.c | 8 +-
kernel/generic/zsymm3m_lcopy_4.c | 10 +-
kernel/generic/zsymm3m_lcopy_8.c | 14 +-
kernel/generic/zsymm3m_ucopy_1.c | 20 +-
kernel/generic/zsymm3m_ucopy_2.c | 10 +-
kernel/generic/zsymm3m_ucopy_4.c | 14 +-
kernel/generic/zsymm3m_ucopy_8.c | 18 +-
kernel/generic/zsymm_lcopy_1.c | 4 +-
kernel/generic/zsymm_lcopy_2.c | 8 +-
kernel/generic/zsymm_lcopy_4.c | 12 +-
kernel/generic/zsymm_lcopy_8.c | 14 +-
kernel/generic/zsymm_ucopy_1.c | 2 +-
kernel/generic/zsymm_ucopy_2.c | 8 +-
kernel/generic/zsymm_ucopy_4.c | 12 +-
kernel/generic/zsymm_ucopy_8.c | 16 +-
kernel/generic/zsymv_k.c | 10 +-
kernel/generic/ztrmm_lncopy_1.c | 2 +-
kernel/generic/ztrmm_lncopy_2.c | 12 +-
kernel/generic/ztrmm_lncopy_4.c | 104 +-
kernel/generic/ztrmm_lncopy_8.c | 50 +-
kernel/generic/ztrmm_ltcopy_1.c | 6 +-
kernel/generic/ztrmm_ltcopy_2.c | 28 +-
kernel/generic/ztrmm_ltcopy_4.c | 116 +-
kernel/generic/ztrmm_ltcopy_8.c | 84 +-
kernel/generic/ztrmm_uncopy_1.c | 4 +-
kernel/generic/ztrmm_uncopy_2.c | 22 +-
kernel/generic/ztrmm_uncopy_4.c | 98 +-
kernel/generic/ztrmm_uncopy_8.c | 66 +-
kernel/generic/ztrmm_utcopy_1.c | 4 +-
kernel/generic/ztrmm_utcopy_2.c | 36 +-
kernel/generic/ztrmm_utcopy_4.c | 102 +-
kernel/generic/ztrmm_utcopy_8.c | 102 +-
kernel/generic/ztrmmkernel_2x2.c | 36 +-
kernel/generic/ztrsm_lncopy_1.c | 2 +-
kernel/generic/ztrsm_lncopy_2.c | 4 +-
kernel/generic/ztrsm_lncopy_4.c | 12 +-
kernel/generic/ztrsm_lncopy_8.c | 18 +-
kernel/generic/ztrsm_ltcopy_1.c | 2 +-
kernel/generic/ztrsm_ltcopy_2.c | 4 +-
kernel/generic/ztrsm_ltcopy_4.c | 12 +-
kernel/generic/ztrsm_ltcopy_8.c | 18 +-
kernel/generic/ztrsm_uncopy_1.c | 2 +-
kernel/generic/ztrsm_uncopy_2.c | 4 +-
kernel/generic/ztrsm_uncopy_4.c | 12 +-
kernel/generic/ztrsm_uncopy_8.c | 18 +-
kernel/generic/ztrsm_utcopy_1.c | 2 +-
kernel/generic/ztrsm_utcopy_2.c | 4 +-
kernel/generic/ztrsm_utcopy_4.c | 12 +-
kernel/generic/ztrsm_utcopy_8.c | 16 +-
kernel/ia64/amax.S | 8 +-
kernel/ia64/asum.S | 8 +-
kernel/ia64/caxpy.S | 4 +-
kernel/ia64/copy.S | 4 +-
kernel/ia64/daxpy.S | 2 +-
kernel/ia64/ddot.S | 10 +-
kernel/ia64/gemm_beta.S | 10 +-
kernel/ia64/gemm_kernel.S | 28 +-
kernel/ia64/gemm_ncopy.S | 4 +-
kernel/ia64/gemv_n.S | 16 +-
kernel/ia64/gemv_t.S | 12 +-
kernel/ia64/iamax.S | 2 +-
kernel/ia64/izamax.S | 8 +-
kernel/ia64/lsame.S | 2 +-
kernel/ia64/nrm2.S | 6 +-
kernel/ia64/qaxpy.S | 4 +-
kernel/ia64/qgemm_kernel.S | 28 +-
kernel/ia64/qgemv_n.S | 22 +-
kernel/ia64/qgemv_t.S | 114 +-
kernel/ia64/qscal.S | 2 +-
kernel/ia64/saxpy.S | 8 +-
kernel/ia64/scal.S | 2 +-
kernel/ia64/sdot.S | 10 +-
kernel/ia64/sgemv_n.S | 16 +-
kernel/ia64/symv_U.S | 16 +-
kernel/ia64/trsm_kernel_LN.S | 50 +-
kernel/ia64/trsm_kernel_LT.S | 24 +-
kernel/ia64/trsm_kernel_RT.S | 60 +-
kernel/ia64/xdot.S | 8 +-
kernel/ia64/zcopy.S | 2 +-
kernel/ia64/zdot.S | 8 +-
kernel/ia64/zgemm3m_kernel.S | 26 +-
kernel/ia64/zgemm_beta.S | 12 +-
kernel/ia64/zgemm_kernel.S | 22 +-
kernel/ia64/zgemm_ncopy.S | 4 +-
kernel/ia64/zgemv_n.S | 16 +-
kernel/ia64/zgemv_t.S | 14 +-
kernel/ia64/zscal.S | 2 +-
kernel/ia64/zswap.S | 6 +-
kernel/ia64/ztrsm_kernel_LN.S | 28 +-
kernel/ia64/ztrsm_kernel_LT.S | 28 +-
kernel/ia64/ztrsm_kernel_RT.S | 28 +-
kernel/mips64/KERNEL.LOONGSON3A | 2 +-
kernel/mips64/KERNEL.LOONGSON3B | 4 +-
kernel/mips64/amax.S | 4 +-
kernel/mips64/amin.S | 4 +-
kernel/mips64/asum.S | 4 +-
kernel/mips64/axpy.S | 2 +-
kernel/mips64/axpy_loongson3a.S | 46 +-
kernel/mips64/cgemm_kernel_loongson3a_2x2.S | 114 +-
kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S | 102 +-
kernel/mips64/cgemm_kernel_loongson3b_2x2.S | 114 +-
kernel/mips64/cnrm2.S | 10 +-
kernel/mips64/copy.S | 4 +-
kernel/mips64/daxpy_loongson3a_simd.S | 104 +-
kernel/mips64/dgemm_kernel_loongson3a_4x4.S | 484 ++---
kernel/mips64/dgemm_kernel_loongson3b_4x4.S | 414 ++--
kernel/mips64/dnrm2.S | 4 +-
kernel/mips64/dot.S | 6 +-
kernel/mips64/gemm_beta.S | 2 +-
kernel/mips64/gemm_kernel.S | 6 +-
kernel/mips64/gemv_n.S | 4 +-
kernel/mips64/gemv_n_loongson3a.c | 4 +-
kernel/mips64/gemv_t.S | 16 +-
kernel/mips64/gemv_t_loongson3a.c | 2 +-
kernel/mips64/iamax.S | 6 +-
kernel/mips64/iamin.S | 6 +-
kernel/mips64/imax.S | 6 +-
kernel/mips64/imin.S | 6 +-
kernel/mips64/izamax.S | 8 +-
kernel/mips64/izamin.S | 8 +-
kernel/mips64/max.S | 4 +-
kernel/mips64/min.S | 4 +-
kernel/mips64/rot.S | 4 +-
kernel/mips64/scal.S | 6 +-
kernel/mips64/sgemm_kernel_8x4_ps.S | 400 ++--
kernel/mips64/sgemm_kernel_loongson3a_4x4.S | 414 ++--
kernel/mips64/sgemm_kernel_loongson3b_4x4.S | 414 ++--
kernel/mips64/snrm2.S | 10 +-
kernel/mips64/swap.S | 2 +-
kernel/mips64/symv_L.S | 2 +-
kernel/mips64/symv_U.S | 2 +-
kernel/mips64/trsm_kernel_LN.S | 4 +-
kernel/mips64/trsm_kernel_LN_loongson3a.S | 270 +--
kernel/mips64/trsm_kernel_LT.S | 4 +-
kernel/mips64/trsm_kernel_LT_loongson3a.S | 244 +--
kernel/mips64/trsm_kernel_RN_loongson3a.S | 162 +-
kernel/mips64/trsm_kernel_RT.S | 4 +-
kernel/mips64/trsm_kernel_RT_loongson3a.S | 204 +-
kernel/mips64/zamax.S | 4 +-
kernel/mips64/zamin.S | 4 +-
kernel/mips64/zasum.S | 4 +-
kernel/mips64/zaxpy.S | 4 +-
kernel/mips64/zcopy.S | 4 +-
kernel/mips64/zdot.S | 6 +-
kernel/mips64/zgemm3m_kernel.S | 6 +-
kernel/mips64/zgemm_kernel.S | 4 +-
kernel/mips64/zgemm_kernel_loongson3a_2x2.S | 160 +-
kernel/mips64/zgemm_kernel_loongson3b_2x2.S | 114 +-
kernel/mips64/zgemv_n.S | 2 +-
kernel/mips64/zgemv_n_loongson3a.c | 2 +-
kernel/mips64/zgemv_t.S | 12 +-
kernel/mips64/zgemv_t_loongson3a.c | 2 +-
kernel/mips64/znrm2.S | 4 +-
kernel/mips64/zrot.S | 4 +-
kernel/mips64/zscal.S | 4 +-
kernel/mips64/zswap.S | 2 +-
kernel/mips64/zsymv_L.S | 2 +-
kernel/mips64/zsymv_U.S | 4 +-
kernel/mips64/ztrsm_kernel_LT.S | 4 +-
kernel/mips64/ztrsm_kernel_RT.S | 2 +-
kernel/power/KERNEL.CELL | 4 +-
kernel/power/KERNEL.POWER5 | 12 +-
kernel/power/KERNEL.POWER6 | 4 +-
kernel/power/KERNEL.PPC440 | 12 +-
kernel/power/KERNEL.PPC970 | 4 +-
kernel/power/KERNEL.PPCG4 | 4 +-
kernel/power/amax.S | 4 +-
kernel/power/amax_cell.S | 6 +-
kernel/power/amax_hummer.S | 6 +-
kernel/power/amax_ppc440.S | 4 +-
kernel/power/amin.S | 4 +-
kernel/power/amin_cell.S | 6 +-
kernel/power/amin_hummer.S | 6 +-
kernel/power/amin_ppc440.S | 4 +-
kernel/power/asum.S | 4 +-
kernel/power/asum_cell.S | 4 +-
kernel/power/asum_hummer.S | 6 +-
kernel/power/asum_ppc440.S | 4 +-
kernel/power/axpy.S | 16 +-
kernel/power/axpy_hummer.S | 6 +-
kernel/power/axpy_ppc440.S | 12 +-
kernel/power/cnrm2.S | 4 +-
kernel/power/cnrm2_hummer.S | 8 +-
kernel/power/cnrm2_ppc440.S | 8 +-
kernel/power/copy.S | 6 +-
kernel/power/copy_hummer.S | 18 +-
kernel/power/dnrm2_hummer.S | 10 +-
kernel/power/dnrm2_ppc440.S | 6 +-
kernel/power/dot.S | 4 +-
kernel/power/dot_cell.S | 2 +-
kernel/power/dot_hummer.S | 8 +-
kernel/power/dot_ppc440.S | 2 +-
kernel/power/exfunc.S | 2 +-
kernel/power/gemm_beta.S | 6 +-
kernel/power/gemm_kernel.S | 14 +-
kernel/power/gemm_kernel_altivec.S | 10 +-
kernel/power/gemm_kernel_altivec_cell.S | 10 +-
kernel/power/gemm_kernel_altivec_g4.S | 10 +-
kernel/power/gemm_kernel_cell.S | 12 +-
kernel/power/gemm_kernel_g4.S | 12 +-
kernel/power/gemm_kernel_hummer.S | 24 +-
kernel/power/gemm_kernel_power3.S | 10 +-
kernel/power/gemm_kernel_power6.S | 8 +-
kernel/power/gemm_kernel_ppc440.S | 10 +-
kernel/power/gemm_ncopy_4.S | 10 +-
kernel/power/gemm_ncopy_hummer_4.S | 16 +-
kernel/power/gemm_ncopy_hummer_8.S | 26 +-
kernel/power/gemm_tcopy_4.S | 12 +-
kernel/power/gemm_tcopy_hummer_4.S | 18 +-
kernel/power/gemm_tcopy_hummer_8.S | 22 +-
kernel/power/gemv_hummer_n.S | 6 +-
kernel/power/gemv_n.S | 2 +-
kernel/power/gemv_t.S | 12 +-
kernel/power/gemv_t_ppc440.S | 6 +-
kernel/power/ger.S | 4 +-
kernel/power/iamax.S | 8 +-
kernel/power/iamax_hummer.S | 6 +-
kernel/power/iamax_ppc440.S | 8 +-
kernel/power/iamin.S | 8 +-
kernel/power/iamin_hummer.S | 6 +-
kernel/power/iamin_ppc440.S | 8 +-
kernel/power/imax.S | 4 +-
kernel/power/imax_hummer.S | 6 +-
kernel/power/imax_ppc440.S | 4 +-
kernel/power/imin.S | 4 +-
kernel/power/imin_hummer.S | 6 +-
kernel/power/imin_ppc440.S | 6 +-
kernel/power/izamax.S | 8 +-
kernel/power/izamax_hummer.S | 6 +-
kernel/power/izamax_ppc440.S | 6 +-
kernel/power/izamin.S | 4 +-
kernel/power/izamin_hummer.S | 6 +-
kernel/power/izamin_ppc440.S | 6 +-
kernel/power/max.S | 4 +-
kernel/power/max_hummer.S | 6 +-
kernel/power/max_ppc440.S | 4 +-
kernel/power/min.S | 4 +-
kernel/power/min_hummer.S | 6 +-
kernel/power/min_ppc440.S | 4 +-
kernel/power/nrm2.S | 4 +-
kernel/power/rot.S | 4 +-
kernel/power/rot_ppc440.S | 4 +-
kernel/power/scal.S | 4 +-
kernel/power/scal_hummer.S | 6 +-
kernel/power/scal_ppc440.S | 4 +-
kernel/power/snrm2.S | 4 +-
kernel/power/snrm2_hummer.S | 4 +-
kernel/power/snrm2_ppc440.S | 6 +-
kernel/power/swap.S | 12 +-
kernel/power/swap_hummer.S | 12 +-
kernel/power/symv_L.S | 12 +-
kernel/power/symv_U.S | 18 +-
kernel/power/trsm_kernel_LN.S | 18 +-
kernel/power/trsm_kernel_LT.S | 20 +-
kernel/power/trsm_kernel_RT.S | 20 +-
kernel/power/trsm_kernel_cell_LN.S | 20 +-
kernel/power/trsm_kernel_cell_LT.S | 22 +-
kernel/power/trsm_kernel_cell_RT.S | 20 +-
kernel/power/trsm_kernel_hummer_LN.S | 16 +-
kernel/power/trsm_kernel_hummer_LT.S | 16 +-
kernel/power/trsm_kernel_hummer_RT.S | 16 +-
kernel/power/trsm_kernel_power6_LN.S | 16 +-
kernel/power/trsm_kernel_power6_LT.S | 18 +-
kernel/power/trsm_kernel_power6_RT.S | 18 +-
kernel/power/trsm_kernel_ppc440_LN.S | 16 +-
kernel/power/trsm_kernel_ppc440_LT.S | 18 +-
kernel/power/trsm_kernel_ppc440_RT.S | 18 +-
kernel/power/zamax.S | 4 +-
kernel/power/zamax_cell.S | 4 +-
kernel/power/zamax_hummer.S | 6 +-
kernel/power/zamax_ppc440.S | 6 +-
kernel/power/zamin.S | 4 +-
kernel/power/zamin_cell.S | 4 +-
kernel/power/zamin_hummer.S | 6 +-
kernel/power/zamin_ppc440.S | 4 +-
kernel/power/zasum.S | 4 +-
kernel/power/zasum_cell.S | 4 +-
kernel/power/zasum_hummer.S | 6 +-
kernel/power/zasum_ppc440.S | 4 +-
kernel/power/zaxpy.S | 8 +-
kernel/power/zaxpy_hummer.S | 6 +-
kernel/power/zaxpy_ppc440.S | 6 +-
kernel/power/zcopy.S | 6 +-
kernel/power/zcopy_hummer.S | 14 +-
kernel/power/zdot.S | 8 +-
kernel/power/zdot_cell.S | 8 +-
kernel/power/zdot_hummer.S | 8 +-
kernel/power/zdot_ppc440.S | 6 +-
kernel/power/zgemm_beta.S | 10 +-
kernel/power/zgemm_kernel.S | 10 +-
kernel/power/zgemm_kernel_altivec.S | 4 +-
kernel/power/zgemm_kernel_altivec_cell.S | 4 +-
kernel/power/zgemm_kernel_altivec_g4.S | 4 +-
kernel/power/zgemm_kernel_cell.S | 6 +-
kernel/power/zgemm_kernel_g4.S | 4 +-
kernel/power/zgemm_kernel_hummer.S | 16 +-
kernel/power/zgemm_kernel_power3.S | 8 +-
kernel/power/zgemm_kernel_power6.S | 16 +-
kernel/power/zgemm_kernel_ppc440.S | 4 +-
kernel/power/zgemm_ncopy_hummer_2.S | 12 +-
kernel/power/zgemm_ncopy_hummer_4.S | 16 +-
kernel/power/zgemm_tcopy_hummer_2.S | 12 +-
kernel/power/zgemm_tcopy_hummer_4.S | 16 +-
kernel/power/zgemv_n.S | 4 +-
kernel/power/zgemv_n_ppc440.S | 4 +-
kernel/power/zgemv_t.S | 12 +-
kernel/power/zgemv_t_ppc440.S | 8 +-
kernel/power/zger.S | 4 +-
kernel/power/znrm2.S | 4 +-
kernel/power/znrm2_hummer.S | 10 +-
kernel/power/znrm2_ppc440.S | 6 +-
kernel/power/zrot.S | 6 +-
kernel/power/zrot_ppc440.S | 4 +-
kernel/power/zscal.S | 6 +-
kernel/power/zscal_hummer.S | 6 +-
kernel/power/zscal_ppc440.S | 6 +-
kernel/power/zswap.S | 14 +-
kernel/power/zswap_hummer.S | 12 +-
kernel/power/zsymv_L.S | 6 +-
kernel/power/zsymv_U.S | 6 +-
kernel/power/ztrsm_kernel_LN.S | 10 +-
kernel/power/ztrsm_kernel_LT.S | 10 +-
kernel/power/ztrsm_kernel_RT.S | 12 +-
kernel/power/ztrsm_kernel_cell_LN.S | 10 +-
kernel/power/ztrsm_kernel_cell_LT.S | 10 +-
kernel/power/ztrsm_kernel_cell_RT.S | 12 +-
kernel/power/ztrsm_kernel_hummer_LN.S | 12 +-
kernel/power/ztrsm_kernel_hummer_LT.S | 8 +-
kernel/power/ztrsm_kernel_hummer_RT.S | 10 +-
kernel/power/ztrsm_kernel_power6_LN.S | 24 +-
kernel/power/ztrsm_kernel_power6_LT.S | 24 +-
kernel/power/ztrsm_kernel_power6_RT.S | 24 +-
kernel/power/ztrsm_kernel_ppc440_LN.S | 8 +-
kernel/power/ztrsm_kernel_ppc440_LT.S | 8 +-
kernel/power/ztrsm_kernel_ppc440_RT.S | 10 +-
kernel/setparam-ref.c | 197 +-
kernel/sparc/KERNEL.sparc | 4 +-
kernel/sparc/axpy.S | 2 +-
kernel/sparc/cabs.S | 2 +-
kernel/sparc/dnrm2.S | 2 +-
kernel/sparc/dot.S | 6 +-
kernel/sparc/gemm_kernel_2x8.S | 4 +-
kernel/sparc/gemv_n.S | 12 +-
kernel/sparc/gemv_t.S | 8 +-
kernel/sparc/ger.S | 6 +-
kernel/sparc/imax.S | 2 +-
kernel/sparc/lsame.S | 2 +-
kernel/sparc/max.S | 2 +-
kernel/sparc/rot.S | 4 +-
kernel/sparc/scal.S | 2 +-
kernel/sparc/swap.S | 2 +-
kernel/sparc/trsm_kernel_LN_2x8.S | 2 +-
kernel/sparc/trsm_kernel_LT_2x8.S | 2 +-
kernel/sparc/trsm_kernel_RT.S | 2 +-
kernel/sparc/trsm_kernel_RT_2x8.S | 2 +-
kernel/sparc/zamax.S | 2 +-
kernel/sparc/zasum.S | 2 +-
kernel/sparc/zgemm_kernel.S | 4 +-
kernel/sparc/zgemm_kernel_1x4.S | 6 +-
kernel/sparc/zgemv_n.S | 6 +-
kernel/sparc/zgemv_t.S | 6 +-
kernel/sparc/znrm2.S | 2 +-
kernel/sparc/zrot.S | 4 +-
kernel/sparc/zscal.S | 2 +-
kernel/sparc/zswap.S | 2 +-
kernel/sparc/ztrsm_kernel_LN.S | 6 +-
kernel/sparc/ztrsm_kernel_LT.S | 6 +-
kernel/sparc/ztrsm_kernel_LT_1x4.S | 8 +-
kernel/sparc/ztrsm_kernel_RT.S | 6 +-
kernel/sparc/ztrsm_kernel_RT_1x4.S | 6 +-
kernel/x86/KERNEL | 310 +--
kernel/x86/KERNEL.ATOM | 8 +-
kernel/x86/KERNEL.BARCELONA | 14 +-
kernel/x86/KERNEL.BOBCAT | 14 +-
kernel/x86/KERNEL.BULLDOZER | 14 +-
kernel/x86/KERNEL.DUNNINGTON | 8 +-
kernel/x86/KERNEL.OPTERON | 14 +-
kernel/x86/KERNEL.PENRYN | 8 +-
kernel/x86/KERNEL.PILEDRIVER | 14 +-
kernel/x86/KERNEL.PRESCOTT | 14 +-
kernel/x86/KERNEL.YONAH | 14 +-
kernel/x86/amax.S | 46 +-
kernel/x86/amax_sse.S | 20 +-
kernel/x86/amax_sse2.S | 20 +-
kernel/x86/asum.S | 10 +-
kernel/x86/asum_sse.S | 8 +-
kernel/x86/asum_sse2.S | 10 +-
kernel/x86/axpy.S | 4 +-
kernel/x86/axpy_sse.S | 6 +-
kernel/x86/axpy_sse2.S | 6 +-
kernel/x86/axpy_sse2_opteron.S | 4 +-
kernel/x86/copy.S | 20 +-
kernel/x86/copy_sse.S | 2 +-
kernel/x86/copy_sse2.S | 2 +-
kernel/x86/cpuid.S | 2 +-
kernel/x86/dot.S | 6 +-
kernel/x86/dot_amd.S | 6 +-
kernel/x86/dot_sse.S | 10 +-
kernel/x86/dot_sse2.S | 6 +-
kernel/x86/dot_sse2_opteron.S | 6 +-
kernel/x86/dot_sse_opteron.S | 8 +-
kernel/x86/gemm_beta.S | 4 +-
kernel/x86/gemm_kernel_1x4.S | 20 +-
kernel/x86/gemm_kernel_2x2.S | 34 +-
kernel/x86/gemm_kernel_2x2_atom.S | 24 +-
kernel/x86/gemm_kernel_2x4_3dnow.S | 34 +-
kernel/x86/gemm_kernel_2x4_barcelona.S | 34 +-
kernel/x86/gemm_kernel_2x4_core2.S | 28 +-
kernel/x86/gemm_kernel_2x4_penryn.S | 28 +-
kernel/x86/gemm_kernel_2x4_sse2.S | 50 +-
kernel/x86/gemm_kernel_2x4_sse3.S | 36 +-
kernel/x86/gemm_kernel_4x2_core2.S | 30 +-
kernel/x86/gemm_kernel_4x2_sse2.S | 68 +-
kernel/x86/gemm_kernel_4x4_barcelona.S | 64 +-
kernel/x86/gemm_kernel_4x4_penryn.S | 28 +-
kernel/x86/gemm_kernel_4x4_sse.S | 66 +-
kernel/x86/gemm_kernel_4x4_sse3.S | 62 +-
kernel/x86/gemm_kernel_8x1_sse2.S | 20 +-
kernel/x86/gemm_kernel_8x2_core2.S | 34 +-
kernel/x86/gemm_kernel_8x2_sse.S | 104 +-
kernel/x86/gemm_ncopy_2.S | 2 +-
kernel/x86/gemm_ncopy_2_sse.S | 4 +-
kernel/x86/gemm_ncopy_4_sse.S | 4 +-
kernel/x86/gemm_tcopy_2.S | 4 +-
kernel/x86/gemm_tcopy_2_sse.S | 4 +-
kernel/x86/gemm_tcopy_4_sse.S | 4 +-
kernel/x86/gemv_n.S | 10 +-
kernel/x86/gemv_n_atom.S | 6 +-
kernel/x86/gemv_n_sse.S | 6 +-
kernel/x86/gemv_n_sse2.S | 6 +-
kernel/x86/gemv_t.S | 4 +-
kernel/x86/gemv_t_atom.S | 12 +-
kernel/x86/gemv_t_sse.S | 20 +-
kernel/x86/gemv_t_sse2.S | 14 +-
kernel/x86/iamax.S | 46 +-
kernel/x86/iamax_sse.S | 42 +-
kernel/x86/iamax_sse2.S | 44 +-
kernel/x86/izamax.S | 52 +-
kernel/x86/izamax_sse.S | 26 +-
kernel/x86/izamax_sse2.S | 16 +-
kernel/x86/nrm2.S | 10 +-
kernel/x86/nrm2_sse.S | 12 +-
kernel/x86/qaxpy.S | 4 +-
kernel/x86/qdot.S | 2 +-
kernel/x86/qgemm_kernel_2x2.S | 38 +-
kernel/x86/qgemv_n.S | 10 +-
kernel/x86/qgemv_t.S | 4 +-
kernel/x86/qtrsm_kernel_LN_2x2.S | 28 +-
kernel/x86/qtrsm_kernel_LT_2x2.S | 28 +-
kernel/x86/qtrsm_kernel_RT_2x2.S | 28 +-
kernel/x86/rot.S | 8 +-
kernel/x86/rot_sse.S | 6 +-
kernel/x86/rot_sse2.S | 4 +-
kernel/x86/scal_sse.S | 6 +-
kernel/x86/scal_sse2.S | 6 +-
kernel/x86/swap.S | 2 +-
kernel/x86/swap_sse.S | 10 +-
kernel/x86/swap_sse2.S | 4 +-
kernel/x86/trsm_kernel_LN_2x2.S | 24 +-
kernel/x86/trsm_kernel_LN_2x2_atom.S | 16 +-
kernel/x86/trsm_kernel_LN_2x4_penryn.S | 24 +-
kernel/x86/trsm_kernel_LN_2x4_sse2.S | 44 +-
kernel/x86/trsm_kernel_LN_2x4_sse3.S | 22 +-
kernel/x86/trsm_kernel_LN_4x2_core2.S | 54 +-
kernel/x86/trsm_kernel_LN_4x2_sse2.S | 62 +-
kernel/x86/trsm_kernel_LN_4x4_penryn.S | 28 +-
kernel/x86/trsm_kernel_LN_4x4_sse.S | 46 +-
kernel/x86/trsm_kernel_LN_8x2_sse.S | 78 +-
kernel/x86/trsm_kernel_LT_1x4.S | 22 +-
kernel/x86/trsm_kernel_LT_2x2.S | 24 +-
kernel/x86/trsm_kernel_LT_2x2_atom.S | 16 +-
kernel/x86/trsm_kernel_LT_2x4_penryn.S | 22 +-
kernel/x86/trsm_kernel_LT_2x4_sse2.S | 44 +-
kernel/x86/trsm_kernel_LT_2x4_sse3.S | 22 +-
kernel/x86/trsm_kernel_LT_4x2_core2.S | 54 +-
kernel/x86/trsm_kernel_LT_4x2_sse2.S | 60 +-
kernel/x86/trsm_kernel_LT_4x4_penryn.S | 28 +-
kernel/x86/trsm_kernel_LT_4x4_sse.S | 46 +-
kernel/x86/trsm_kernel_LT_8x2_sse.S | 78 +-
kernel/x86/trsm_kernel_RT_1x4.S | 22 +-
kernel/x86/trsm_kernel_RT_2x2.S | 24 +-
kernel/x86/trsm_kernel_RT_2x2_atom.S | 16 +-
kernel/x86/trsm_kernel_RT_2x4_penryn.S | 24 +-
kernel/x86/trsm_kernel_RT_2x4_sse2.S | 44 +-
kernel/x86/trsm_kernel_RT_2x4_sse3.S | 22 +-
kernel/x86/trsm_kernel_RT_4x2_core2.S | 54 +-
kernel/x86/trsm_kernel_RT_4x2_sse2.S | 60 +-
kernel/x86/trsm_kernel_RT_4x4_penryn.S | 28 +-
kernel/x86/trsm_kernel_RT_4x4_sse.S | 46 +-
kernel/x86/trsm_kernel_RT_8x2_sse.S | 78 +-
kernel/x86/xaxpy.S | 4 +-
kernel/x86/xdot.S | 2 +-
kernel/x86/xgemm3m_kernel_2x2.S | 38 +-
kernel/x86/xgemm_kernel_1x1.S | 26 +-
kernel/x86/xgemv_n.S | 2 +-
kernel/x86/xgemv_t.S | 2 +-
kernel/x86/xtrsm_kernel_LT_1x1.S | 26 +-
kernel/x86/zamax.S | 52 +-
kernel/x86/zamax_sse.S | 18 +-
kernel/x86/zamax_sse2.S | 12 +-
kernel/x86/zasum.S | 10 +-
kernel/x86/zasum_sse.S | 16 +-
kernel/x86/zasum_sse2.S | 10 +-
kernel/x86/zaxpy.S | 2 +-
kernel/x86/zaxpy_sse.S | 6 +-
kernel/x86/zaxpy_sse2.S | 8 +-
kernel/x86/zcopy.S | 30 +-
kernel/x86/zcopy_sse.S | 4 +-
kernel/x86/zcopy_sse2.S | 2 +-
kernel/x86/zdot.S | 6 +-
kernel/x86/zdot_amd.S | 6 +-
kernel/x86/zdot_sse.S | 2 +-
kernel/x86/zdot_sse2.S | 234 +--
kernel/x86/zgemm3m_kernel_1x4_athlon.S | 30 +-
kernel/x86/zgemm3m_kernel_2x2_atom.S | 24 +-
kernel/x86/zgemm3m_kernel_2x2_coppermine.S | 32 +-
kernel/x86/zgemm3m_kernel_2x4_barcelona.S | 34 +-
kernel/x86/zgemm3m_kernel_2x4_opteron.S | 50 +-
kernel/x86/zgemm3m_kernel_2x4_penryn.S | 28 +-
kernel/x86/zgemm3m_kernel_2x4_prescott.S | 36 +-
kernel/x86/zgemm3m_kernel_4x2_core2.S | 28 +-
kernel/x86/zgemm3m_kernel_4x2_northwood.S | 66 +-
kernel/x86/zgemm3m_kernel_4x4_barcelona.S | 64 +-
kernel/x86/zgemm3m_kernel_4x4_opteron.S | 64 +-
kernel/x86/zgemm3m_kernel_4x4_penryn.S | 30 +-
kernel/x86/zgemm3m_kernel_4x4_prescott.S | 62 +-
kernel/x86/zgemm3m_kernel_8x2_core2.S | 34 +-
kernel/x86/zgemm3m_kernel_8x2_sse.S | 102 +-
kernel/x86/zgemm_beta.S | 2 +-
kernel/x86/zgemm_kernel_1x1.S | 8 +-
kernel/x86/zgemm_kernel_1x1_atom.S | 14 +-
kernel/x86/zgemm_kernel_1x2.S | 20 +-
kernel/x86/zgemm_kernel_1x2_3dnow.S | 16 +-
kernel/x86/zgemm_kernel_1x2_barcelona.S | 20 +-
kernel/x86/zgemm_kernel_1x2_penryn.S | 22 +-
kernel/x86/zgemm_kernel_1x2_sse2.S | 32 +-
kernel/x86/zgemm_kernel_1x2_sse3.S | 28 +-
kernel/x86/zgemm_kernel_2x1_core2.S | 20 +-
kernel/x86/zgemm_kernel_2x1_sse2.S | 28 +-
kernel/x86/zgemm_kernel_2x2_barcelona.S | 46 +-
kernel/x86/zgemm_kernel_2x2_penryn.S | 46 +-
kernel/x86/zgemm_kernel_2x2_sse.S | 48 +-
kernel/x86/zgemm_kernel_2x2_sse3.S | 46 +-
kernel/x86/zgemm_kernel_4x1_core2.S | 28 +-
kernel/x86/zgemm_kernel_4x1_sse.S | 48 +-
kernel/x86/zgemm_ncopy_2.S | 2 +-
kernel/x86/zgemm_tcopy_2.S | 4 +-
kernel/x86/zgemv_n.S | 4 +-
kernel/x86/zgemv_n_atom.S | 6 +-
kernel/x86/zgemv_n_sse.S | 8 +-
kernel/x86/zgemv_n_sse2.S | 8 +-
kernel/x86/zgemv_t.S | 2 +-
kernel/x86/zgemv_t_atom.S | 10 +-
kernel/x86/zgemv_t_sse.S | 16 +-
kernel/x86/zgemv_t_sse2.S | 10 +-
kernel/x86/znrm2.S | 10 +-
kernel/x86/znrm2_sse.S | 12 +-
kernel/x86/zrot.S | 8 +-
kernel/x86/zrot_sse.S | 4 +-
kernel/x86/zrot_sse2.S | 2 +-
kernel/x86/zscal.S | 2 +-
kernel/x86/zscal_sse.S | 2 +-
kernel/x86/zscal_sse2.S | 16 +-
kernel/x86/zswap.S | 2 +-
kernel/x86/zswap_sse.S | 8 +-
kernel/x86/zswap_sse2.S | 2 +-
kernel/x86/ztrsm_kernel_LN_2x1_core2.S | 14 +-
kernel/x86/ztrsm_kernel_LN_2x1_sse2.S | 18 +-
kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 42 +-
kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 30 +-
kernel/x86/ztrsm_kernel_LN_4x1_sse.S | 24 +-
kernel/x86/ztrsm_kernel_LT_1x1.S | 8 +-
kernel/x86/ztrsm_kernel_LT_1x1_atom.S | 12 +-
kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 20 +-
kernel/x86/ztrsm_kernel_LT_1x2_sse2.S | 28 +-
kernel/x86/ztrsm_kernel_LT_1x2_sse3.S | 20 +-
kernel/x86/ztrsm_kernel_LT_2x1_core2.S | 16 +-
kernel/x86/ztrsm_kernel_LT_2x1_sse2.S | 18 +-
kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 42 +-
kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 28 +-
kernel/x86/ztrsm_kernel_LT_4x1_sse.S | 24 +-
kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 20 +-
kernel/x86/ztrsm_kernel_RT_1x2_sse2.S | 24 +-
kernel/x86/ztrsm_kernel_RT_1x2_sse3.S | 20 +-
kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 42 +-
kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 28 +-
kernel/x86_64/KERNEL | 6 +-
kernel/x86_64/KERNEL.ATOM | 4 +-
kernel/x86_64/KERNEL.BARCELONA | 6 +-
kernel/x86_64/KERNEL.BOBCAT | 6 +-
kernel/x86_64/KERNEL.BULLDOZER | 8 +-
kernel/x86_64/KERNEL.CORE2 | 4 +-
kernel/x86_64/KERNEL.DUNNINGTON | 4 +-
kernel/x86_64/KERNEL.HASWELL | 9 +-
kernel/x86_64/KERNEL.NANO | 4 +-
kernel/x86_64/KERNEL.NEHALEM | 28 +-
kernel/x86_64/KERNEL.OPTERON | 4 +-
kernel/x86_64/KERNEL.OPTERON_SSE3 | 6 +-
kernel/x86_64/KERNEL.PENRYN | 4 +-
kernel/x86_64/KERNEL.PILEDRIVER | 8 +-
kernel/x86_64/KERNEL.PRESCOTT | 8 +-
kernel/x86_64/KERNEL.SANDYBRIDGE | 30 +-
kernel/x86_64/amax.S | 44 +-
kernel/x86_64/amax_atom.S | 16 +-
kernel/x86_64/amax_sse.S | 18 +-
kernel/x86_64/amax_sse2.S | 18 +-
kernel/x86_64/asum.S | 6 +-
kernel/x86_64/asum_atom.S | 22 +-
kernel/x86_64/asum_sse.S | 14 +-
kernel/x86_64/asum_sse2.S | 16 +-
kernel/x86_64/axpy.S | 6 +-
kernel/x86_64/axpy_atom.S | 2 +-
kernel/x86_64/axpy_sse.S | 8 +-
kernel/x86_64/axpy_sse2.S | 6 +-
kernel/x86_64/builtin_stinit.S | 2 +-
kernel/x86_64/cabs.S | 6 +-
kernel/x86_64/cgemm_kernel_4x2_bulldozer.S | 50 +-
kernel/x86_64/cgemm_kernel_4x2_piledriver.S | 49 +-
kernel/x86_64/cgemm_kernel_4x8_sandy.S | 108 +-
kernel/x86_64/cgemm_kernel_8x2_haswell.S | 49 +-
...rnel_8x2_haswell.S => cgemm_kernel_8x2_sandy.S} | 177 +-
kernel/x86_64/cgemv_n.S | 74 +-
kernel/x86_64/cgemv_t.S | 28 +-
kernel/x86_64/copy.S | 10 +-
kernel/x86_64/daxpy_bulldozer.S | 6 +-
kernel/x86_64/ddot_bulldozer.S | 4 +-
kernel/x86_64/dgemm_kernel_4x4_haswell.S | 2 +-
kernel/x86_64/dgemm_kernel_4x8_sandy.S | 154 +-
kernel/x86_64/dgemm_kernel_6x4_piledriver.S | 30 +-
kernel/x86_64/dgemm_kernel_8x2_bulldozer.S | 4 +-
kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 4 +-
kernel/x86_64/dgemm_ncopy_2.S | 2 +-
kernel/x86_64/dgemm_ncopy_4.S | 2 +-
kernel/x86_64/dgemm_ncopy_8.S | 2 +-
kernel/x86_64/dgemm_ncopy_8_bulldozer.S | 2 +-
kernel/x86_64/dgemm_tcopy_2.S | 4 +-
kernel/x86_64/dgemm_tcopy_4.S | 4 +-
kernel/x86_64/dgemm_tcopy_8.S | 4 +-
kernel/x86_64/dgemm_tcopy_8_bulldozer.S | 4 +-
kernel/x86_64/dgemv_n.S | 26 +-
kernel/x86_64/dgemv_n_atom.S | 8 +-
kernel/x86_64/dgemv_n_bulldozer.S | 22 +-
kernel/x86_64/dgemv_t.S | 8 +-
kernel/x86_64/dgemv_t_atom.S | 10 +-
kernel/x86_64/dgemv_t_bulldozer.S | 10 +-
kernel/x86_64/dot_atom.S | 4 +-
kernel/x86_64/dot_sse.S | 6 +-
kernel/x86_64/dot_sse2.S | 4 +-
kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S | 96 +-
kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S | 46 +-
kernel/x86_64/gemm_beta.S | 2 +-
kernel/x86_64/gemm_kernel_2x8_nehalem.S | 72 +-
kernel/x86_64/gemm_kernel_4x2_atom.S | 48 +-
kernel/x86_64/gemm_kernel_4x4_barcelona.S | 80 +-
kernel/x86_64/gemm_kernel_4x4_core2.S | 102 +-
kernel/x86_64/gemm_kernel_4x4_penryn.S | 88 +-
kernel/x86_64/gemm_kernel_4x4_sse2.S | 90 +-
kernel/x86_64/gemm_kernel_4x4_sse3.S | 80 +-
kernel/x86_64/gemm_kernel_4x8_nano.S | 106 +-
kernel/x86_64/gemm_kernel_4x8_nehalem.S | 114 +-
kernel/x86_64/gemm_kernel_8x4_barcelona.S | 116 +-
kernel/x86_64/gemm_kernel_8x4_core2.S | 118 +-
kernel/x86_64/gemm_kernel_8x4_penryn.S | 100 +-
kernel/x86_64/gemm_kernel_8x4_sse.S | 120 +-
kernel/x86_64/gemm_kernel_8x4_sse3.S | 114 +-
kernel/x86_64/gemm_ncopy_2.S | 2 +-
kernel/x86_64/gemm_ncopy_2_bulldozer.S | 2 +-
kernel/x86_64/gemm_ncopy_4.S | 2 +-
kernel/x86_64/gemm_ncopy_4_opteron.S | 4 +-
kernel/x86_64/gemm_tcopy_2.S | 4 +-
kernel/x86_64/gemm_tcopy_2_bulldozer.S | 10 +-
kernel/x86_64/gemm_tcopy_4.S | 2 +-
kernel/x86_64/gemm_tcopy_4_opteron.S | 2 +-
kernel/x86_64/iamax.S | 44 +-
kernel/x86_64/iamax_sse.S | 42 +-
kernel/x86_64/iamax_sse2.S | 40 +-
kernel/x86_64/izamax.S | 50 +-
kernel/x86_64/izamax_sse.S | 24 +-
kernel/x86_64/izamax_sse2.S | 14 +-
kernel/x86_64/nrm2.S | 8 +-
kernel/x86_64/nrm2_sse.S | 10 +-
kernel/x86_64/qdot.S | 2 +-
kernel/x86_64/qgemm_kernel_2x2.S | 40 +-
kernel/x86_64/qgemv_n.S | 10 +-
kernel/x86_64/qgemv_t.S | 4 +-
kernel/x86_64/qtrsm_kernel_LN_2x2.S | 34 +-
kernel/x86_64/qtrsm_kernel_LT_2x2.S | 34 +-
kernel/x86_64/qtrsm_kernel_RT_2x2.S | 34 +-
kernel/x86_64/rot.S | 6 +-
kernel/x86_64/rot_sse.S | 6 +-
kernel/x86_64/rot_sse2.S | 6 +-
kernel/x86_64/scal_atom.S | 6 +-
kernel/x86_64/scal_sse.S | 12 +-
kernel/x86_64/scal_sse2.S | 10 +-
kernel/x86_64/sgemm_kernel_16x2_bulldozer.S | 4 +-
kernel/x86_64/sgemm_kernel_16x2_piledriver.S | 4 +-
kernel/x86_64/sgemm_kernel_16x4_haswell.S | 4 +-
...el_16x4_haswell.S => sgemm_kernel_16x4_sandy.S} | 190 +-
kernel/x86_64/sgemm_kernel_8x4_bulldozer.S | 126 +-
kernel/x86_64/sgemm_kernel_8x8_sandy.S | 150 +-
kernel/x86_64/sgemv_n.S | 44 +-
kernel/x86_64/sgemv_t.S | 236 +--
kernel/x86_64/swap.S | 2 +-
kernel/x86_64/swap_sse.S | 6 +-
kernel/x86_64/swap_sse2.S | 2 +-
kernel/x86_64/symv_L_sse.S | 8 +-
kernel/x86_64/symv_L_sse2.S | 8 +-
kernel/x86_64/symv_U_sse.S | 8 +-
kernel/x86_64/symv_U_sse2.S | 10 +-
kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S | 68 +-
kernel/x86_64/trsm_kernel_LN_4x2_atom.S | 42 +-
kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S | 58 +-
kernel/x86_64/trsm_kernel_LN_4x4_core2.S | 82 +-
kernel/x86_64/trsm_kernel_LN_4x4_penryn.S | 76 +-
kernel/x86_64/trsm_kernel_LN_4x4_sse2.S | 74 +-
kernel/x86_64/trsm_kernel_LN_4x4_sse3.S | 64 +-
kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S | 102 +-
kernel/x86_64/trsm_kernel_LN_8x4_sse.S | 86 +-
kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S | 70 +-
kernel/x86_64/trsm_kernel_LT_4x2_atom.S | 44 +-
kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S | 60 +-
kernel/x86_64/trsm_kernel_LT_4x4_core2.S | 82 +-
kernel/x86_64/trsm_kernel_LT_4x4_penryn.S | 76 +-
kernel/x86_64/trsm_kernel_LT_4x4_sse2.S | 82 +-
kernel/x86_64/trsm_kernel_LT_4x4_sse3.S | 62 +-
kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S | 106 +-
kernel/x86_64/trsm_kernel_LT_8x4_sse.S | 92 +-
kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S | 68 +-
kernel/x86_64/trsm_kernel_RT_4x2_atom.S | 42 +-
kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S | 60 +-
kernel/x86_64/trsm_kernel_RT_4x4_core2.S | 80 +-
kernel/x86_64/trsm_kernel_RT_4x4_penryn.S | 76 +-
kernel/x86_64/trsm_kernel_RT_4x4_sse2.S | 78 +-
kernel/x86_64/trsm_kernel_RT_4x4_sse3.S | 62 +-
kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S | 106 +-
kernel/x86_64/trsm_kernel_RT_8x4_sse.S | 90 +-
kernel/x86_64/xdot.S | 2 +-
kernel/x86_64/xgemm3m_kernel_2x2.S | 40 +-
kernel/x86_64/xgemm_kernel_1x1.S | 28 +-
kernel/x86_64/xgemv_n.S | 6 +-
kernel/x86_64/xgemv_t.S | 4 +-
kernel/x86_64/xtrsm_kernel_LT_1x1.S | 30 +-
kernel/x86_64/zamax.S | 50 +-
kernel/x86_64/zamax_atom.S | 12 +-
kernel/x86_64/zamax_sse.S | 16 +-
kernel/x86_64/zamax_sse2.S | 10 +-
kernel/x86_64/zasum.S | 6 +-
kernel/x86_64/zasum_atom.S | 20 +-
kernel/x86_64/zasum_sse.S | 16 +-
kernel/x86_64/zasum_sse2.S | 14 +-
kernel/x86_64/zaxpy.S | 6 +-
kernel/x86_64/zaxpy_atom.S | 6 +-
kernel/x86_64/zaxpy_sse.S | 12 +-
kernel/x86_64/zaxpy_sse2.S | 18 +-
kernel/x86_64/zcopy.S | 2 +-
kernel/x86_64/zcopy_sse.S | 4 +-
kernel/x86_64/zdot.S | 2 +-
kernel/x86_64/zdot_sse.S | 488 ++---
kernel/x86_64/zdot_sse2.S | 236 +--
kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S | 72 +-
kernel/x86_64/zgemm3m_kernel_4x2_atom.S | 16 +-
kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S | 108 +-
kernel/x86_64/zgemm3m_kernel_4x4_core2.S | 116 +-
kernel/x86_64/zgemm3m_kernel_4x4_penryn.S | 82 +-
kernel/x86_64/zgemm3m_kernel_4x4_sse2.S | 108 +-
kernel/x86_64/zgemm3m_kernel_4x4_sse3.S | 86 +-
kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S | 112 +-
kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S | 118 +-
kernel/x86_64/zgemm3m_kernel_8x4_core2.S | 128 +-
kernel/x86_64/zgemm3m_kernel_8x4_penryn.S | 106 +-
kernel/x86_64/zgemm3m_kernel_8x4_sse.S | 124 +-
kernel/x86_64/zgemm3m_kernel_8x4_sse3.S | 116 +-
kernel/x86_64/zgemm_beta.S | 2 +-
kernel/x86_64/zgemm_kernel_1x4_nehalem.S | 30 +-
kernel/x86_64/zgemm_kernel_2x1_atom.S | 28 +-
kernel/x86_64/zgemm_kernel_2x2_barcelona.S | 42 +-
kernel/x86_64/zgemm_kernel_2x2_bulldozer.S | 51 +-
kernel/x86_64/zgemm_kernel_2x2_core2.S | 56 +-
kernel/x86_64/zgemm_kernel_2x2_penryn.S | 44 +-
kernel/x86_64/zgemm_kernel_2x2_piledriver.S | 53 +-
kernel/x86_64/zgemm_kernel_2x2_sse2.S | 68 +-
kernel/x86_64/zgemm_kernel_2x2_sse3.S | 48 +-
kernel/x86_64/zgemm_kernel_2x4_nehalem.S | 66 +-
kernel/x86_64/zgemm_kernel_4x2_barcelona.S | 78 +-
kernel/x86_64/zgemm_kernel_4x2_core2.S | 72 +-
kernel/x86_64/zgemm_kernel_4x2_haswell.S | 49 +-
kernel/x86_64/zgemm_kernel_4x2_penryn.S | 64 +-
kernel/x86_64/zgemm_kernel_4x2_sse.S | 80 +-
kernel/x86_64/zgemm_kernel_4x2_sse3.S | 84 +-
kernel/x86_64/zgemm_kernel_4x4_sandy.S | 92 +-
kernel/x86_64/zgemm_ncopy_1.S | 2 +-
kernel/x86_64/zgemm_ncopy_2.S | 2 +-
kernel/x86_64/zgemm_tcopy_1.S | 4 +-
kernel/x86_64/zgemm_tcopy_2.S | 2 +-
kernel/x86_64/zgemv_n.S | 22 +-
kernel/x86_64/zgemv_n_atom.S | 8 +-
kernel/x86_64/zgemv_n_dup.S | 8 +-
kernel/x86_64/zgemv_t.S | 22 +-
kernel/x86_64/zgemv_t_atom.S | 10 +-
kernel/x86_64/zgemv_t_dup.S | 16 +-
kernel/x86_64/znrm2.S | 8 +-
kernel/x86_64/znrm2_sse.S | 10 +-
kernel/x86_64/zrot.S | 6 +-
kernel/x86_64/zrot_sse.S | 6 +-
kernel/x86_64/zrot_sse2.S | 4 +-
kernel/x86_64/zscal_atom.S | 6 +-
kernel/x86_64/zscal_sse.S | 6 +-
kernel/x86_64/zscal_sse2.S | 18 +-
kernel/x86_64/zswap.S | 2 +-
kernel/x86_64/zswap_sse.S | 6 +-
kernel/x86_64/zsymv_L_sse.S | 8 +-
kernel/x86_64/zsymv_L_sse2.S | 8 +-
kernel/x86_64/zsymv_U_sse.S | 10 +-
kernel/x86_64/zsymv_U_sse2.S | 8 +-
kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S | 24 +-
kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S | 46 +-
kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S | 40 +-
kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S | 50 +-
kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S | 38 +-
kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S | 56 +-
kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S | 52 +-
kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S | 32 +-
kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S | 26 +-
kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S | 50 +-
kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S | 40 +-
kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S | 48 +-
kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S | 40 +-
kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S | 62 +-
kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S | 56 +-
kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S | 32 +-
kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S | 48 +-
kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S | 40 +-
kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S | 50 +-
kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S | 40 +-
kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S | 62 +-
kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S | 56 +-
lapack-devel.log | 14 +-
lapack-netlib/Makefile | 1 -
lapack-netlib/TESTING/Makefile | 1 -
lapack/getf2/getf2_k.c | 8 +-
lapack/getf2/zgetf2_k.c | 10 +-
lapack/getrf/getrf_parallel.c | 190 +-
lapack/getrf/getrf_parallel_omp.c | 36 +-
lapack/getrf/getrf_single.c | 38 +-
lapack/getrs/getrs_parallel.c | 8 +-
lapack/getrs/zgetrs_parallel.c | 2 +-
lapack/getrs/zgetrs_single.c | 4 +-
lapack/laswp/generic/Makefile | 2 +-
lapack/laswp/generic/laswp_k_1.c | 48 +-
lapack/laswp/generic/laswp_k_2.c | 98 +-
lapack/laswp/generic/laswp_k_4.c | 132 +-
lapack/laswp/generic/laswp_k_8.c | 180 +-
lapack/laswp/generic/zlaswp_k_1.c | 52 +-
lapack/laswp/generic/zlaswp_k_2.c | 90 +-
lapack/laswp/generic/zlaswp_k_4.c | 140 +-
lapack/lauu2/lauu2_L.c | 10 +-
lapack/lauu2/lauu2_U.c | 10 +-
lapack/lauu2/zlauu2_L.c | 8 +-
lapack/lauu2/zlauu2_U.c | 10 +-
lapack/lauum/lauum_L_parallel.c | 12 +-
lapack/lauum/lauum_L_single.c | 80 +-
lapack/lauum/lauum_U_parallel.c | 10 +-
lapack/lauum/lauum_U_single.c | 76 +-
lapack/potf2/potf2_L.c | 6 +-
lapack/potf2/potf2_U.c | 6 +-
lapack/potf2/zpotf2_L.c | 4 +-
lapack/potf2/zpotf2_U.c | 6 +-
lapack/potrf/potrf_L_parallel.c | 14 +-
lapack/potrf/potrf_L_single.c | 24 +-
lapack/potrf/potrf_U_parallel.c | 14 +-
lapack/potrf/potrf_U_single.c | 42 +-
lapack/potrf/potrf_parallel.c | 114 +-
lapack/trti2/trti2_L.c | 4 +-
lapack/trti2/trti2_U.c | 8 +-
lapack/trti2/ztrti2_L.c | 6 +-
lapack/trti2/ztrti2_U.c | 10 +-
lapack/trtri/trtri_L_parallel.c | 8 +-
lapack/trtri/trtri_U_parallel.c | 8 +-
make.inc | 1 -
param.h | 71 +-
reference/Makefile | 8 +-
reference/cspmvf.f | 2 +-
reference/ctpmvf.f | 2 +-
reference/sgetrff.f | 2 +-
reference/sgetrsf.f | 2 +-
reference/spotrff.f | 2 +-
reference/strtrif.f | 2 +-
reference/ztpmvf.f | 2 +-
reference/ztrmvf.f | 2 +-
segfaults.patch | 2 +-
symcopy.h | 352 ++--
test/Makefile | 2 +-
test/get_threading_model.c | 24 +-
test/sblat2.f | 2 +-
utest/Makefile | 10 +-
utest/common_utest.h | 22 +-
utest/main.c | 42 +-
utest/test_amax.c | 24 +-
utest/test_axpy.c | 22 +-
utest/test_dotu.c | 26 +-
utest/test_dsdot.c | 26 +-
utest/test_fork.c | 8 +-
utest/test_rot.c | 22 +-
utest/test_rotmg.c | 24 +-
version.h | 22 +-
1459 files changed, 26294 insertions(+), 22414 deletions(-)
diff --git a/.gitignore b/.gitignore
index ac96616..7422cea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,8 +21,10 @@ lapack-netlib/TESTING/testing_results.txt
lib.grd
nohup.out
config.h
+config_kernel.h
Makefile.conf
Makefile.conf_last
+Makefile_kernel.conf
config_last.h
getarch
getarch_2nd
@@ -41,6 +43,8 @@ ctest/xzcblat2
ctest/xzcblat3
exports/linktest.c
exports/linux.def
+kernel/setparam_*.c
+kernel/kernel_*.h
test/CBLAT2.SUMM
test/CBLAT3.SUMM
test/DBLAT2.SUMM
diff --git a/.travis.yml b/.travis.yml
index 46d70a0..7d625c9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,7 +11,7 @@ env:
before_install:
- sudo apt-get update -qq
- - sudo apt-get install -qq gfortran
+ - sudo apt-get install -qq gfortran
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 32c30e4..58748ea 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -31,7 +31,7 @@
* Improve the windows build.
* Chen Shaohu <huhumartinwar at gmail.com>
- * Optimize GEMV on the Loongson 3A processor.
+ * Optimize GEMV on the Loongson 3A processor.
* Luo Wen
* Intern. Test Level-2 BLAS.
@@ -53,11 +53,11 @@ In chronological order:
* [2012-05-19] Fix building bug on FreeBSD and NetBSD.
* Sylvestre Ledru <https://github.com/sylvestre>
- * [2012-07-01] Improve the detection of sparc. Fix building bug under
+ * [2012-07-01] Improve the detection of sparc. Fix building bug under
Hurd and kfreebsd.
* Jameson Nash <https://github.com/vtjnash>
- * [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to
+ * [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to
make on the command line.
* Alexander Nasonov <alnsn at yandex.ru>
@@ -80,7 +80,7 @@ In chronological order:
* [2013-06-30] Add Intel Haswell support (using sandybridge optimizations).
* grisuthedragon <https://github.com/grisuthedragon>
- * [2013-07-11] create openblas_get_parallel to retrieve information which parallelization
+ * [2013-07-11] create openblas_get_parallel to retrieve information which parallelization
model is used by OpenBLAS.
* Elliot Saba <staticfloat at gmail.com>
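The openblas_get_parallel interface credited just above reports which threading back-end the library was built with. A minimal sketch of a caller follows, assuming the usual return convention (0 = sequential, 1 = pthreads, 2 = OpenMP); confirm both the declaration and the codes against this release's headers before relying on them.

/* Hedged sketch: query the OpenBLAS threading model at runtime.
 * Assumed return codes: 0 = sequential, 1 = pthreads, 2 = OpenMP.
 * Recent cblas.h versions declare this; the extern below is a fallback. */
#include <stdio.h>

extern int openblas_get_parallel(void);

int main(void) {
    int p = openblas_get_parallel();
    if (p == 0)
        printf("OpenBLAS: sequential (no threading)\n");
    else if (p == 1)
        printf("OpenBLAS: pthreads\n");
    else if (p == 2)
        printf("OpenBLAS: OpenMP\n");
    else
        printf("OpenBLAS: unknown parallel model %d\n", p);
    return 0;
}

Link against libopenblas (for example, cc test.c -lopenblas) to try it.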
diff --git a/Changelog.txt b/Changelog.txt
index 195d98b..5b85227 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,5 +1,25 @@
OpenBLAS ChangeLog
====================================================================
+Version 0.2.10
+16-Jul-2014
+common:
+ * Added the following BLAS extensions:
+ s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy.
+ * Added OPENBLAS_CORETYPE environment variable for DYNAMIC_ARCH builds. (a86d34)
+ * Added NO_AVX2 flag for old binutils. (#401)
+ * Support outputting the CPU core name at runtime. (#407)
+ * Patched LAPACK to fix bugs 114, 117, and 118.
+ (http://www.netlib.org/lapack/bug_list.html)
+ * Disabled ?gemm3m as a work-around fix. (#400)
+x86/x86-64:
+ * Fixed many bugs in the optimized kernels for Sandy Bridge, Haswell,
+ Bulldozer, and Piledriver.
+ https://github.com/xianyi/OpenBLAS/wiki/Fixed-optimized-kernels-To-do-List
+
+ARM:
+ * Improved LAPACK testing.
+
+====================================================================
Version 0.2.9
10-Jun-2014
common:
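The new extension routines listed in the 0.2.10 entry above (?axpby plus the ?omatcopy/?imatcopy scaled matrix copy/transpose functions) are plain CBLAS-style calls. The following is a rough sketch only, assuming the conventional OpenBLAS prototypes for cblas_daxpby and cblas_domatcopy; the exact signatures should be checked against the cblas.h shipped in this version.

/* Hedged sketch of the daxpby and domatcopy extensions named above.
 * Prototypes assumed:
 *   cblas_daxpby(n, alpha, x, incx, beta, y, incy)       y := alpha*x + beta*y
 *   cblas_domatcopy(order, trans, rows, cols, alpha, a, lda, b, ldb)
 * Build assumption: cc example.c -lopenblas
 */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    double x[3] = {1.0, 2.0, 3.0};
    double y[3] = {4.0, 5.0, 6.0};

    /* y := 2*x + 0.5*y */
    cblas_daxpby(3, 2.0, x, 1, 0.5, y, 1);

    double a[6] = {1, 2, 3, 4, 5, 6};   /* A is 2x3, row-major */
    double b[6];                        /* receives 1.0 * A^T (3x2) */
    cblas_domatcopy(CblasRowMajor, CblasTrans, 2, 3, 1.0, a, 3, b, 2);

    for (int i = 0; i < 3; i++)
        printf("y[%d] = %g  b[%d] = %g\n", i, y[i], i, b[i]);
    return 0;
}

The OPENBLAS_CORETYPE variable from the same entry is read at library start-up in DYNAMIC_ARCH builds, so a run such as OPENBLAS_CORETYPE=Nehalem ./a.out should pin the kernel selection; the accepted core names (and their casing) are release-specific.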
@@ -55,25 +75,25 @@ Version 0.2.7
common:
* Support LSB (Linux Standard Base) 4.1.
e.g. make CC=lsbcc
- * Include LAPACK 3.4.2 source codes to the repo.
+ * Include LAPACK 3.4.2 source codes to the repo.
Avoid downloading at compile time.
* Add NO_PARALLEL_MAKE flag to disable parallel make.
- * Create openblas_get_parallel to retrieve information which
+ * Create openblas_get_parallel to retrieve information which
parallelization model is used by OpenBLAS. (Thank grisuthedragon)
* Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X.
* Change LIBSUFFIX from .lib to .a on windows.
* A work-around for dtrti_U single thread bug. Replace it with LAPACK codes. (#191)
x86/x86-64:
- * Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on
+ * Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on
AMD Bulldozer. (Thank Werner Saar)
* Add Intel Haswell support (using Sandybridge optimizations).
(Thank Dan Luu)
* Add AMD Piledriver support (using Bulldozer optimizations).
- * Fix the computational error in zgemm avx kernel on
+ * Fix the computational error in zgemm avx kernel on
Sandybridge. (#237)
* Fix the overflow bug in gemv.
- * Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS
+ * Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS
is very large.(#214, #221, #246).
MIPS64:
* Support loongcc (Open64 based) compiler for ICT Loongson 3A/B.
@@ -110,7 +130,7 @@ common:
* Fixed NetBSD build. (#155)
* Fixed compilation with TARGET=GENERIC. (#160)
x86/x86-64:
- * Restore the original CPU affinity when calling
+ * Restore the original CPU affinity when calling
openblas_set_num_threads(1) (#153)
* Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154)
MIPS64:
@@ -120,13 +140,13 @@ Version 0.2.4
8-Oct-2012
common:
* Upgraded LAPACK to 3.4.2 version. (#145)
- * Provided support for passing CFLAGS, FFLAGS, PFLAGS,
+ * Provided support for passing CFLAGS, FFLAGS, PFLAGS,
FPFLAGS to make. (#137)
- * f77blas.h:compatibility for compilers without C99 complex
+ * f77blas.h:compatibility for compilers without C99 complex
number support. (#141)
x86/x86-64:
* Added NO_AVX flag. Check OS supporting AVX on runtime. (#139)
- * Fixed zdot incompatibility ABI issue with GCC 4.7 on
+ * Fixed zdot incompatibility ABI issue with GCC 4.7 on
Windows 32-bit. (#140)
MIPS64:
* Fixed the generation of shared library bug.
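The zdot ABI problems referenced above (#140, and #58 further down) come from returning a complex value from the Fortran-style zdotu/zdotc entry points. The CBLAS interface avoids the issue by writing the result through an output pointer; an illustrative sketch using the standard cblas_zdotu_sub call:

    #include <stdio.h>
    #include <complex.h>
    #include <cblas.h>

    int main(void) {
        double complex x[2] = {1.0 + 2.0*I, 3.0 - 1.0*I};
        double complex y[2] = {2.0 + 0.0*I, 1.0 + 1.0*I};
        double complex dotu;

        /* The result comes back through the last argument rather than as a
           complex return value, sidestepping the hidden-parameter ABI issue. */
        cblas_zdotu_sub(2, x, 1, y, 1, &dotu);

        printf("zdotu = %g + %gi\n", creal(dotu), cimag(dotu));   /* 6 + 6i */
        return 0;
    }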
@@ -136,14 +156,14 @@ Version 0.2.3
20-Aug-2012
common:
* Fixed LAPACK unstable bug about ?laswp. (#130)
- * Fixed the shared library bug about unloading the library on
+ * Fixed the shared library bug about unloading the library on
Linux (#132).
* Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2)
Please use gcc and IBM xlf. (#134)
x86/x86-64:
- * Supported goto_set_num_threads and openblas_set_num_threads
+ * Supported goto_set_num_threads and openblas_set_num_threads
APIs in Windows. They can set the number of threads on runtime.
-
+
====================================================================
Version 0.2.2
6-July-2012
@@ -191,14 +211,14 @@ x86/x86_64:
* Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
* Test alpha=NaN in dscale.
* Fixed a SEGFAULT bug in samax on x86 windows.
-
+
====================================================================
Version 0.1.0
23-Mar-2012
common:
* Set soname of shared library on Linux.
- * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use
- this flag to control the library name, e.g. libopenblas.a,
+ * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use
+ this flag to control the library name, e.g. libopenblas.a,
libopenblas_ifort.a or libopenblas_omp.a.
* Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule.
The lib use single thread in GEMM function with small matrices.
@@ -229,7 +249,7 @@ x86/x86_64:
Version 0.1 alpha2.4
18-Sep-2011
common:
- * Fixed a bug about installation. The header file "fblas77.h"
+ * Fixed a bug about installation. The header file "fblas77.h"
works fine now.
* Fixed #61 a building bug about setting TARGET and DYNAMIC_ARCH.
* Try to handle absolute path of shared library in OSX. (#57)
@@ -238,16 +258,16 @@ common:
$(PREFIX)/lib
x86/x86_64:
- * Fixed #58 zdot/xdot SEGFAULT bug with GCC-4.6 on x86. According
- to the i386 calling convention, the callee should remove the first
- hidden parameter. Thanks to Mr. John for this patch.
+ * Fixed #58 zdot/xdot SEGFAULT bug with GCC-4.6 on x86. According
+ to the i386 calling convention, the callee should remove the first
+ hidden parameter. Thanks to Mr. John for this patch.
====================================================================
Version 0.1 alpha2.3
5-Sep-2011
x86/x86_64:
- * Added DTB_ENTRIES into dynamic arch setting parameters. Now,
+ * Added DTB_ENTRIES into dynamic arch setting parameters. Now,
it can read DTB_ENTRIES on runtime. (Refs issue #55 on github)
====================================================================
@@ -255,7 +275,7 @@ Version 0.1 alpha2.2
14-Jul-2011
common:
- * Fixed a building bug when DYNAMIC_ARCH=1 & INTERFACE64=1.
+ * Fixed a building bug when DYNAMIC_ARCH=1 & INTERFACE64=1.
(Refs issue #44 on github)
====================================================================
@@ -263,7 +283,7 @@ Version 0.1 alpha2.1
28-Jun-2011
common:
- * Stop the build and output the error message when detecting
+ * Stop the build and output the error message when detecting
fortran compiler failed. (Refs issue #42 on github)
====================================================================
@@ -271,16 +291,16 @@ Version 0.1 alpha2
23-Jun-2011
common:
- * Fixed blasint undefined bug in <cblas.h> file. Other software
+ * Fixed blasint undefined bug in <cblas.h> file. Other software
could include this header successfully. (Refs issue #13 on github)
- * Fixed the SEGFAULT bug on 64 cores. On SMP server, the number
- of CPUs or cores should be less than or equal to 64.(Refs issue #14
+ * Fixed the SEGFAULT bug on 64 cores. On SMP server, the number
+ of CPUs or cores should be less than or equal to 64.(Refs issue #14
on github)
* Support "void goto_set_num_threads(int num_threads)" and "void
openblas_set_num_threads(int num_threads)" when USE_OPENMP=1
- * Added extern "C" to support C++. Thank Tasio for the patch(Refs
+ * Added extern "C" to support C++. Thank Tasio for the patch(Refs
issue #21 on github)
- * Provided an error message when the arch is not supported.(Refs
+ * Provided an error message when the arch is not supported.(Refs
issue #19 on github)
* Fixed issue #23. Fixed a bug of f_check script about generating link flags.
* Added openblas_set_num_threads for Fortran.
@@ -298,7 +318,7 @@ x86/x86_64:
* Work-around #27: the low-performance axpy issue with small input sizes & multiple threads.
MIPS64:
- * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
+ * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
* Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2)
* Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3)
@@ -307,9 +327,9 @@ Version 0.1 alpha1
20-Mar-2011
common:
- * Support "make NO_LAPACK=1" to build the library without
+ * Support "make NO_LAPACK=1" to build the library without
LAPACK functions.
- * Fixed a random SEGFAULT when nodemask==NULL on Linux 2.6.34 and above.
+ * Fixed a random SEGFAULT when nodemask==NULL on Linux 2.6.34 and above.
Thanks to Mr. Ei-ji Nakama for providing this patch. (Refs issue #12 on github)
* Added DEBUG=1 rule in Makefile.rule to build debug version.
* Disable compiling quad precision in reference BLAS library(netlib BLAS).
@@ -318,15 +338,15 @@ common:
* Imported GotoBLAS2 1.13 BSD version
x86/x86_64:
- * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause
+ * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause
zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github)
- * Modified ?axpy functions to return same netlib BLAS results
+ * Modified ?axpy functions to return same netlib BLAS results
when incx==0 or incy==0 (Refs issue #7 on github)
- * Modified ?swap functions to return same netlib BLAS results
+ * Modified ?swap functions to return same netlib BLAS results
when incx==0 or incy==0 (Refs issue #6 on github)
- * Modified ?rot functions to return same netlib BLAS results
+ * Modified ?rot functions to return same netlib BLAS results
when incx==0 or incy==0 (Refs issue #4 on github)
- * Detect Intel Westmere,Intel Clarkdale and Intel Arrandale
+ * Detect Intel Westmere,Intel Clarkdale and Intel Arrandale
to use Nehalem codes.
* Fixed a typo bug about compiling dynamic ARCH library.
MIPS64:
diff --git a/GotoBLAS_01Readme.txt b/GotoBLAS_01Readme.txt
index fdde1e3..8635ceb 100644
--- a/GotoBLAS_01Readme.txt
+++ b/GotoBLAS_01Readme.txt
@@ -83,7 +83,7 @@
4. Supported precision
Now the x86/x86_64 version supports 80bit FP precision in addition to
-normal double precision and single precision. Currently only
+normal double precision and single precision. Currently only
gfortran supports 80bit FP with "REAL*10".
diff --git a/GotoBLAS_02QuickInstall.txt b/GotoBLAS_02QuickInstall.txt
index abf3807..330c585 100644
--- a/GotoBLAS_02QuickInstall.txt
+++ b/GotoBLAS_02QuickInstall.txt
@@ -32,9 +32,9 @@
GotoBLAS2 build complete.
- OS ... Linux
- Architecture ... x86_64
- BINARY ... 64bit
+ OS ... Linux
+ Architecture ... x86_64
+ BINARY ... 64bit
C compiler ... GCC (command line : gcc)
Fortran compiler ... PATHSCALE (command line : pathf90)
Library Name ... libgoto_barcelonap-r1.27.a (Multi threaded; Max
diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt
index be623d6..f3189ce 100644
--- a/GotoBLAS_03FAQ.txt
+++ b/GotoBLAS_03FAQ.txt
@@ -56,7 +56,7 @@
1.6 Q I use OpenMP compiler. How can I use GotoBLAS2 with it?
- A Please understand that OpenMP is a compromise approach to using
+ A Please understand that OpenMP is a compromise approach to using
threads. If you want to use OpenMP based code with GotoBLAS2, you
should enable "USE_OPENMP=1" in Makefile.rule.
diff --git a/GotoBLAS_05LargePage.txt b/GotoBLAS_05LargePage.txt
index fb7de6b..ec5106f 100644
--- a/GotoBLAS_05LargePage.txt
+++ b/GotoBLAS_05LargePage.txt
@@ -43,7 +43,7 @@
F) Other architectures which don't have the Large TLB enhancement
If you have root permission, please install the device driver which is
- located in drivers/mapper.
+ located in drivers/mapper.
$shell> cd drivers/mapper
$shell> make
diff --git a/GotoBLAS_06WeirdPerformance.txt b/GotoBLAS_06WeirdPerformance.txt
index 8046267..05766e1 100644
--- a/GotoBLAS_06WeirdPerformance.txt
+++ b/GotoBLAS_06WeirdPerformance.txt
@@ -4,7 +4,7 @@
probably you created too many threads or processes. Basically, GotoBLAS
assumes that the cores you specify are available exclusively for
BLAS computation. If even one small thread/process conflicts with the BLAS
- threads, performance will become worse.
+ threads, performance will become worse.
The best solution is to reduce your number of threads or insert
some synchronization mechanism and suspend your threads until BLAS
@@ -19,4 +19,4 @@
Anyway, if you see any weird performance loss, it means your code or
-algorithm is not optimal.
+algorithm is not optimal.
diff --git a/LICENSE b/LICENSE
index 1e93a6a..d15634e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -12,17 +12,17 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
index 2f5d032..2e37888 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ include ./Makefile.system
BLASDIRS = interface driver/level2 driver/level3 driver/others
ifneq ($(DYNAMIC_ARCH), 1)
-BLASDIRS += kernel
+BLASDIRS += kernel
endif
ifdef UTEST_CHECK
@@ -23,7 +23,7 @@ endif
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install
-.NOTPARALLEL : all libs prof lapack-test install
+.NOTPARALLEL : all libs prof lapack-test install blas-test
all :: libs netlib tests shared
@echo
@@ -36,9 +36,13 @@ ifndef BINARY64
else
@echo " BINARY ... 64bit "
endif
+
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
endif
+endif
+
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
@@ -149,7 +153,7 @@ endif
ifeq ($(EXPRECISION), 1)
@echo "#define EXPRECISION">> config_last.h
endif
-##
+##
ifeq ($(DYNAMIC_ARCH), 1)
@$(MAKE) -C kernel commonlibs || exit 1
@for d in $(DYNAMIC_CORE) ; \
@@ -183,7 +187,7 @@ blas :
fi; \
done
-hpl :
+hpl :
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(BLASDIRS) ../laswp exports ; \
do if test -d $$d; then \
@@ -206,7 +210,7 @@ hpl_p :
done
ifeq ($(NO_LAPACK), 1)
-netlib :
+netlib :
else
netlib : lapack_prebuild
@@ -243,15 +247,21 @@ ifndef NOFORTRAN
- at echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
- at echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
- at echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-ifeq ($(F_COMPILER), GFORTRAN)
+ifeq ($(FC), GFORTRAN)
- at echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc
+ifdef SMP
+ - at echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
+else
+ - at echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
+endif
else
- at echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
+ - at echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
- at cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-large.tgz :
+large.tgz :
ifndef NOFORTRAN
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
@@ -278,6 +288,11 @@ lapack-test :
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
+blas-test:
+ (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
+ make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
+ (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
+
dummy :
diff --git a/Makefile.alpha b/Makefile.alpha
index 2305483..bd4f4d5 100644
--- a/Makefile.alpha
+++ b/Makefile.alpha
@@ -50,7 +50,7 @@ endif
ifndef SMP
LIBCXML = -lcxml -lots -lm
-LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm
+LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm
else
LIBCXML = -lcxmlp -lots -lm
LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm
diff --git a/Makefile.ia64 b/Makefile.ia64
index 7ffcd1d..cdf3f7c 100644
--- a/Makefile.ia64
+++ b/Makefile.ia64
@@ -16,7 +16,7 @@ LIBMLIB = ../../level1/others/libmisc.a -L/opt/intel/fc/ia64/9.1.040/lib -L/opt
LIBSCSL = -L/opt/scsl/1.4.1.0/lib -Wl,-rpath,/opt/scsl/1.4.1.0/lib -lscs
ifndef SMP
-LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm
+LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm
else
LIBATLAS = -L$(HOME)/misc/lib -L/usr/lib/atlas3.6.0p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm
endif
diff --git a/Makefile.install b/Makefile.install
index 9fc8d7a..e0ccccb 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -22,7 +22,7 @@ install : lib.grd
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
-#for inc
+#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@@ -50,14 +50,14 @@ ifndef NO_LAPACKE
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
endif
-#for install static library
+#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
-#for install shared library
+#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), Linux)
@@ -76,14 +76,14 @@ ifeq ($(OSNAME), NetBSD)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
-ifeq ($(OSNAME), Darwin)
+ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
@-ln -fs $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
- @-cp $(LIBPREFIX).lib $(OPENBLAS_LIBRARY_DIR)
+ @-cp $(LIBDLLNAME).a $(OPENBLAS_LIBRARY_DIR)
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
diff --git a/Makefile.power b/Makefile.power
index c6d6aeb..7e2b473 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -5,7 +5,7 @@ FLAMEPATH = $(HOME)/flame/lib
#ifeq ($(CORE), CELL)
#CELL_SDK_ROOT = /opt/IBM/cell-sdk-1.1/sysroot/usr
#SPU_CC = spu-gcc
-#EXTRALIB += -lspe
+#EXTRALIB += -lspe
#endif
ifeq ($(OSNAME), Linux)
@@ -38,7 +38,7 @@ ASFLAGS = -a32
endif
endif
-# CCOMMON_OPT += -maltivec -mabi=altivec
+# CCOMMON_OPT += -maltivec -mabi=altivec
LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS)
@@ -57,7 +57,7 @@ endif
LIBVECLIB = -framework VecLib
ifndef SMP
-LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm
+LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm
LIBESSL = -lessl $(ESSLPATH) ../../level1/others/libmisc.a -lm
else
LIBATLAS = -L/usr/lib/atlas3.7.11p -lptf77blas -latlas -lm -lpthread
@@ -73,7 +73,7 @@ endif
LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib
ifndef SMP
-LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm
+LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm
LIBESSL = -lessl $(ESSLPATH) -lm
else
LIBATLAS = -L/usr/lib64/atlas3.7.11p -lptf77blas -latlas -lm -lpthread
diff --git a/Makefile.rule b/Makefile.rule
index 5bbabe3..1969761 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -1,12 +1,12 @@
#
-# Beginning of user configuration
+# Beginning of user configuration
#
# This library's version
-VERSION = 0.2.9
+VERSION = 0.2.10
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
-# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
+# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
# is libopenblas_$(LIBNAMESUFFIX).so.0.
# LIBNAMESUFFIX = omp
@@ -25,9 +25,20 @@ VERSION = 0.2.9
# FC = gfortran
# You can even specify a cross compiler. In that case, please also set HOSTCC.
+
+# cross compiler for Windows
# CC = x86_64-w64-mingw32-gcc
# FC = x86_64-w64-mingw32-gfortran
+# cross compiler for 32bit ARM
+# CC = arm-linux-gnueabihf-gcc
+# FC = arm-linux-gnueabihf-gfortran
+
+# cross compiler for 64bit ARM
+# CC = aarch64-linux-gnu-gcc
+# FC = aarch64-linux-gnu-gfortran
+
+
# If you use the cross compiler, please set this host compiler.
# HOSTCC = gcc
@@ -57,11 +68,11 @@ VERSION = 0.2.9
# If you don't need CBLAS interface, please comment it in.
# NO_CBLAS = 1
-# If you only want CBLAS interface without installing Fortran compiler,
+# If you only want CBLAS interface without installing Fortran compiler,
# please comment it in.
# ONLY_CBLAS = 1
-# If you don't need LAPACK, please comment it in.
+# If you don't need LAPACK, please comment it in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1
@@ -84,10 +95,13 @@ NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
NO_AFFINITY = 1
-# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
+# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
# and OS. However, the performance is low.
# NO_AVX = 1
+# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
+# NO_AVX2 = 1
+
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
@@ -112,8 +126,8 @@ NO_AFFINITY = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
-# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
-# with a single thread. You can use this flag to avoid the overhead of multi-threading
+# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
+# with a single thread. You can use this flag to avoid the overhead of multi-threading
# for small matrix sizes. The default value is 4.
# GEMM_MULTITHREAD_THRESHOLD = 4
@@ -128,12 +142,13 @@ NO_AFFINITY = 1
# The installation directory.
# PREFIX = /opt/OpenBLAS
-# Common Optimization Flag;
+# Common Optimization Flag;
# The default -O2 is enough.
# COMMON_OPT = -O2
# gfortran option for LAPACK
-FCOMMON_OPT = -frecursive
+# Enable this flag only on 64bit Linux, and only if you need a thread-safe LAPACK library.
+# FCOMMON_OPT = -frecursive
# Profiling flags
COMMON_PROF = -pg
@@ -142,5 +157,5 @@ COMMON_PROF = -pg
# DEBUG = 1
#
-# End of user configuration
+# End of user configuration
#
diff --git a/Makefile.sparc b/Makefile.sparc
index c58c77e..8895b96 100644
--- a/Makefile.sparc
+++ b/Makefile.sparc
@@ -27,7 +27,7 @@ LIBNAME = $(LIBPREFIX).a
ifndef SMP
LIBCXML = -L/opt/SUNWspro/lib/v9
-LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm
+LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm
else
LIBCXML = -lcxmlp -lots -lm
endif
diff --git a/Makefile.system b/Makefile.system
index ade4f93..370da59 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -35,7 +35,7 @@ include $(TOPDIR)/$(MAKEFILE_RULE)
endif
#
-# Beginning of system configuration
+# Beginning of system configuration
#
ifndef HOSTCC
@@ -46,25 +46,73 @@ ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
endif
+# Force fallbacks for 32bit
+
+ifeq ($(BINARY), 32)
+ifeq ($(TARGET), HASWELL)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
+ifeq ($(TARGET), SANDYBRIDGE)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
+ifeq ($(TARGET), BULLDOZER)
+GETARCH_FLAGS := -DFORCE_BARCELONA
+endif
+ifeq ($(TARGET), PILEDRIVER)
+GETARCH_FLAGS := -DFORCE_BARCELONA
+endif
+endif
+
+
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
#
ifdef TARGET_CORE
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
endif
+# Force fallbacks for 32bit
+
+ifeq ($(BINARY), 32)
+ifeq ($(TARGET_CORE), HASWELL)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
+ifeq ($(TARGET_CORE), SANDYBRIDGE)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
+ifeq ($(TARGET_CORE), BULLDOZER)
+GETARCH_FLAGS := -DFORCE_BARCELONA
+endif
+ifeq ($(TARGET_CORE), PILEDRIVER)
+GETARCH_FLAGS := -DFORCE_BARCELONA
+endif
+endif
+
+
+
+
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
GETARCH_FLAGS += -DUSE64BITINT
endif
+endif
ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4
endif
-GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
+GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
ifeq ($(NO_AVX), 1)
GETARCH_FLAGS += -DNO_AVX
endif
+ifeq ($(BINARY), 32)
+GETARCH_FLAGS += -DNO_AVX
+endif
+
+ifeq ($(NO_AVX2), 1)
+GETARCH_FLAGS += -DNO_AVX2
+endif
+
ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
@@ -186,14 +234,14 @@ GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
# GCC major version > 4
-# It is compatible with MSVC ABI.
+# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
ifeq ($(GCCVERSIONGTEQ4), 1)
ifeq ($(GCCMINORVERSIONGTEQ7), 1)
# GCC Version >=4.7
-# It is compatible with MSVC ABI.
+# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
endif
@@ -273,7 +321,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
-CCOMMON_OPT += -DEXPRECISION
+CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@@ -291,7 +339,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
-CCOMMON_OPT += -DEXPRECISION
+CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@@ -336,15 +384,15 @@ ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
-ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
-endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
+endif
+ifneq ($(NO_AVX2), 1)
+DYNAMIC_CORE += HASWELL
endif
endif
@@ -414,12 +462,12 @@ endif
BINARY_DEFINED = 1
endif
-ifeq ($(CORE), LOONGSON3A)
+ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
-ifeq ($(CORE), LOONGSON3B)
+ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
@@ -489,7 +537,7 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
-EXTRALIB += -lgfortran
+EXTRALIB += -lgfortran
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
@@ -503,8 +551,10 @@ else
ifdef BINARY64
FCOMMON_OPT += -m64
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -fdefault-integer-8
endif
+endif
else
FCOMMON_OPT += -m32
endif
@@ -517,8 +567,10 @@ endif
ifeq ($(F_COMPILER), INTEL)
CCOMMON_OPT += -DF_INTERFACE_INTEL
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
+endif
ifdef USE_OPENMP
FCOMMON_OPT += -openmp
endif
@@ -537,8 +589,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
ifdef BINARY64
FCOMMON_OPT += -q64
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -qintsize=8
endif
+endif
else
FCOMMON_OPT += -q32
endif
@@ -552,8 +606,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI
COMMON_PROF += -DPGICOMPILER
ifdef BINARY64
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
+endif
FCOMMON_OPT += -tp p7-64
else
FCOMMON_OPT += -tp p7
@@ -567,9 +623,11 @@ ifeq ($(F_COMPILER), PATHSCALE)
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
ifdef BINARY64
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
+endif
ifneq ($(ARCH), mips64)
ifndef BINARY64
@@ -594,9 +652,11 @@ ifeq ($(F_COMPILER), OPEN64)
CCOMMON_OPT += -DF_INTERFACE_OPEN64
ifdef BINARY64
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
+endif
ifeq ($(ARCH), mips64)
ifndef BINARY64
@@ -604,11 +664,11 @@ FCOMMON_OPT += -n32
else
FCOMMON_OPT += -n64
endif
-ifeq ($(CORE), LOONGSON3A)
+ifeq ($(CORE), LOONGSON3A)
FCOMMON_OPT += -loongson3 -static
endif
-ifeq ($(CORE), LOONGSON3B)
+ifeq ($(CORE), LOONGSON3B)
FCOMMON_OPT += -loongson3 -static
endif
@@ -634,11 +694,11 @@ CCOMMON_OPT += -n32
else
CCOMMON_OPT += -n64
endif
-ifeq ($(CORE), LOONGSON3A)
+ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -loongson3 -static
endif
-ifeq ($(CORE), LOONGSON3B)
+ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -loongson3 -static
endif
@@ -682,21 +742,23 @@ endif
ifdef BINARY64
ifdef INTERFACE64
-CCOMMON_OPT +=
+ifneq ($(INTERFACE64), 0)
+CCOMMON_OPT +=
#-DUSE64BITINT
endif
endif
+endif
ifeq ($(NEED_PIC), 1)
ifeq ($(C_COMPILER), IBM)
-CCOMMON_OPT += -qpic=large
+CCOMMON_OPT += -qpic=large
else
-CCOMMON_OPT += -fPIC
+CCOMMON_OPT += -fPIC
endif
ifeq ($(F_COMPILER), SUN)
FCOMMON_OPT += -pic
else
-FCOMMON_OPT += -fPIC
+FCOMMON_OPT += -fPIC
endif
endif
@@ -718,6 +780,14 @@ ifeq ($(NO_AVX), 1)
CCOMMON_OPT += -DNO_AVX
endif
+ifeq ($(ARCH), x86)
+CCOMMON_OPT += -DNO_AVX
+endif
+
+ifeq ($(NO_AVX2), 1)
+CCOMMON_OPT += -DNO_AVX2
+endif
+
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
@@ -870,10 +940,13 @@ LAPACK_FPFLAGS := $(FPFLAGS)
endif
LAPACK_CFLAGS = $(CFLAGS)
-LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
+LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64
+ifneq ($(INTERFACE64), 0)
LAPACK_CFLAGS += -DLAPACK_ILP64
endif
+endif
+
ifdef OS_WINDOWS
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
endif
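A side note on the repeated ifneq ($(INTERFACE64), 0) guards added above: INTERFACE64=0 now behaves the same as leaving the flag unset. From C code, the visible effect of a non-zero INTERFACE64 build is the width of blasint; a small, purely illustrative sketch that only reports it (assuming blasint is exposed through the installed cblas.h):

    #include <stdio.h>
    #include <cblas.h>   /* brings in blasint via openblas_config.h */

    int main(void) {
        /* Expected: 8 when built with a non-zero INTERFACE64, 4 otherwise */
        printf("sizeof(blasint) = %zu\n", sizeof(blasint));
        return 0;
    }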
diff --git a/Makefile.tail b/Makefile.tail
index 56f8d82..2adede1 100644
--- a/Makefile.tail
+++ b/Makefile.tail
@@ -57,7 +57,7 @@ commonlibs :: $(COMMONOBJS)
commonprof :: $(COMMONOBJS_P)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^
-quick :
+quick :
$(MAKE) -C $(TOPDIR) libs
bms.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h
@@ -386,7 +386,7 @@ kbench_rank_k: kbench_rank_k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS
smallbench: smallbench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS)
$(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB)
-smallbench.mkl: smallbench.$(SUFFIX)
+smallbench.mkl: smallbench.$(SUFFIX)
$(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB)
bench.sun: bench.$(SUFFIX) $(OBJS)
@@ -410,7 +410,7 @@ bench.acml: bench.$(SUFFIX) $(OBJS)
bench.flame: bench.$(SUFFIX) $(OBJS)
$(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB)
-kbench.mkl: kbench.$(SUFFIX) $(OBJS)
+kbench.mkl: kbench.$(SUFFIX) $(OBJS)
$(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB)
bench.mkl: bench.$(SUFFIX) $(OBJS)
@@ -537,10 +537,10 @@ params.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F)
paramd.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h
- $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F)
+ $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F)
paramq.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h
- $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F)
+ $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F)
paramc.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F)
@@ -555,10 +555,10 @@ params-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F)
paramd-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h
- $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F)
+ $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F)
paramq-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h
- $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F)
+ $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F)
paramc-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h
$(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F)
diff --git a/Makefile.x86 b/Makefile.x86
index cd7cc9f..a6196d3 100644
--- a/Makefile.x86
+++ b/Makefile.x86
@@ -14,7 +14,7 @@ endif
# LIBMKL = -L$(MKLPATH)/32 -lmkl_lapack -lmkl_ia32 -lguide -lpthread -lm
ifndef SMP
-LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm
+LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm
else
LIBATLAS = -L$(ATLAS) -lptf77blas -latlas -lpthread -lg2c -lm
endif
@@ -50,7 +50,7 @@ LIBSUNPERF = -L/opt/SUNWspro/lib/sse2 -Wl,-R,/opt/SUNWspro/lib/sse2 -lsunperf
LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib
ifndef SMP
-LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm
+LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm
else
LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm
endif
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index c8d4b23..1ba6327 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -28,7 +28,7 @@ endif
ifndef SMP
-LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm
+LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm
else
LIBATLAS = -L$(ATLASPATH)64 -llapack -lptcblas -lptf77blas -latlas -lpthread -lm
endif
diff --git a/README.md b/README.md
index 4ae9696..2e85117 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
## Introduction
-OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
+OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
@@ -74,7 +74,7 @@ Please read GotoBLAS_01Readme.txt
## Usages
Link with libopenblas.a or -lopenblas for shared library.
-### Set the number of threads with environment variables.
+### Set the number of threads with environment variables.
Examples:
@@ -84,7 +84,7 @@ Examples:
export GOTO_NUM_THREADS=4
- or
+ or
export OMP_NUM_THREADS=4
@@ -92,7 +92,7 @@ The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
-### Set the number of threads on runtime.
+### Set the number of threads on runtime.
We provide the following functions to control the number of threads at runtime, as sketched below.
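A minimal sketch, assuming the void openblas_set_num_threads(int) prototype exported by the library (goto_set_num_threads, named in Changelog 0.2.3 above, provides the same control under the GotoBLAS name):

    #include <stdio.h>
    #include <cblas.h>   /* assumed to declare openblas_set_num_threads() */

    int main(void) {
        openblas_set_num_threads(4);   /* overrides OPENBLAS_NUM_THREADS at runtime */
        double x[4] = {1.0, 2.0, 3.0, 4.0};
        printf("dnrm2 = %g\n", cblas_dnrm2(4, x, 1));   /* sqrt(30), about 5.48 */
        return 0;
    }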
@@ -116,12 +116,12 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
-* The number of CPUs/Cores should be less than or equal to 256.
+* The number of CPUs/Cores should be less than or equal to 256.
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
-* On Loongson 3A, make test may fail with a pthread_create error (error code EAGAIN). However, it works when you run the same test case from the shell.
+* On Loongson 3A, make test may fail with a pthread_create error (error code EAGAIN). However, it works when you run the same test case from the shell.
## Contributing
-1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
+1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
1. Write a test which shows that the bug was fixed or that the feature works as expected.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 0c37570..e3910ee 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -1,157 +1,607 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
-CULA_INC = -I/usr/local/cula/include
-CULA_LIB = -L/usr/local/cula/lib64 -Wl,-rpath,/usr/local/cula/lib64 -lcula_fortran -lcula -lcublas
-
-all :: dlinpack.goto dlinpack.mkl dlinpack.acml dcholesky.goto dcholesky.mkl dcholesky.acml
- ./dlinpack.goto 4000 4000 1
- -./dlinpack.mkl 4000 4000 1
- -./dlinpack.acml 4000 4000 1
- ./dcholesky.goto 4000 4000 1
- -./dcholesky.mkl 4000 4000 1
- -./dcholesky.acml 4000 4000 1
-
+# ACML standard
+ACML=/opt/acml5.3.1/gfortran64_mp/lib
+LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
+
+# ACML custom
+#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
+#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
+
+# Atlas Ubuntu
+#ATLAS=/usr/lib/atlas-base
+#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm
+
+# Atlas RHEL and Fedora
+ATLAS=/usr/lib64/atlas
+LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm
+
+# Intel standard
+MKL=/opt/intel/mkl/lib/intel64
+LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm
+
+# Intel custom
+#MKL=/home/saar/intel_mkl
+#LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm
+
+
+
+goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
+ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
+ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
+ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
+ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
+ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
+ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
+ chemm.goto zhemm.goto \
+ cherk.goto zherk.goto \
+ cher2k.goto zher2k.goto \
+ ssymm.goto dsymm.goto csymm.goto zsymm.goto
+
+acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
+ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
+ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \
+ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \
+ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \
+ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
+ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
+ chemm.acml zhemm.acml \
+ cherk.acml zherk.acml \
+ cher2k.acml zher2k.acml \
+ ssymm.acml dsymm.acml csymm.acml zsymm.acml
+
+atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
+ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \
+ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \
+ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \
+ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \
+ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
+ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
+ chemm.atlas zhemm.atlas \
+ cherk.atlas zherk.atlas \
+ cher2k.atlas zher2k.atlas \
+ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
+
+mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
+ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \
+ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \
+ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \
+ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \
+ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
+ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
+ chemm.mkl zhemm.mkl \
+ cherk.mkl zherk.mkl \
+ cher2k.mkl zher2k.mkl \
+ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl
+
+all :: goto atlas acml mkl
+
+##################################### Slinpack ####################################################
slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+slinpack.acml : slinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+slinpack.atlas : slinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+slinpack.mkl : slinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Dlinpack ####################################################
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
-qlinpack.goto : qlinpack.$(SUFFIX) ../$(LIBNAME)
- $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+dlinpack.acml : dlinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dlinpack.atlas : dlinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dlinpack.mkl : dlinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Clinpack ####################################################
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+clinpack.acml : clinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+clinpack.atlas : clinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+clinpack.mkl : clinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zlinpack ####################################################
+
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
-xlinpack.goto : xlinpack.$(SUFFIX) ../$(LIBNAME)
- $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+zlinpack.acml : zlinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zlinpack.atlas : zlinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zlinpack.mkl : zlinpack.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Scholesky ###################################################
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+scholesky.acml : scholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+scholesky.atlas : scholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+scholesky.mkl : scholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Dcholesky ###################################################
+
dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
-qcholesky.goto : qcholesky.$(SUFFIX) ../$(LIBNAME)
- $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+dcholesky.acml : dcholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dcholesky.atlas : dcholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dcholesky.mkl : dcholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Ccholesky ###################################################
ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+ccholesky.acml : ccholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ccholesky.atlas : ccholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ccholesky.mkl : ccholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+##################################### Zcholesky ###################################################
+
xcholesky.goto : xcholesky.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
-slinpack.mkl : slinpack.$(SUFFIX)
- -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+zcholesky.acml : zcholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-dlinpack.mkl : dlinpack.$(SUFFIX)
- -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+zcholesky.atlas : zcholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-clinpack.mkl : clinpack.$(SUFFIX)
- -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+zcholesky.mkl : zcholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-zlinpack.mkl : zlinpack.$(SUFFIX)
- -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-scholesky.mkl : scholesky.$(SUFFIX)
- -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+##################################### Sgemm ####################################################
+sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
-dcholesky.mkl : dcholesky.$(SUFFIX)
- -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+sgemm.acml : sgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-ccholesky.mkl : ccholesky.$(SUFFIX)
- -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+sgemm.atlas : sgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-zcholesky.mkl : zcholesky.$(SUFFIX)
- -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+sgemm.mkl : sgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-slinpack.acml : slinpack.$(SUFFIX)
+##################################### Dgemm ####################################################
+dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+dgemm.acml : dgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-dlinpack.acml : dlinpack.$(SUFFIX)
+dgemm.atlas : dgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dgemm.mkl : dgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Cgemm ####################################################
+
+cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+cgemm.acml : cgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-clinpack.acml : clinpack.$(SUFFIX)
+cgemm.atlas : cgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+cgemm.mkl : cgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zgemm ####################################################
+
+zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zgemm.acml : zgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-zlinpack.acml : zlinpack.$(SUFFIX)
+zgemm.atlas : zgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zgemm.mkl : zgemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Ssymm ####################################################
+ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+ssymm.acml : ssymm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-scholesky.acml : scholesky.$(SUFFIX)
+ssymm.atlas : ssymm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ssymm.mkl : ssymm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Dsymm ####################################################
+dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+dsymm.acml : dsymm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-dcholesky.acml : dcholesky.$(SUFFIX)
+dsymm.atlas : dsymm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dsymm.mkl : dsymm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Csymm ####################################################
+
+csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+csymm.acml : csymm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-ccholesky.acml : ccholesky.$(SUFFIX)
+csymm.atlas : csymm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csymm.mkl : csymm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zsymm ####################################################
+
+zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zsymm.acml : zsymm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-zcholesky.acml : zcholesky.$(SUFFIX)
+zsymm.atlas : zsymm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsymm.mkl : zsymm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Strmm ####################################################
+strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+strmm.acml : strmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+strmm.atlas : strmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+strmm.mkl : strmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Dtrmm ####################################################
+dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+dtrmm.acml : dtrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dtrmm.atlas : dtrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dtrmm.mkl : dtrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Ctrmm ####################################################
+
+ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+ctrmm.acml : ctrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-slinpack.flame : slinpack.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ctrmm.atlas : ctrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-dlinpack.flame : dlinpack.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ctrmm.mkl : ctrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-clinpack.flame : clinpack.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+##################################### Ztrmm ####################################################
-zlinpack.flame : zlinpack.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
-scholesky.flame : scholesky.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ztrmm.acml : ztrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-dcholesky.flame : dcholesky.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ztrmm.atlas : ztrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-ccholesky.flame : ccholesky.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ztrmm.mkl : ztrmm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-zcholesky.flame : zcholesky.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-slinpack.sun : slinpack.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+##################################### Strsm ####################################################
+strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
-dlinpack.sun : dlinpack.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+strsm.acml : strsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-clinpack.sun : clinpack.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+strsm.atlas : strsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-zlinpack.sun : zlinpack.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+strsm.mkl : strsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-scholesky.sun : scholesky.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+##################################### Dtrsm ####################################################
+dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
-dcholesky.sun : dcholesky.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+dtrsm.acml : dtrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-ccholesky.sun : ccholesky.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+dtrsm.atlas : dtrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-zcholesky.sun : zcholesky.$(SUFFIX)
- -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+dtrsm.mkl : dtrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-slinpack.cula : slinpack.$(SUFFIX) cula_wrapper.$(SUFFIX)
- $(CC) $(CFLAGS) -o $(@F) $^ $(CULA_LIB) ../$(LIBNAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+##################################### Ctrsm ####################################################
-clinpack.cula : clinpack.$(SUFFIX) cula_wrapper.$(SUFFIX)
- $(CC) $(CFLAGS) -o $(@F) $^ $(CULA_LIB) ../$(LIBNAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+ctrsm.acml : ctrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
-cula_wrapper.$(SUFFIX) : cula_wrapper.c
- $(CC) $(CFLAGS) -c $(CULA_INC) -o $(@F) $^
+ctrsm.atlas : ctrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ctrsm.mkl : ctrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Ztrsm ####################################################
+
+ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+ztrsm.acml : ztrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ztrsm.atlas : ztrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ztrsm.mkl : ztrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Ssyrk ####################################################
+ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+ssyrk.acml : ssyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ssyrk.atlas : ssyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ssyrk.mkl : ssyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Dsyrk ####################################################
+dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+dsyrk.acml : dsyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dsyrk.atlas : dsyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dsyrk.mkl : dsyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Csyrk ####################################################
+
+csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+csyrk.acml : csyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csyrk.atlas : csyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csyrk.mkl : csyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zsyrk ####################################################
+
+zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zsyrk.acml : zsyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsyrk.atlas : zsyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsyrk.mkl : zsyrk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+
+##################################### Ssyr2k ####################################################
+ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+ssyr2k.acml : ssyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ssyr2k.atlas : ssyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ssyr2k.mkl : ssyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Dsyr2k ####################################################
+dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+dsyr2k.acml : dsyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dsyr2k.atlas : dsyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dsyr2k.mkl : dsyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Csyr2k ####################################################
+
+csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+csyr2k.acml : csyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csyr2k.atlas : csyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csyr2k.mkl : csyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zsyr2k ####################################################
+
+zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zsyr2k.acml : zsyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsyr2k.atlas : zsyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsyr2k.mkl : zsyr2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Chemm ####################################################
+
+chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+chemm.acml : chemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+chemm.atlas : chemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+chemm.mkl : chemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zhemm ####################################################
+
+zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zhemm.acml : zhemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zhemm.atlas : zhemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zhemm.mkl : zhemm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Cherk ####################################################
+
+cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+cherk.acml : cherk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+cherk.atlas : cherk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+cherk.mkl : cherk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zherk ####################################################
+
+zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zherk.acml : zherk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zherk.atlas : zherk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zherk.mkl : zherk.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Cher2k ####################################################
+
+cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+cher2k.acml : cher2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+cher2k.atlas : cher2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+cher2k.mkl : cher2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zher2k ####################################################
+
+zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zher2k.acml : zher2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zher2k.atlas : zher2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zher2k.mkl : zher2k.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+###################################################################################################
slinpack.$(SUFFIX) : linpack.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
@@ -159,37 +609,119 @@ slinpack.$(SUFFIX) : linpack.c
dlinpack.$(SUFFIX) : linpack.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-qlinpack.$(SUFFIX) : linpack.c
- $(CC) $(CFLAGS) -c -UCOMPLEX -DXDOUBLE -o $(@F) $^
-
clinpack.$(SUFFIX) : linpack.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zlinpack.$(SUFFIX) : linpack.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-xlinpack.$(SUFFIX) : linpack.c
- $(CC) $(CFLAGS) -c -DCOMPLEX -DXDOUBLE -o $(@F) $^
-
scholesky.$(SUFFIX) : cholesky.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dcholesky.$(SUFFIX) : cholesky.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-qcholesky.$(SUFFIX) : cholesky.c
- $(CC) $(CFLAGS) -c -UCOMPLEX -DXDOUBLE -o $(@F) $^
-
ccholesky.$(SUFFIX) : cholesky.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zcholesky.$(SUFFIX) : cholesky.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-xcholesky.$(SUFFIX) : cholesky.c
- $(CC) $(CFLAGS) -c -DCOMPLEX -DXDOUBLE -o $(@F) $^
+sgemm.$(SUFFIX) : gemm.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dgemm.$(SUFFIX) : gemm.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+cgemm.$(SUFFIX) : gemm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zgemm.$(SUFFIX) : gemm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+ssymm.$(SUFFIX) : symm.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dsymm.$(SUFFIX) : symm.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+csymm.$(SUFFIX) : symm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zsymm.$(SUFFIX) : symm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+strmm.$(SUFFIX) : trmm.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dtrmm.$(SUFFIX) : trmm.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+ctrmm.$(SUFFIX) : trmm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+ztrmm.$(SUFFIX) : trmm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+strsm.$(SUFFIX) : trsm.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dtrsm.$(SUFFIX) : trsm.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+ctrsm.$(SUFFIX) : trsm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+ztrsm.$(SUFFIX) : trsm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+ssyrk.$(SUFFIX) : syrk.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dsyrk.$(SUFFIX) : syrk.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+csyrk.$(SUFFIX) : syrk.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zsyrk.$(SUFFIX) : syrk.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+ssyr2k.$(SUFFIX) : syr2k.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dsyr2k.$(SUFFIX) : syr2k.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+csyr2k.$(SUFFIX) : syr2k.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zsyr2k.$(SUFFIX) : syr2k.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+chemm.$(SUFFIX) : hemm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zhemm.$(SUFFIX) : hemm.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+cherk.$(SUFFIX) : herk.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zherk.$(SUFFIX) : herk.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+cher2k.$(SUFFIX) : her2k.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zher2k.$(SUFFIX) : her2k.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+
+
clean ::
- @rm -f *.goto *.mkl *.acml *.sun *.cula
+ @rm -f *.goto *.mkl *.acml *.atlas
include $(TOPDIR)/Makefile.tail
+
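[Usage sketch, not part of the patch: the new level-3 targets above follow the same pattern as the existing linpack/cholesky ones, so exercising one of them against the freshly built library would presumably look something like

    cd benchmark
    make dgemm.goto
    OPENBLAS_LOOPS=2 ./dgemm.goto 128 2048 128

where the three positional arguments are the from/to/step problem sizes parsed in MAIN__ and OPENBLAS_LOOPS is the optional repeat count read via getenv in gemm.c. The exact make invocation depends on the surrounding Makefile.system/Makefile.tail machinery and is an assumption here, not something stated in the patch.]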
diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c
index a40cdd2..1ae3748 100644
--- a/benchmark/cholesky.c
+++ b/benchmark/cholesky.c
@@ -78,29 +78,29 @@ int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
-
+
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
-
+
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
-
+
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
-
+
return 0;
}
#endif
static __inline double getmflops(int ratio, int m, double secs){
-
+
double mm = (double)m;
double mulflops, addflops;
@@ -137,7 +137,7 @@ int MAIN__(int argc, char *argv[]){
struct timeval start, stop;
double time1;
- argc--;argv++;
+ argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
@@ -148,17 +148,17 @@ int MAIN__(int argc, char *argv[]){
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
-
+
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
-
+
for(m = from; m <= to; m += step){
-
+
fprintf(stderr, "M = %6d : ", (int)m);
-
+
for (uplos = 0; uplos < 2; uplos ++) {
-
+
#ifndef COMPLEX
if (uplos & 1) {
for (j = 0; j < m; j++) {
@@ -219,11 +219,11 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, "Info = %d\n", info);
exit(1);
}
-
+
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
maxerr = 0.;
-
+
if (!(uplos & 1)) {
for (j = 0; j < m; j++) {
for(i = 0; i <= j; i++) {
@@ -247,8 +247,8 @@ int MAIN__(int argc, char *argv[]){
}
}
}
-
- fprintf(stderr,
+
+ fprintf(stderr,
#ifdef XDOUBLE
" %Le %10.3f MFlops", maxerr,
#else
diff --git a/benchmark/gemm.c b/benchmark/gemm.c
new file mode 100644
index 0000000..fc482c0
--- /dev/null
+++ b/benchmark/gemm.c
@@ -0,0 +1,210 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef GEMM
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+#define GEMM BLASFUNC(dgemm)
+#else
+#define GEMM BLASFUNC(sgemm)
+#endif
+
+#else
+
+#ifdef DOUBLE
+#define GEMM BLASFUNC(zgemm)
+#else
+#define GEMM BLASFUNC(cgemm)
+#endif
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *b, *c;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char trans='N';
+ blasint m, i, j;
+ int loops = 1;
+ int l;
+ char *p;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1,timeg;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ p = getenv("OPENBLAS_LOOPS");
+ if ( p != NULL )
+ loops = atoi(p);
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ timeg=0;
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for (l=0; l<loops; l++)
+ {
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ timeg += time1;
+
+ }
+
+ timeg /= loops;
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
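[Note on the reported numbers, for orientation only: the MFlops figure printed by gemm.c above is the usual 2*m^3 operation count for a square GEMM, scaled by COMPSIZE^2 for complex data and by the averaged wall time, i.e.

    MFlops = COMPSIZE^2 * 2 * m^3 / (time * 1e6)

The other new drivers in this patch print the same expression with the leading factor adjusted (2 for symm/hemm/syr2k/her2k, 1 for trmm/trsm/syrk/herk), and only gemm.c averages the time over OPENBLAS_LOOPS iterations; the rest time a single call per size.]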
diff --git a/benchmark/hemm.c b/benchmark/hemm.c
new file mode 100644
index 0000000..f5d4b4f
--- /dev/null
+++ b/benchmark/hemm.c
@@ -0,0 +1,192 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef HEMM
+
+#ifdef DOUBLE
+#define HEMM BLASFUNC(zhemm)
+#else
+#define HEMM BLASFUNC(chemm)
+#endif
+
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *b, *c;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char *p;
+
+ char side='L';
+ char uplo='U';
+
+ if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+
+ blasint m, i, j;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c\n", from, to, step,side,uplo);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/her2k.c b/benchmark/her2k.c
new file mode 100644
index 0000000..49ab8d2
--- /dev/null
+++ b/benchmark/her2k.c
@@ -0,0 +1,191 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef HER2K
+#ifdef DOUBLE
+#define HER2K BLASFUNC(zher2k)
+#else
+#define HER2K BLASFUNC(cher2k)
+#endif
+
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *b, *c;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char *p;
+
+ char uplo='U';
+ char trans='N';
+
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+
+ blasint m, i, j;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/herk.c b/benchmark/herk.c
new file mode 100644
index 0000000..8c053b0
--- /dev/null
+++ b/benchmark/herk.c
@@ -0,0 +1,189 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef HERK
+
+
+#ifdef DOUBLE
+#define HERK BLASFUNC(zherk)
+#else
+#define HERK BLASFUNC(cherk)
+#endif
+
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *c;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char *p;
+
+ char uplo='U';
+ char trans='N';
+
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+
+ blasint m, i, j;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
+
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/linpack.c b/benchmark/linpack.c
index 0261859..98a8742 100644
--- a/benchmark/linpack.c
+++ b/benchmark/linpack.c
@@ -83,22 +83,22 @@ int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
-
+
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
-
+
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
-
+
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
-
+
return 0;
}
@@ -154,7 +154,7 @@ int MAIN__(int argc, char *argv[]){
struct timeval start, stop;
double time1, time2;
- argc--;argv++;
+ argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
@@ -165,15 +165,15 @@ int MAIN__(int argc, char *argv[]){
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
-
+
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
-
+
if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
-
+
#ifdef linux
srandom(getpid());
#endif
@@ -181,7 +181,7 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, " SIZE Residual Decompose Solve Total\n");
for(m = from; m <= to; m += step){
-
+
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
@@ -189,9 +189,9 @@ int MAIN__(int argc, char *argv[]){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
-
+
for (i = 0; i < m * COMPSIZE; ++i) b[i] = 0.;
-
+
for (j = 0; j < m; ++j) {
for (i = 0; i < m * COMPSIZE; ++i) {
b[i] += a[i + j * m * COMPSIZE];
@@ -208,7 +208,7 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1);
}
-
+
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
@@ -221,7 +221,7 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1);
}
-
+
time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
maxerr = 0.;
@@ -239,7 +239,7 @@ int MAIN__(int argc, char *argv[]){
#endif
#endif
}
-
+
#ifdef XDOUBLE
fprintf(stderr," %Le ", maxerr);
#else
@@ -247,7 +247,7 @@ int MAIN__(int argc, char *argv[]){
#endif
fprintf(stderr,
- " %10.2f MFlops %10.2f MFlops %10.2f MFlops\n",
+ " %10.2f MFlops %10.2f MFlops %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. / 3. * (double)m * (double)m * (double)m / time1 * 1.e-6,
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time2 * 1.e-6,
COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m) / (time1 + time2) * 1.e-6);
diff --git a/benchmark/symm.c b/benchmark/symm.c
new file mode 100644
index 0000000..187dfe2
--- /dev/null
+++ b/benchmark/symm.c
@@ -0,0 +1,203 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef SYMM
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+#define SYMM BLASFUNC(dsymm)
+#else
+#define SYMM BLASFUNC(ssymm)
+#endif
+
+#else
+
+#ifdef DOUBLE
+#define SYMM BLASFUNC(zsymm)
+#else
+#define SYMM BLASFUNC(csymm)
+#endif
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *b, *c;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char *p;
+
+ char side='L';
+ char uplo='U';
+
+ if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+
+ blasint m, i, j;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c\n", from, to, step,side,uplo);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c
new file mode 100644
index 0000000..e11b04e
--- /dev/null
+++ b/benchmark/syr2k.c
@@ -0,0 +1,203 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef SYR2K
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+#define SYR2K BLASFUNC(dsyr2k)
+#else
+#define SYR2K BLASFUNC(ssyr2k)
+#endif
+
+#else
+
+#ifdef DOUBLE
+#define SYR2K BLASFUNC(zsyr2k)
+#else
+#define SYR2K BLASFUNC(csyr2k)
+#endif
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *b, *c;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char *p;
+
+ char uplo='U';
+ char trans='N';
+
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+
+ blasint m, i, j;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/syrk.c b/benchmark/syrk.c
new file mode 100644
index 0000000..f015496
--- /dev/null
+++ b/benchmark/syrk.c
@@ -0,0 +1,199 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef SYRK
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+#define SYRK BLASFUNC(dsyrk)
+#else
+#define SYRK BLASFUNC(ssyrk)
+#endif
+
+#else
+
+#ifdef DOUBLE
+#define SYRK BLASFUNC(zsyrk)
+#else
+#define SYRK BLASFUNC(csyrk)
+#endif
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *c;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char *p;
+
+ char uplo='U';
+ char trans='N';
+
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+
+ blasint m, i, j;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
+
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/trmm.c b/benchmark/trmm.c
new file mode 100644
index 0000000..328dc9a
--- /dev/null
+++ b/benchmark/trmm.c
@@ -0,0 +1,202 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef TRMM
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+#define TRMM BLASFUNC(dtrmm)
+#else
+#define TRMM BLASFUNC(strmm)
+#endif
+
+#else
+
+#ifdef DOUBLE
+#define TRMM BLASFUNC(ztrmm)
+#else
+#define TRMM BLASFUNC(ctrmm)
+#endif
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *b;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char *p;
+
+ char side ='L';
+ char uplo ='U';
+ char trans='N';
+ char diag ='U';
+
+ if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+ if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
+
+ blasint m, i, j;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c\n", from, to, step,side,uplo,trans,diag);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
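
For reference, the benchmark above times one Fortran-interface TRMM call per
problem size, with side/uplo/trans/diag overridable through the OPENBLAS_*
environment variables and the size range taken from the from/to/step
arguments. A minimal standalone sketch of the same call pattern in double
precision, assuming the underscored dtrmm_ symbol and a 32-bit blasint (both
build-dependent), might look like:

    /* Sketch only: one timed dtrmm_ call, linked against libopenblas
     * on a POSIX system; names and sizes are illustrative. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>

    extern void dtrmm_(char *side, char *uplo, char *transa, char *diag,
                       int *m, int *n, double *alpha,
                       double *a, int *lda, double *b, int *ldb);

    int main(void) {
      int m = 200, lda = m, ldb = m;
      char side = 'L', uplo = 'U', trans = 'N', diag = 'U';
      double alpha = 1.0;
      double *a = malloc(sizeof(double) * m * m);
      double *b = malloc(sizeof(double) * m * m);
      for (int i = 0; i < m * m; i++) { a[i] = 0.5; b[i] = 0.5; }

      struct timeval t0, t1;
      gettimeofday(&t0, NULL);
      dtrmm_(&side, &uplo, &trans, &diag, &m, &m, &alpha, a, &lda, b, &ldb);
      gettimeofday(&t1, NULL);

      double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) * 1e-6;
      printf("%6d : %10.2f MFlops\n", m, 1.0 * m * m * m / secs * 1e-6);
      free(a); free(b);
      return 0;
    }

Comparing its MFlops figure with the benchmark output for the same size is a
quick way to cross-check a build.
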
diff --git a/benchmark/trsm.c b/benchmark/trsm.c
new file mode 100644
index 0000000..908a0fc
--- /dev/null
+++ b/benchmark/trsm.c
@@ -0,0 +1,202 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef TRSM
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+#define TRSM BLASFUNC(dtrsm)
+#else
+#define TRSM BLASFUNC(strsm)
+#endif
+
+#else
+
+#ifdef DOUBLE
+#define TRSM BLASFUNC(ztrsm)
+#else
+#define TRSM BLASFUNC(ctrsm)
+#endif
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *b;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char *p;
+
+ char side ='L';
+ char uplo ='U';
+ char trans='N';
+ char diag ='U';
+
+ if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+ if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
+
+ blasint m, i, j;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c\n", from, to, step,side,uplo,trans,diag);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
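
The trsm.c benchmark is identical to the TRMM one except for the routine it
times. As a reminder of what that call computes: TRSM overwrites B with the
solution X of op(A)*X = alpha*B (for side 'L'), so multiplying the result
back with TRMM should recover the original right-hand side. A small
self-contained check along those lines, again assuming the underscored
Fortran symbols and a 32-bit blasint:

    /* Sketch: solve an upper-triangular system with dtrsm_, then undo it
     * with dtrmm_ and report the worst-case residual (should be ~0). */
    #include <math.h>
    #include <stdio.h>

    extern void dtrmm_(char *, char *, char *, char *, int *, int *,
                       double *, double *, int *, double *, int *);
    extern void dtrsm_(char *, char *, char *, char *, int *, int *,
                       double *, double *, int *, double *, int *);

    int main(void) {
      int n = 3, lda = 3, ldb = 3, nrhs = 1;
      char side = 'L', uplo = 'U', trans = 'N', diag = 'N';
      double alpha = 1.0;
      /* Column-major upper-triangular A and right-hand side b. */
      double a[9]  = {2, 0, 0,  1, 3, 0,  0, 1, 4};
      double b[3]  = {3, 7, 8};
      double b0[3] = {3, 7, 8};

      dtrsm_(&side, &uplo, &trans, &diag, &n, &nrhs, &alpha, a, &lda, b, &ldb);
      dtrmm_(&side, &uplo, &trans, &diag, &n, &nrhs, &alpha, a, &lda, b, &ldb);

      double err = 0.0;
      for (int i = 0; i < n; i++) err = fmax(err, fabs(b[i] - b0[i]));
      printf("max residual after solve+multiply: %g\n", err);
      return 0;
    }
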
diff --git a/c_check b/c_check
index 0828a5b..94b0bf3 100644
--- a/c_check
+++ b/c_check
@@ -180,9 +180,9 @@ $linker_a = "";
{
$link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
-
+
$link =~ s/\-Y\sP\,/\-Y/g;
-
+
@flags = split(/[\s\,\n]/, $link);
# remove leading and trailing quotes from each flag.
@flags = map {s/^['"]|['"]$//g; $_} @flags;
@@ -193,15 +193,15 @@ $linker_a = "";
&& ($flags !~ /^-LIST:/)
&& ($flags !~ /^-LANG:/)
) {
- $linker_L .= $flags . " "
+ $linker_L .= $flags . " "
}
if ($flags =~ /^\-Y/) {
- $linker_L .= "-Wl,". $flags . " "
+ $linker_L .= "-Wl,". $flags . " "
}
-
+
if (
- ($flags =~ /^\-l/)
+ ($flags =~ /^\-l/)
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
@@ -213,7 +213,7 @@ $linker_a = "";
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
) {
- $linker_l .= $flags . " "
+ $linker_l .= $flags . " "
}
$linker_a .= $flags . " " if $flags =~ /\.a$/;
@@ -250,9 +250,9 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
if ($os eq "LINUX") {
-
+
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
-
+
# if ($pthread[2] ne "") {
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
# } else {
diff --git a/cblas.h b/cblas.h
index 971c132..ef072e6 100644
--- a/cblas.h
+++ b/cblas.h
@@ -16,14 +16,17 @@ void goto_set_num_threads(int num_threads);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);
+/*Get the CPU corename on runtime.*/
+char* openblas_get_corename(void);
+
/* Get the parallelization type which is used by OpenBLAS */
-int openblas_get_parallel(void);
+int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
-#define OPENBLAS_THREAD 1
+#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
-#define OPENBLAS_OPENMP 2
+#define OPENBLAS_OPENMP 2
/*
@@ -305,6 +308,16 @@ void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
void cblas_xerbla(blasint p, char *rout, char *form, ...);
+/*** BLAS extensions ***/
+
+void cblas_saxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
+
+void cblas_daxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy);
+
+void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float *beta, float *y, OPENBLAS_CONST blasint incy);
+
+void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double *beta, double *y, OPENBLAS_CONST blasint incy);
+
#ifdef __cplusplus
}
#endif /* __cplusplus */
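
The new ?axpby declarations extend the classic axpy with a scaling of y,
i.e. y := alpha*x + beta*y (with beta = 1 it reduces to ordinary axpy). A
minimal usage sketch of the double-precision variant, with illustrative
values only:

    #include <stdio.h>
    #include "cblas.h"

    int main(void) {
      double x[4] = {1, 2, 3, 4};
      double y[4] = {10, 20, 30, 40};
      /* n, alpha, x, incx, beta, y, incy -- as declared above */
      cblas_daxpby(4, 2.0, x, 1, 0.5, y, 1);
      for (int i = 0; i < 4; i++) printf("%g ", y[i]);  /* 7 14 21 28 */
      printf("\n");
      return 0;
    }
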
diff --git a/cblas_noconst.h b/cblas_noconst.h
index fd2e940..1f79e81 100644
--- a/cblas_noconst.h
+++ b/cblas_noconst.h
@@ -17,13 +17,13 @@ void goto_set_num_threads(int num_threads);
char* openblas_get_config(void);
/* Get the parallelization type which is used by OpenBLAS */
-int openblas_get_parallel(void);
+int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
-#define OPENBLAS_THREAD 1
+#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
-#define OPENBLAS_OPENMP 2
+#define OPENBLAS_OPENMP 2
#define CBLAS_INDEX size_t
@@ -296,6 +296,17 @@ void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANS
void cblas_xerbla(blasint p, char *rout, char *form, ...);
+/*** BLAS extensions ***/
+
+void cblas_saxpby(blasint n, float alpha, float *x, blasint incx,float beta, float *y, blasint incy);
+
+void cblas_daxpby(blasint n, double alpha, double *x, blasint incx,double beta, double *y, blasint incy);
+
+void cblas_caxpby(blasint n, float *alpha, float *x, blasint incx,float *beta, float *y, blasint incy);
+
+void cblas_zaxpby(blasint n, double *alpha, double *x, blasint incx,double *beta, double *y, blasint incy);
+
+
#ifdef __cplusplus
}
#endif /* __cplusplus */
diff --git a/common.h b/common.h
index 49e2946..7125ce3 100644
--- a/common.h
+++ b/common.h
@@ -388,6 +388,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_arm64.h"
#endif
+#ifndef ASSEMBLER
+#ifdef OS_WINDOWS
+typedef char env_var_t[MAX_PATH];
+#define readenv(p, n) GetEnvironmentVariable((n), (p), sizeof(p))
+#else
+typedef char* env_var_t;
+#define readenv(p, n) ((p)=getenv(n))
+#endif
+#endif
#ifdef OS_LINUX
#include "common_linux.h"
@@ -515,13 +524,9 @@ static __inline void blas_unlock(volatile BLASULONG *address){
*address = 0;
}
-static __inline int readenv(char *env) {
-
- char *p;
-
- p = getenv(env);
-
- if (p == NULL) return 0; else return atoi(p);
+static __inline int readenv_atoi(char *env) {
+ env_var_t p;
+ return readenv(p,env) ? 0 : atoi(p);
}
@@ -531,7 +536,7 @@ static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){
#ifndef UNIT
FLOAT ratio, den;
-
+
if (
#ifdef XDOUBLE
(fabsl(ar)) >= (fabsl(ai))
@@ -557,7 +562,7 @@ static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){
b[0] = ONE;
b[1] = ZERO;
#endif
-
+
}
#endif
@@ -687,13 +692,13 @@ extern int gotoblas_profile;
#define PRINT_DEBUG_CNAME
#define PRINT_DEBUG_NAME
#else
-#define PRINT_DEBUG_CNAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME)
-#define PRINT_DEBUG_NAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME)
+#define PRINT_DEBUG_CNAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME)
+#define PRINT_DEBUG_NAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME)
#endif
#ifdef __cplusplus
}
-
+
#endif /* __cplusplus */
#endif
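
One detail worth noting in the readenv_atoi hunk above: with the non-Windows
readenv macro, readenv(p, env) evaluates to the getenv() result, so the
ternary as written returns 0 when the variable is set and passes a null
pointer to atoi() when it is not. If the intent is the usual "value or zero"
behaviour, the helper would presumably read as follows (a corrected sketch,
not part of this import):

    static __inline int readenv_atoi(char *env) {
      env_var_t p;
      /* hypothetical fix: parse the value when present, default to 0 */
      return readenv(p, env) ? atoi(p) : 0;
    }
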
diff --git a/common_arm.h b/common_arm.h
index 8c9752d..1301000 100644
--- a/common_arm.h
+++ b/common_arm.h
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -94,7 +94,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
"mov %0 , r3 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
- : "memory", "r2" , "r3"
+ : "memory", "r2" , "r3"
);
@@ -143,7 +143,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
.func REALNAME ;\
REALNAME:
-#define EPILOGUE
+#define EPILOGUE
#define PROFCODE
diff --git a/common_arm64.h b/common_arm64.h
index 2da0d89..8a66a17 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -94,7 +94,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
"mov %0 , r3 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
- : "memory", "r2" , "r3"
+ : "memory", "r2" , "r3"
);
@@ -143,7 +143,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
.func REALNAME ;\
REALNAME:
-#define EPILOGUE
+#define EPILOGUE
#define PROFCODE
diff --git a/common_c.h b/common_c.h
index f78f172..724d1e2 100644
--- a/common_c.h
+++ b/common_c.h
@@ -209,6 +209,18 @@
#define CNEG_TCOPY cneg_tcopy
#define CLASWP_NCOPY claswp_ncopy
+#define CAXPBY_K caxpby_k
+
+#define COMATCOPY_K_CN comatcopy_k_cn
+#define COMATCOPY_K_RN comatcopy_k_rn
+#define COMATCOPY_K_CT comatcopy_k_ct
+#define COMATCOPY_K_RT comatcopy_k_rt
+#define COMATCOPY_K_CNC comatcopy_k_cnc
+#define COMATCOPY_K_RNC comatcopy_k_rnc
+#define COMATCOPY_K_CTC comatcopy_k_ctc
+#define COMATCOPY_K_RTC comatcopy_k_rtc
+
+
#else
#define CAMAX_K gotoblas -> camax_k
@@ -380,6 +392,17 @@
#define CNEG_TCOPY gotoblas -> cneg_tcopy
#define CLASWP_NCOPY gotoblas -> claswp_ncopy
+#define CAXPBY_K gotoblas -> caxpby_k
+
+#define COMATCOPY_K_CN gotoblas -> comatcopy_k_cn
+#define COMATCOPY_K_RN gotoblas -> comatcopy_k_rn
+#define COMATCOPY_K_CT gotoblas -> comatcopy_k_ct
+#define COMATCOPY_K_RT gotoblas -> comatcopy_k_rt
+#define COMATCOPY_K_CNC gotoblas -> comatcopy_k_cnc
+#define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc
+#define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc
+#define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc
+
#endif
#define CGEMM_NN cgemm_nn
diff --git a/common_d.h b/common_d.h
index 4c9a53f..c34e1f2 100644
--- a/common_d.h
+++ b/common_d.h
@@ -144,6 +144,12 @@
#define DNEG_TCOPY dneg_tcopy
#define DLASWP_NCOPY dlaswp_ncopy
+#define DAXPBY_K daxpby_k
+#define DOMATCOPY_K_CN domatcopy_k_cn
+#define DOMATCOPY_K_RN domatcopy_k_rn
+#define DOMATCOPY_K_CT domatcopy_k_ct
+#define DOMATCOPY_K_RT domatcopy_k_rt
+
#else
#define DAMAX_K gotoblas -> damax_k
@@ -255,6 +261,12 @@
#define DNEG_TCOPY gotoblas -> dneg_tcopy
#define DLASWP_NCOPY gotoblas -> dlaswp_ncopy
+#define DAXPBY_K gotoblas -> daxpby_k
+#define DOMATCOPY_K_CN gotoblas -> domatcopy_k_cn
+#define DOMATCOPY_K_RN gotoblas -> domatcopy_k_rn
+#define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct
+#define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt
+
#endif
#define DGEMM_NN dgemm_nn
diff --git a/common_ia64.h b/common_ia64.h
index 79b3c81..8e92b59 100644
--- a/common_ia64.h
+++ b/common_ia64.h
@@ -58,10 +58,10 @@
static __inline void blas_lock(volatile unsigned long *address){
unsigned long ret;
-
+
do {
while (*address) {YIELDING;};
-
+
__asm__ __volatile__ ("mov ar.ccv=r0\n;;\n"
"cmpxchg4.acq %0=[%2],%1,ar.ccv\n"
: "=r"(ret) : "r"(1), "r"(address)
diff --git a/common_interface.h b/common_interface.h
index 14c2cf7..6ab3450 100644
--- a/common_interface.h
+++ b/common_interface.h
@@ -238,17 +238,17 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *,
void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, blasint *);
-void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *,
+void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
+void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
-void BLASFUNC(cgemv)(char *, blasint *, blasint *, float *, float *, blasint *,
+void BLASFUNC(cgemv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
+void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(strsv) (char *, char *, char *, blasint *, float *, blasint *,
@@ -257,24 +257,24 @@ void BLASFUNC(dtrsv) (char *, char *, char *, blasint *, double *, blasint *,
double *, blasint *);
void BLASFUNC(qtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
xdouble *, blasint *);
-void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float *, blasint *,
+void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float *, blasint *,
float *, blasint *);
-void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *,
+void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *,
double *, blasint *);
-void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
+void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
xdouble *, blasint *);
void BLASFUNC(strmv) (char *, char *, char *, blasint *, float *, blasint *,
float *, blasint *);
-void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *,
+void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *,
double *, blasint *);
-void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
+void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
xdouble *, blasint *);
-void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float *, blasint *,
+void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float *, blasint *,
float *, blasint *);
-void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *,
+void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *,
double *, blasint *);
-void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
+void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
xdouble *, blasint *);
void BLASFUNC(stpsv) (char *, char *, char *, blasint *, float *, float *, blasint *);
@@ -305,24 +305,24 @@ void BLASFUNC(ctbsv) (char *, char *, char *, blasint *, blasint *, float *, bl
void BLASFUNC(ztbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
-void BLASFUNC(ssymv) (char *, blasint *, float *, float *, blasint *,
+void BLASFUNC(ssymv) (char *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(dsymv) (char *, blasint *, double *, double *, blasint *,
+void BLASFUNC(dsymv) (char *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(qsymv) (char *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(qsymv) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
-void BLASFUNC(csymv) (char *, blasint *, float *, float *, blasint *,
+void BLASFUNC(csymv) (char *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(zsymv) (char *, blasint *, double *, double *, blasint *,
+void BLASFUNC(zsymv) (char *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(xsymv) (char *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(xsymv) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(sspmv) (char *, blasint *, float *, float *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(dspmv) (char *, blasint *, double *, double *,
+void BLASFUNC(dspmv) (char *, blasint *, double *, double *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(qspmv) (char *, blasint *, xdouble *, xdouble *,
+void BLASFUNC(qspmv) (char *, blasint *, xdouble *, xdouble *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(cspmv) (char *, blasint *, float *, float *,
float *, blasint *, float *, float *, blasint *);
@@ -344,17 +344,17 @@ void BLASFUNC(zsyr) (char *, blasint *, double *, double *, blasint *,
void BLASFUNC(xsyr) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *);
-void BLASFUNC(ssyr2) (char *, blasint *, float *,
+void BLASFUNC(ssyr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, blasint *);
-void BLASFUNC(dsyr2) (char *, blasint *, double *,
+void BLASFUNC(dsyr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, blasint *);
-void BLASFUNC(qsyr2) (char *, blasint *, xdouble *,
+void BLASFUNC(qsyr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
-void BLASFUNC(csyr2) (char *, blasint *, float *,
+void BLASFUNC(csyr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, blasint *);
-void BLASFUNC(zsyr2) (char *, blasint *, double *,
+void BLASFUNC(zsyr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, blasint *);
-void BLASFUNC(xsyr2) (char *, blasint *, xdouble *,
+void BLASFUNC(xsyr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(sspr) (char *, blasint *, float *, float *, blasint *,
@@ -370,17 +370,17 @@ void BLASFUNC(zspr) (char *, blasint *, double *, double *, blasint *,
void BLASFUNC(xspr) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *);
-void BLASFUNC(sspr2) (char *, blasint *, float *,
+void BLASFUNC(sspr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *);
-void BLASFUNC(dspr2) (char *, blasint *, double *,
+void BLASFUNC(dspr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *);
-void BLASFUNC(qspr2) (char *, blasint *, xdouble *,
+void BLASFUNC(qspr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *);
-void BLASFUNC(cspr2) (char *, blasint *, float *,
+void BLASFUNC(cspr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *);
-void BLASFUNC(zspr2) (char *, blasint *, double *,
+void BLASFUNC(zspr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *);
-void BLASFUNC(xspr2) (char *, blasint *, xdouble *,
+void BLASFUNC(xspr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *);
void BLASFUNC(cher) (char *, blasint *, float *, float *, blasint *,
@@ -394,25 +394,25 @@ void BLASFUNC(chpr) (char *, blasint *, float *, float *, blasint *, float *
void BLASFUNC(zhpr) (char *, blasint *, double *, double *, blasint *, double *);
void BLASFUNC(xhpr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *);
-void BLASFUNC(cher2) (char *, blasint *, float *,
+void BLASFUNC(cher2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, blasint *);
-void BLASFUNC(zher2) (char *, blasint *, double *,
+void BLASFUNC(zher2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, blasint *);
-void BLASFUNC(xher2) (char *, blasint *, xdouble *,
+void BLASFUNC(xher2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
-void BLASFUNC(chpr2) (char *, blasint *, float *,
+void BLASFUNC(chpr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *);
-void BLASFUNC(zhpr2) (char *, blasint *, double *,
+void BLASFUNC(zhpr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *);
-void BLASFUNC(xhpr2) (char *, blasint *, xdouble *,
+void BLASFUNC(xhpr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *);
-void BLASFUNC(chemv) (char *, blasint *, float *, float *, blasint *,
+void BLASFUNC(chemv) (char *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(zhemv) (char *, blasint *, double *, double *, blasint *,
+void BLASFUNC(zhemv) (char *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(xhemv) (char *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(xhemv) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(chpmv) (char *, blasint *, float *, float *,
@@ -427,37 +427,37 @@ int BLASFUNC(dnorm)(char *, blasint *, blasint *, double *, blasint *);
int BLASFUNC(cnorm)(char *, blasint *, blasint *, float *, blasint *);
int BLASFUNC(znorm)(char *, blasint *, blasint *, double *, blasint *);
-void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *,
+void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
+void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
-void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *,
+void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
+void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
-void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
+void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
+void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
-void BLASFUNC(csbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
+void BLASFUNC(csbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
+void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
-void BLASFUNC(chbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
+void BLASFUNC(chbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
+void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
+void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
/* Level 3 routines */
@@ -606,18 +606,18 @@ int BLASFUNC(sgemt)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *);
int BLASFUNC(dgemt)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *);
-int BLASFUNC(cgemt)(char *, blasint *, blasint *, float *, float *, blasint *,
+int BLASFUNC(cgemt)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *);
int BLASFUNC(zgemt)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *);
-int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float *,
+int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, float *, blasint *, float *, blasint *);
int BLASFUNC(dgema)(char *, char *, blasint *, blasint *, double *,
double *, blasint *, double*, double *, blasint *, double*, blasint *);
int BLASFUNC(cgema)(char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, float *, blasint *, float *, blasint *);
-int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *,
+int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *,
double *, blasint *, double*, double *, blasint *, double*, blasint *);
int BLASFUNC(sgems)(char *, char *, blasint *, blasint *, float *,
@@ -757,9 +757,26 @@ FLOATRET BLASFUNC(slamc3)(float *, float *);
double BLASFUNC(dlamc3)(double *, double *);
xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
+/* BLAS extensions */
+
+void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
+void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
+void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
+void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
+
+void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
+void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
+void BLASFUNC(comatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
+void BLASFUNC(zomatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
+
+void BLASFUNC(simatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
+void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);
+void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
+void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);
+
#ifdef __cplusplus
}
-
+
#endif /* __cplusplus */
#endif
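
The ?omatcopy/?imatcopy declarations add MKL-style scaled matrix copy and
transpose to the Fortran interface: out-of-place (B := alpha*op(A)) and
in-place variants respectively, the latter taking a single matrix with two
leading dimensions. A usage sketch of the out-of-place double-precision
routine, with the parameter order (order, trans, rows, cols, alpha, A, lda,
B, ldb) read off the prototype above and the exact semantics assumed rather
than verified here:

    #include <stdio.h>

    extern void domatcopy_(char *order, char *trans, int *rows, int *cols,
                           double *alpha, double *a, int *lda,
                           double *b, int *ldb);

    int main(void) {
      char order = 'C', trans = 'T';      /* column-major, transpose */
      int rows = 2, cols = 3, lda = 2, ldb = 3;
      double alpha = 1.0;
      double a[6] = {1, 2, 3, 4, 5, 6};   /* 2x3, column-major */
      double b[6] = {0};                  /* receives the 3x2 transpose */

      domatcopy_(&order, &trans, &rows, &cols, &alpha, a, &lda, b, &ldb);

      for (int i = 0; i < 6; i++) printf("%g ", b[i]);  /* 1 3 5 2 4 6 */
      printf("\n");
      return 0;
    }
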
diff --git a/common_level1.h b/common_level1.h
index f51ced6..2a1b4f1 100644
--- a/common_level1.h
+++ b/common_level1.h
@@ -54,11 +54,11 @@ double _Complex zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG);
xdouble _Complex xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble _Complex xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
-int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float,
+int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
-int daxpy_k (BLASLONG, BLASLONG, BLASLONG, double,
+int daxpy_k (BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
-int qaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble,
+int qaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int caxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -80,11 +80,11 @@ int ccopy_k(BLASLONG, float *, BLASLONG, float *, BLASLONG);
int zcopy_k(BLASLONG, double *, BLASLONG, double *, BLASLONG);
int xcopy_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
-int sswap_k (BLASLONG, BLASLONG, BLASLONG, float,
+int sswap_k (BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
-int dswap_k (BLASLONG, BLASLONG, BLASLONG, double,
+int dswap_k (BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double*, BLASLONG);
-int qswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble,
+int qswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble*, BLASLONG);
int cswap_k (BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -156,11 +156,11 @@ BLASLONG icmin_k(BLASLONG, float *, BLASLONG);
BLASLONG izmin_k(BLASLONG, double *, BLASLONG);
BLASLONG ixmin_k(BLASLONG, xdouble *, BLASLONG);
-int sscal_k(BLASLONG, BLASLONG, BLASLONG, float,
+int sscal_k(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
-int dscal_k(BLASLONG, BLASLONG, BLASLONG, double,
+int dscal_k(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
-int qscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble,
+int qscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int cscal_k(BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -204,6 +204,13 @@ int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float);
int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double);
int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble);
+
+int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
+int daxpby_k (BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
+int caxpby_k (BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
+int zaxpby_k (BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG);
+
+
#ifdef __CUDACC__
}
#endif
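
The new *axpby_k prototypes are the internal kernels behind the axpby
interface; the complex variants split alpha and beta into real and imaginary
parts. A purely illustrative reference loop for the real double-precision
case (positive increments assumed; the shipped kernels are
architecture-specific):

    #include <stdio.h>

    /* Hypothetical reference form documenting the expected semantics:
     * y := alpha*x + beta*y, stepping through x and y by incx/incy. */
    static void daxpby_ref(long n, double alpha, const double *x, long incx,
                           double beta, double *y, long incy) {
      for (long i = 0, ix = 0, iy = 0; i < n; i++, ix += incx, iy += incy)
        y[iy] = alpha * x[ix] + beta * y[iy];
    }

    int main(void) {
      double x[3] = {1, 2, 3}, y[3] = {4, 5, 6};
      daxpby_ref(3, 2.0, x, 1, 1.0, y, 1);
      printf("%g %g %g\n", y[0], y[1], y[2]);   /* 6 9 12 */
      return 0;
    }
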
diff --git a/common_level2.h b/common_level2.h
index 2ab682a..640d4a0 100644
--- a/common_level2.h
+++ b/common_level2.h
@@ -986,24 +986,24 @@ int cnorm_t(BLASLONG, BLASLONG, float *a, BLASLONG);
int znorm_n(BLASLONG, BLASLONG, double *a, BLASLONG);
int znorm_t(BLASLONG, BLASLONG, double *a, BLASLONG);
-void sgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float,
+void sgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer);
void sgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer);
void dgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer);
-void dgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double,
+void dgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer);
void qgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer);
-void qgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble,
+void qgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer);
void cgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer);
-void cgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float,
+void cgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer);
void cgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer);
@@ -1052,24 +1052,24 @@ void xgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
void xgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer);
-int sgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float,
+int sgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int);
int sgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int);
int dgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int);
-int dgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double,
+int dgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int);
int qgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int);
-int qgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble,
+int qgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int);
int cgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int);
-int cgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *,
+int cgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int);
int cgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int);
diff --git a/common_level3.h b/common_level3.h
index cbc67a6..0babd45 100644
--- a/common_level3.h
+++ b/common_level3.h
@@ -47,9 +47,9 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
extern "C" {
#endif
-int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
+int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
-int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,
+int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
int cgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -57,12 +57,12 @@ int zgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
#ifdef EXPRECISION
-int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble,
+int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
#else
-int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *,
+int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
@@ -1732,6 +1732,37 @@ int zgemc_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLA
int xgemc_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c);
int xgemc_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c);
+int somatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG);
+int somatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG);
+int somatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG);
+int somatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG);
+
+int domatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG);
+int domatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG);
+int domatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG);
+int domatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG);
+
+int comatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
+int comatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
+int comatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
+int comatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
+
+int comatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
+int comatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
+int comatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
+int comatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG);
+
+int zomatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
+int zomatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
+int zomatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
+int zomatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
+
+int zomatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
+int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
+int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
+int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
+
+
#ifdef __CUDACC__
}
#endif
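
The kernel-side ?omatcopy_k_* prototypes appear to follow a suffix
convention: storage order (c/r for column/row major), transposition (n/t),
and, for the complex variants, a trailing c for conjugation -- an inference
from the names rather than anything documented in this hunk. As an
illustration of the simplest case, a reference loop for domatcopy_k_cn
(column-major, no transpose, b := alpha*a) might look like:

    #include <stdio.h>

    /* Illustrative only; the real kernel is declared above and
     * implemented per architecture. */
    static void domatcopy_cn_ref(long rows, long cols, double alpha,
                                 const double *a, long lda,
                                 double *b, long ldb) {
      for (long j = 0; j < cols; j++)
        for (long i = 0; i < rows; i++)
          b[i + j * ldb] = alpha * a[i + j * lda];
    }

    int main(void) {
      double a[4] = {1, 2, 3, 4}, b[4];
      domatcopy_cn_ref(2, 2, 0.5, a, 2, b, 2);
      printf("%g %g %g %g\n", b[0], b[1], b[2], b[3]);  /* 0.5 1 1.5 2 */
      return 0;
    }

The transpose variants presumably index b as b[j + i*ldb] instead, and the
conjugating complex variants additionally flip the sign of the imaginary
parts.
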
diff --git a/common_linux.h b/common_linux.h
index afc77b4..cab5e5f 100644
--- a/common_linux.h
+++ b/common_linux.h
@@ -75,7 +75,7 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
#else
-#if defined (LOONGSON3B)
+#if defined (LOONGSON3B)
#if defined (__64BIT__)
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#else
@@ -99,9 +99,9 @@ static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned
#endif
}
-static inline int my_gettid(void) {
+static inline int my_gettid(void) {
#ifdef SYS_gettid
-return syscall(SYS_gettid);
+return syscall(SYS_gettid);
#else
return getpid();
#endif
diff --git a/common_macro.h b/common_macro.h
index 0c34ecb..f9de377 100644
--- a/common_macro.h
+++ b/common_macro.h
@@ -628,6 +628,13 @@
#define HERK_THREAD_LR DSYRK_THREAD_LN
#define HERK_THREAD_LC DSYRK_THREAD_LT
+#define AXPBY_K DAXPBY_K
+
+#define OMATCOPY_K_CN DOMATCOPY_K_CN
+#define OMATCOPY_K_RN DOMATCOPY_K_RN
+#define OMATCOPY_K_CT DOMATCOPY_K_CT
+#define OMATCOPY_K_RT DOMATCOPY_K_RT
+
#else
#define AMAX_K SAMAX_K
@@ -918,6 +925,13 @@
#define HERK_THREAD_LR SSYRK_THREAD_LN
#define HERK_THREAD_LC SSYRK_THREAD_LT
+#define AXPBY_K SAXPBY_K
+
+#define OMATCOPY_K_CN SOMATCOPY_K_CN
+#define OMATCOPY_K_RN SOMATCOPY_K_RN
+#define OMATCOPY_K_CT SOMATCOPY_K_CT
+#define OMATCOPY_K_RT SOMATCOPY_K_RT
+
#endif
#else
#ifdef XDOUBLE
@@ -1722,6 +1736,17 @@
#define SYMM_OUTCOPY ZSYMM_OUTCOPY
#define SYMM_OLTCOPY ZSYMM_OLTCOPY
+#define AXPBY_K ZAXPBY_K
+
+#define OMATCOPY_K_CN ZOMATCOPY_K_CN
+#define OMATCOPY_K_RN ZOMATCOPY_K_RN
+#define OMATCOPY_K_CT ZOMATCOPY_K_CT
+#define OMATCOPY_K_RT ZOMATCOPY_K_RT
+#define OMATCOPY_K_CNC ZOMATCOPY_K_CNC
+#define OMATCOPY_K_RNC ZOMATCOPY_K_RNC
+#define OMATCOPY_K_CTC ZOMATCOPY_K_CTC
+#define OMATCOPY_K_RTC ZOMATCOPY_K_RTC
+
#else
#define AMAX_K CAMAX_K
@@ -2123,6 +2148,17 @@
#define SYMM_OUTCOPY CSYMM_OUTCOPY
#define SYMM_OLTCOPY CSYMM_OLTCOPY
+#define AXPBY_K CAXPBY_K
+
+#define OMATCOPY_K_CN COMATCOPY_K_CN
+#define OMATCOPY_K_RN COMATCOPY_K_RN
+#define OMATCOPY_K_CT COMATCOPY_K_CT
+#define OMATCOPY_K_RT COMATCOPY_K_RT
+#define OMATCOPY_K_CNC COMATCOPY_K_CNC
+#define OMATCOPY_K_RNC COMATCOPY_K_RNC
+#define OMATCOPY_K_CTC COMATCOPY_K_CTC
+#define OMATCOPY_K_RTC COMATCOPY_K_RTC
+
#endif
#endif
diff --git a/common_mips64.h b/common_mips64.h
index d9cdc49..aa85ff2 100644
--- a/common_mips64.h
+++ b/common_mips64.h
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -111,9 +111,9 @@ static inline unsigned int rpcc(void){
".set pop": "=r"(ret):: "memory");
#else
- __asm__ __volatile__(".set push \n"
- ".set mips32r2\n"
- "rdhwr %0, $30 \n"
+ __asm__ __volatile__(".set push \n"
+ ".set mips32r2\n"
+ "rdhwr %0, $30 \n"
".set pop" : "=r"(ret) : : "memory");
#endif
return ret;
@@ -191,13 +191,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define CMPEQ c.eq.s
#define CMPLE c.le.s
#define CMPLT c.lt.s
-#define PLU plu.ps
-#define PLL pll.ps
-#define PUU puu.ps
-#define PUL pul.ps
-#define MADPS madd.ps
-#define CVTU cvt.s.pu
-#define CVTL cvt.s.pl
+#define PLU plu.ps
+#define PLL pll.ps
+#define PUU puu.ps
+#define PUL pul.ps
+#define MADPS madd.ps
+#define CVTU cvt.s.pu
+#define CVTL cvt.s.pl
#define NEG neg.s
#endif
@@ -279,9 +279,9 @@ REALNAME: ;\
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#define PREFETCHD_(x) ld $0, x
-#define PREFETCHD(x) PREFETCHD_(x)
+#define PREFETCHD(x) PREFETCHD_(x)
#else
-#define PREFETCHD(x)
+#define PREFETCHD(x)
#endif
#endif
diff --git a/common_param.h b/common_param.h
index e978193..8632164 100644
--- a/common_param.h
+++ b/common_param.h
@@ -87,12 +87,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
-
+
int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
-
+
int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
@@ -114,7 +114,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
-
+
int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
@@ -131,7 +131,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
-
+
int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
@@ -176,12 +176,12 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
-
+
int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
-
+
int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *);
int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *);
int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *);
@@ -203,7 +203,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
-
+
int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
@@ -220,7 +220,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
-
+
int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
@@ -267,12 +267,12 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
-
+
int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
-
+
int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *);
int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *);
int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *);
@@ -294,7 +294,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
-
+
int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
@@ -311,7 +311,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
-
+
int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
@@ -372,7 +372,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
-
+
int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
@@ -381,7 +381,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
-
+
int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
@@ -407,7 +407,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
-
+
int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
@@ -424,7 +424,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
-
+
int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
@@ -443,7 +443,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *);
-
+
int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *);
int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *);
int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *);
@@ -457,21 +457,21 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
-
+
int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
-
+
int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
-
+
int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
@@ -532,7 +532,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
-
+
int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
@@ -541,7 +541,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
-
+
int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *);
int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *);
int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *);
@@ -567,7 +567,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
-
+
int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
@@ -584,7 +584,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
-
+
int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
@@ -603,7 +603,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *);
-
+
int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *);
int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *);
int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *);
@@ -617,28 +617,28 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
-
+
int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
-
+
int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
-
+
int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *);
-
+
int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *);
@@ -694,7 +694,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
-
+
int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
@@ -703,7 +703,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
-
+
int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *);
int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *);
int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *);
@@ -729,7 +729,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG);
-
+
int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
@@ -746,7 +746,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
-
+
int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
@@ -765,7 +765,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
-
+
int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *);
int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *);
int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *);
@@ -779,21 +779,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
-
+
int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
-
+
int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
-
+
int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
@@ -806,10 +806,47 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
#endif
+
void (*init)(void);
int snum_opt, dnum_opt, qnum_opt;
+ int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG);
+ int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG);
+ int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG);
+ int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG);
+
+ int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
+ int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
+ int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
+ int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
+
+ int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
+ int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
+ int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
+ int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
+
+ int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
+ int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
+ int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
+ int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
+
+ int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
+ int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
+ int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
+ int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
+
+ int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
+ int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
+ int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
+ int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
+
+ int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
+ int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
+ int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
+ int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
+
+
} gotoblas_t;
extern gotoblas_t *gotoblas;
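
The slots added above expose the new axpby and omatcopy extension kernels through the dispatch table. For orientation only (these are not the optimized kernels the pointers will reference): ?axpby_k computes y := beta*y + alpha*x, and ?omatcopy_k_{cn,ct,rn,rt} perform an out-of-place scaled copy or transpose, B := alpha*A or alpha*A^T, in column- or row-major storage. A minimal reference sketch in plain C, using hypothetical helper names:

/* Reference semantics only; strides assumed positive for brevity. */
static void axpby_ref(long n, float alpha, const float *x, long incx,
                      float beta, float *y, long incy)
{
    for (long i = 0; i < n; i++)
        y[i * incy] = beta * y[i * incy] + alpha * x[i * incx];
}

/* B := alpha * A for a column-major rows x cols matrix (the "_cn" case). */
static void omatcopy_cn_ref(long rows, long cols, float alpha,
                            const float *a, long lda, float *b, long ldb)
{
    for (long j = 0; j < cols; j++)
        for (long i = 0; i < rows; i++)
            b[i + j * ldb] = alpha * a[i + j * lda];
}
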
@@ -865,7 +902,7 @@ extern gotoblas_t *gotoblas;
#else
-#define DTB_ENTRIES DTB_DEFAULT_ENTRIES
+#define DTB_ENTRIES DTB_DEFAULT_ENTRIES
#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A
#define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B
diff --git a/common_power.h b/common_power.h
index 34a6153..f88f527 100644
--- a/common_power.h
+++ b/common_power.h
@@ -114,7 +114,7 @@ static inline unsigned long getstackaddr(void){
__asm__ __volatile__ ("mr %0, 1"
: "=r"(addr) : : "memory");
- return addr;
+ return addr;
};
#if defined(OS_LINUX) || defined(OS_AIX)
diff --git a/common_reference.h b/common_reference.h
index be151e0..75bae1f 100644
--- a/common_reference.h
+++ b/common_reference.h
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -65,5 +65,5 @@ void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *);
double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*);
FLOATRET BLASFUNC_REF(samax) (blasint *, float *, blasint *);
-
+
#endif
diff --git a/common_s.h b/common_s.h
index 267c5f3..4e9b6db 100644
--- a/common_s.h
+++ b/common_s.h
@@ -146,6 +146,14 @@
#define SNEG_TCOPY sneg_tcopy
#define SLASWP_NCOPY slaswp_ncopy
+#define SAXPBY_K saxpby_k
+
+#define SOMATCOPY_K_CN somatcopy_k_cn
+#define SOMATCOPY_K_RN somatcopy_k_rn
+#define SOMATCOPY_K_CT somatcopy_k_ct
+#define SOMATCOPY_K_RT somatcopy_k_rt
+
+
#else
#define SAMAX_K gotoblas -> samax_k
@@ -259,6 +267,14 @@
#define SNEG_TCOPY gotoblas -> sneg_tcopy
#define SLASWP_NCOPY gotoblas -> slaswp_ncopy
+#define SAXPBY_K gotoblas -> saxpby_k
+
+#define SOMATCOPY_K_CN gotoblas -> somatcopy_k_cn
+#define SOMATCOPY_K_RN gotoblas -> somatcopy_k_rn
+#define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct
+#define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt
+
+
#endif
#define SGEMM_NN sgemm_nn
diff --git a/common_sparc.h b/common_sparc.h
index daa2e49..87ef752 100644
--- a/common_sparc.h
+++ b/common_sparc.h
@@ -130,7 +130,7 @@ static __inline int blas_quickdivide(blasint x, blasint y){
#define FSQRT fsqrts
#define FDIV fdivs
#endif
-
+
#define HALT prefetch [%g0], 5
#define FMADDS(rs1, rs2, rs3, rd) \
@@ -170,19 +170,19 @@ static __inline int blas_quickdivide(blasint x, blasint y){
.word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x7e << 5))
#ifndef DOUBLE
-#define FCLR(a) FCLRS(a)
-#define FONE(a) FONES(a)
-#define FMADD(a, b, c, d) FMADDS(a, b, c, d)
-#define FMSUB(a, b, c, d) FMSUBS(a, b, c, d)
-#define FNMADD(a, b, c, d) FNMADDS(a, b, c, d)
-#define FNMSUB(a, b, c, d) FNMSUBS(a, b, c, d)
+#define FCLR(a) FCLRS(a)
+#define FONE(a) FONES(a)
+#define FMADD(a, b, c, d) FMADDS(a, b, c, d)
+#define FMSUB(a, b, c, d) FMSUBS(a, b, c, d)
+#define FNMADD(a, b, c, d) FNMADDS(a, b, c, d)
+#define FNMSUB(a, b, c, d) FNMSUBS(a, b, c, d)
#else
-#define FCLR(a) FCLRD(a)
-#define FONE(a) FONED(a)
-#define FMADD(a, b, c, d) FMADDD(a, b, c, d)
-#define FMSUB(a, b, c, d) FMSUBD(a, b, c, d)
-#define FNMADD(a, b, c, d) FNMADDD(a, b, c, d)
-#define FNMSUB(a, b, c, d) FNMSUBD(a, b, c, d)
+#define FCLR(a) FCLRD(a)
+#define FONE(a) FONED(a)
+#define FMADD(a, b, c, d) FMADDD(a, b, c, d)
+#define FMSUB(a, b, c, d) FMSUBD(a, b, c, d)
+#define FNMADD(a, b, c, d) FNMADDD(a, b, c, d)
+#define FNMSUB(a, b, c, d) FNMSUBD(a, b, c, d)
#endif
#ifndef F_INTERFACE
diff --git a/common_thread.h b/common_thread.h
index ad386a4..bd96444 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -176,7 +176,7 @@ int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer);
int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
void *a, BLASLONG lda,
- void *b, BLASLONG ldb,
+ void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int threads);
int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
@@ -187,14 +187,14 @@ int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*functio
int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG);
-int trsm_thread(int mode, BLASLONG m, BLASLONG n,
+int trsm_thread(int mode, BLASLONG m, BLASLONG n,
double alpha_r, double alpha_i,
void *a, BLASLONG lda,
void *c, BLASLONG ldc, int (*function)(), void *buffer);
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
-int beta_thread(int mode, BLASLONG m, BLASLONG n,
+int beta_thread(int mode, BLASLONG m, BLASLONG n,
double alpha_r, double alpha_i,
void *c, BLASLONG ldc, int (*fuction)());
diff --git a/common_x86.h b/common_x86.h
index 5f42843..f97fd34 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -55,7 +55,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
do {
while (*address) {YIELDING;};
-
+
__asm__ __volatile__(
"xchgl %0, %1\n"
: "=r"(ret), "=m"(*address)
@@ -70,8 +70,8 @@ static __inline unsigned long long rpcc(void){
unsigned int a, d;
__asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d));
-
- return ((unsigned long long)a + ((unsigned long long)d << 32));
+
+ return ((unsigned long long)a + ((unsigned long long)d << 32));
};
static __inline unsigned long getstackaddr(void){
@@ -80,7 +80,7 @@ static __inline unsigned long getstackaddr(void){
__asm__ __volatile__ ("mov %%esp, %0"
: "=r"(addr) : : "memory");
- return addr;
+ return addr;
};
@@ -365,9 +365,9 @@ REALNAME:
#ifndef ALIGN_6
#define ALIGN_6 .align 64
#endif
-// ffreep %st(0).
+// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
-// Please check out http://www.sandpile.org/x86/opc_fpu.htm
+// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
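
For context on the ffreep fallback just above: DF C0 is the encoding of ffreep %st(0), and the trailing '#' in the macro comments out whatever operand follows in the assembly source, so 'ffreep %st(0)' expands to '.byte 0xdf, 0xc0 # %st(0)'. An illustrative, x86-only way (not part of this patch) to emit the same bytes from C inline assembly:

/* Pop the x87 register stack via the raw ffreep opcode, mirroring the macro
 * above for assemblers that lack the mnemonic. Illustrative sketch only. */
static inline void ffreep_st0(void)
{
    __asm__ __volatile__(".byte 0xdf, 0xc0");   /* ffreep %st(0) */
}
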
diff --git a/common_x86_64.h b/common_x86_64.h
index 39e5a5e..0f842ee 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -60,7 +60,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
do {
while (*address) {YIELDING;};
-
+
__asm__ __volatile__(
"xchgl %0, %1\n"
: "=r"(ret), "=m"(*address)
@@ -74,8 +74,8 @@ static __inline BLASULONG rpcc(void){
BLASULONG a, d;
__asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d));
-
- return ((BLASULONG)a + ((BLASULONG)d << 32));
+
+ return ((BLASULONG)a + ((BLASULONG)d << 32));
}
#define RPCC64BIT
@@ -86,7 +86,7 @@ static __inline BLASULONG getstackaddr(void){
__asm__ __volatile__ ("movq %%rsp, %0"
: "=r"(addr) : : "memory");
- return addr;
+ return addr;
}
static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
@@ -457,9 +457,9 @@ REALNAME:
#define ALIGN_6 .align 64
#endif
-// ffreep %st(0).
+// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
-// Please check out http://www.sandpile.org/x86/opc_fpu.htm
+// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
diff --git a/common_z.h b/common_z.h
index 8832cac..133dea8 100644
--- a/common_z.h
+++ b/common_z.h
@@ -209,6 +209,18 @@
#define ZNEG_TCOPY zneg_tcopy
#define ZLASWP_NCOPY zlaswp_ncopy
+#define ZAXPBY_K zaxpby_k
+
+#define ZOMATCOPY_K_CN zomatcopy_k_cn
+#define ZOMATCOPY_K_RN zomatcopy_k_rn
+#define ZOMATCOPY_K_CT zomatcopy_k_ct
+#define ZOMATCOPY_K_RT zomatcopy_k_rt
+#define ZOMATCOPY_K_CNC zomatcopy_k_cnc
+#define ZOMATCOPY_K_RNC zomatcopy_k_rnc
+#define ZOMATCOPY_K_CTC zomatcopy_k_ctc
+#define ZOMATCOPY_K_RTC zomatcopy_k_rtc
+
+
#else
#define ZAMAX_K gotoblas -> zamax_k
@@ -380,6 +392,17 @@
#define ZNEG_TCOPY gotoblas -> zneg_tcopy
#define ZLASWP_NCOPY gotoblas -> zlaswp_ncopy
+#define ZAXPBY_K gotoblas -> zaxpby_k
+
+#define ZOMATCOPY_K_CN gotoblas -> zomatcopy_k_cn
+#define ZOMATCOPY_K_RN gotoblas -> zomatcopy_k_rn
+#define ZOMATCOPY_K_CT gotoblas -> zomatcopy_k_ct
+#define ZOMATCOPY_K_RT gotoblas -> zomatcopy_k_rt
+#define ZOMATCOPY_K_CNC gotoblas -> zomatcopy_k_cnc
+#define ZOMATCOPY_K_RNC gotoblas -> zomatcopy_k_rnc
+#define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc
+#define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc
+
#endif
#define ZGEMM_NN zgemm_nn
diff --git a/cpuid.S b/cpuid.S
index 3f7bf5f..851fe34 100644
--- a/cpuid.S
+++ b/cpuid.S
@@ -39,10 +39,10 @@
#if defined(__APPLE__) && defined(__i386__)
/* Quick hack for Darwin/x86 */
-
+
.text
.globl _cpuid
-_cpuid:
+_cpuid:
pushl %esi
pushl %ebx
diff --git a/cpuid_alpha.c b/cpuid_alpha.c
index adcc314..58dccde 100644
--- a/cpuid_alpha.c
+++ b/cpuid_alpha.c
@@ -50,7 +50,7 @@ int implver(void){
#endif
return arch;
}
-
+
void get_architecture(void){
printf("ALPHA");
}
@@ -67,7 +67,7 @@ void get_cpuconfig(void){
printf("#define EV%d\n", implver() + 4);
switch (implver()){
- case 0:
+ case 0:
printf("#define L1_DATA_SIZE 16384\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 2097152\n");
@@ -76,7 +76,7 @@ void get_cpuconfig(void){
printf("#define DTB_SIZE 8192\n");
break;
- case 1:
+ case 1:
printf("#define L1_DATA_SIZE 16384\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 2097152\n");
@@ -85,7 +85,7 @@ void get_cpuconfig(void){
printf("#define DTB_SIZE 8192\n");
break;
- case 2:
+ case 2:
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 4194304\n");
diff --git a/cpuid_arm.c b/cpuid_arm.c
index efd1369..809ef3d 100644
--- a/cpuid_arm.c
+++ b/cpuid_arm.c
@@ -67,7 +67,7 @@ int get_feature(char *search)
t = strtok(p," ");
while( t = strtok(NULL," "))
- {
+ {
if (!strcmp(t, search)) { return(1); }
}
@@ -102,7 +102,7 @@ int detect(void)
if(p != NULL)
{
- if (strstr(p, "ARMv7"))
+ if (strstr(p, "ARMv7"))
{
if ( get_feature("vfpv4"))
return CPU_ARMV7;
@@ -116,7 +116,7 @@ int detect(void)
}
- if (strstr(p, "ARMv6"))
+ if (strstr(p, "ARMv6"))
{
if ( get_feature("vfp"))
return CPU_ARMV6;
@@ -248,7 +248,7 @@ void get_features(void)
t = strtok(p," ");
while( t = strtok(NULL," "))
- {
+ {
if (!strcmp(t, "vfp")) { printf("HAVE_VFP=1\n"); continue; }
if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; }
if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; }
diff --git a/cpuid_ia64.c b/cpuid_ia64.c
index d372182..e7e200c 100644
--- a/cpuid_ia64.c
+++ b/cpuid_ia64.c
@@ -45,7 +45,7 @@
#include <ia64intrin.h>
#endif
-static inline unsigned long cpuid(unsigned long regnum){
+static inline unsigned long cpuid(unsigned long regnum){
unsigned long value;
#ifdef __ECC
@@ -65,7 +65,7 @@ int get_vendor(void){
cpuid0 = cpuid(0);
cpuid1 = cpuid(1);
-
+
*(unsigned long *)(&vendor[0]) = cpuid0;
*(unsigned long *)(&vendor[8]) = cpuid1;
vendor[17] = (char)0;
@@ -79,7 +79,7 @@ int get_cputype(int gettype){
unsigned long cpuid3;
cpuid3 = cpuid(3);
-
+
switch (gettype) {
case GET_ARCHREV :
return BITMASK(cpuid3, 32, 0xff);
diff --git a/cpuid_mips.c b/cpuid_mips.c
index 45171da..fad1057 100644
--- a/cpuid_mips.c
+++ b/cpuid_mips.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
diff --git a/cpuid_power.c b/cpuid_power.c
index 9fd9ec9..2fc333d 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -134,7 +134,7 @@ int detect(void){
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970;
-
+
return CPUTYPE_PPC970;
#endif
}
diff --git a/cpuid_x86.c b/cpuid_x86.c
index 7bcd168..53016e1 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -40,6 +40,7 @@
#include <string.h>
#include "cpuid.h"
+/*
#ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
@@ -50,6 +51,7 @@
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#endif
+*/
#ifndef CPUIDEMU
@@ -91,7 +93,7 @@ void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *
if ((current < start) || (current > stop)) current = start;
while ((count > 0) && (idlist[current].id != op)) {
-
+
current ++;
if (current > stop) current = start;
count --;
@@ -132,7 +134,7 @@ int support_avx(){
#ifndef NO_AVX
int eax, ebx, ecx, edx;
int ret=0;
-
+
cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
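
For context on the support_avx() logic this hunk touches: CPUID leaf 1 reports XSAVE (ECX bit 26), OSXSAVE (bit 27) and AVX (bit 28), and XGETBV with ECX=0 returns XCR0, whose bits 1 and 2 indicate that the OS saves SSE and AVX (YMM) state. A standalone, illustrative sketch of that check, not the file's own function:

#include <stdio.h>

/* Returns 1 when the CPU advertises AVX and the OS enables YMM state saving. */
static int avx_usable(void)
{
    unsigned int eax, ebx, ecx, edx;
    __asm__ __volatile__("cpuid"
                         : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                         : "a"(1), "c"(0));
    if ((ecx & (1u << 28)) && (ecx & (1u << 27)) && (ecx & (1u << 26))) {
        unsigned int lo, hi;
        /* 0x0f 0x01 0xd0 is the xgetbv encoding, usable with old assemblers. */
        __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0"
                             : "=a"(lo), "=d"(hi) : "c"(0));
        return (lo & 6u) == 6u;   /* XCR0 bits 1|2: SSE and AVX state enabled */
    }
    return 0;
}

int main(void)
{
    printf("AVX usable: %d\n", avx_usable());
    return 0;
}
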
@@ -152,7 +154,7 @@ int get_vendor(void){
char vendor[13];
cpuid(0, &eax, &ebx, &ecx, &edx);
-
+
*(int *)(&vendor[0]) = ebx;
*(int *)(&vendor[4]) = edx;
*(int *)(&vendor[8]) = ecx;
@@ -173,7 +175,7 @@ int get_vendor(void){
return VENDOR_UNKNOWN;
}
-
+
int get_cputype(int gettype){
int eax, ebx, ecx, edx;
int extend_family, family;
@@ -182,7 +184,7 @@ int get_cputype(int gettype){
int feature = 0;
cpuid(1, &eax, &ebx, &ecx, &edx);
-
+
switch (gettype) {
case GET_EXFAMILY :
return BITMASK(eax, 20, 0xff);
@@ -252,12 +254,12 @@ int get_cputype(int gettype){
}
return feature;
}
-
+
int get_cacheinfo(int type, cache_info_t *cacheinfo){
int eax, ebx, ecx, edx, cpuid_level;
int info[15];
int i;
- cache_info_t LC1, LD1, L2, L3,
+ cache_info_t LC1, LD1, L2, L3,
ITB, DTB, LITB, LDTB,
L2ITB, L2DTB, L2LITB, L2LDTB;
@@ -283,22 +285,22 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
info[ 0] = BITMASK(eax, 8, 0xff);
info[ 1] = BITMASK(eax, 16, 0xff);
info[ 2] = BITMASK(eax, 24, 0xff);
-
+
info[ 3] = BITMASK(ebx, 0, 0xff);
info[ 4] = BITMASK(ebx, 8, 0xff);
info[ 5] = BITMASK(ebx, 16, 0xff);
info[ 6] = BITMASK(ebx, 24, 0xff);
-
+
info[ 7] = BITMASK(ecx, 0, 0xff);
info[ 8] = BITMASK(ecx, 8, 0xff);
info[ 9] = BITMASK(ecx, 16, 0xff);
info[10] = BITMASK(ecx, 24, 0xff);
-
+
info[11] = BITMASK(edx, 0, 0xff);
info[12] = BITMASK(edx, 8, 0xff);
info[13] = BITMASK(edx, 16, 0xff);
info[14] = BITMASK(edx, 24, 0xff);
-
+
for (i = 0; i < 15; i++){
switch (info[i]){
@@ -864,7 +866,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
LITB.associative = BITMASK(eax, 8, 0xff);
if (LITB.associative == 0xff) LITB.associative = 0;
LITB.linesize = BITMASK(eax, 0, 0xff);
-
+
DTB.size = 4;
DTB.associative = BITMASK(ebx, 24, 0xff);
if (DTB.associative == 0xff) DTB.associative = 0;
@@ -896,7 +898,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
L2LITB.associative = BITMASK(eax, 8, 0xff);
if (L2LITB.associative == 0xff) L2LITB.associative = 0;
L2LITB.linesize = BITMASK(eax, 0, 0xff);
-
+
L2DTB.size = 4;
L2DTB.associative = BITMASK(ebx, 24, 0xff);
if (L2DTB.associative == 0xff) L2DTB.associative = 0;
@@ -920,7 +922,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
}
switch (type) {
-
+
case CACHE_INFO_L1_I :
*cacheinfo = LC1;
break;
@@ -982,7 +984,7 @@ int get_cpuname(void){
return CPUTYPE_PENTIUM;
case 0x6:
switch (exmodel) {
- case 0:
+ case 0:
switch (model) {
case 1:
case 3:
@@ -1022,8 +1024,8 @@ int get_cpuname(void){
case 2:
switch (model) {
case 5:
- //Intel Core (Clarkdale) / Core (Arrandale)
- // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
+ //Intel Core (Clarkdale) / Core (Arrandale)
+ // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
return CPUTYPE_NEHALEM;
case 10:
@@ -1060,7 +1062,11 @@ int get_cpuname(void){
case 12:
case 15:
if(support_avx())
+#ifndef NO_AVX2
return CPUTYPE_HASWELL;
+#else
+ return CPUTYPE_SANDYBRIDGE;
+#endif
else
return CPUTYPE_NEHALEM;
}
@@ -1070,11 +1076,15 @@ int get_cpuname(void){
case 5:
case 6:
if(support_avx())
+#ifndef NO_AVX2
return CPUTYPE_HASWELL;
+#else
+ return CPUTYPE_SANDYBRIDGE;
+#endif
else
return CPUTYPE_NEHALEM;
}
- break;
+ break;
}
break;
case 0x7:
@@ -1119,7 +1129,7 @@ int get_cpuname(void){
if(support_avx())
return CPUTYPE_PILEDRIVER;
else
- return CPUTYPE_BARCELONA; //OS don't support AVX.
+ return CPUTYPE_BARCELONA; //OS don't support AVX.
}
break;
case 5:
@@ -1303,7 +1313,7 @@ static char *lowercpuname[] = {
static char *corename[] = {
"UNKOWN",
- "80486",
+ "80486",
"P5",
"P6",
"KATMAI",
@@ -1331,7 +1341,7 @@ static char *corename[] = {
static char *corename_lower[] = {
"unknown",
- "80486",
+ "80486",
"p5",
"p6",
"katmai",
@@ -1432,8 +1442,8 @@ int get_coretype(void){
case 2:
switch (model) {
case 5:
- //Intel Core (Clarkdale) / Core (Arrandale)
- // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
+ //Intel Core (Clarkdale) / Core (Arrandale)
+ // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
return CORE_NEHALEM;
case 10:
@@ -1469,7 +1479,11 @@ int get_coretype(void){
case 12:
case 15:
if(support_avx())
+#ifndef NO_AVX2
return CORE_HASWELL;
+#else
+ return CORE_SANDYBRIDGE;
+#endif
else
return CORE_NEHALEM;
}
@@ -1479,11 +1493,15 @@ int get_coretype(void){
case 5:
case 6:
if(support_avx())
+#ifndef NO_AVX2
return CORE_HASWELL;
+#else
+ return CORE_SANDYBRIDGE;
+#endif
else
return CORE_NEHALEM;
}
- break;
+ break;
}
break;
@@ -1497,8 +1515,8 @@ int get_coretype(void){
if (family <= 0x5) return CORE_80486;
if (family <= 0xe) return CORE_ATHLON;
if (family == 0xf){
- if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
- else if (exfamily == 5) return CORE_BOBCAT;
+ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
+ else if (exfamily == 5) return CORE_BOBCAT;
else if (exfamily == 6) {
switch (model) {
case 1:
@@ -1511,7 +1529,7 @@ int get_coretype(void){
if(support_avx())
return CORE_PILEDRIVER;
else
- return CORE_BARCELONA; //OS don't support AVX.
+ return CORE_BARCELONA; //OS don't support AVX.
}
}else return CORE_BARCELONA;
}
@@ -1545,14 +1563,14 @@ void get_cpuconfig(void){
printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative);
printf("#define L1_CODE_LINESIZE %d\n", info.linesize);
}
-
+
get_cacheinfo(CACHE_INFO_L1_D, &info);
if (info.size > 0) {
printf("#define L1_DATA_SIZE %d\n", info.size * 1024);
printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative);
printf("#define L1_DATA_LINESIZE %d\n", info.linesize);
}
-
+
get_cacheinfo(CACHE_INFO_L2, &info);
if (info.size > 0) {
printf("#define L2_SIZE %d\n", info.size * 1024);
@@ -1565,21 +1583,21 @@ void get_cpuconfig(void){
printf("#define L2_LINESIZE 64\n");
}
-
+
get_cacheinfo(CACHE_INFO_L3, &info);
if (info.size > 0) {
printf("#define L3_SIZE %d\n", info.size * 1024);
printf("#define L3_ASSOCIATIVE %d\n", info.associative);
printf("#define L3_LINESIZE %d\n", info.linesize);
}
-
+
get_cacheinfo(CACHE_INFO_L1_ITB, &info);
if (info.size > 0) {
printf("#define ITB_SIZE %d\n", info.size * 1024);
printf("#define ITB_ASSOCIATIVE %d\n", info.associative);
printf("#define ITB_ENTRIES %d\n", info.linesize);
}
-
+
get_cacheinfo(CACHE_INFO_L1_DTB, &info);
if (info.size > 0) {
printf("#define DTB_SIZE %d\n", info.size * 1024);
@@ -1589,7 +1607,7 @@ void get_cpuconfig(void){
//fall back for some virtual machines.
printf("#define DTB_DEFAULT_ENTRIES 32\n");
}
-
+
features = get_cputype(GET_FEATURE);
if (features & HAVE_CMOV ) printf("#define HAVE_CMOV\n");
@@ -1612,7 +1630,7 @@ void get_cpuconfig(void){
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
if (features & HAVE_128BITFPU) printf("#define HAVE_128BITFPU\n");
if (features & HAVE_FASTMOVU) printf("#define HAVE_FASTMOVU\n");
-
+
printf("#define NUM_SHAREDCACHE %d\n", get_cputype(GET_NUMSHARE) + 1);
printf("#define NUM_CORES %d\n", get_cputype(GET_NUMCORES) + 1);
diff --git a/ctest/Makefile b/ctest/Makefile
index 0991168..70d3f97 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -1,5 +1,5 @@
#
-# The Makefile compiles c wrappers and testers for CBLAS.
+# The Makefile compiles c wrappers and testers for CBLAS.
#
TOPDIR = ..
@@ -27,13 +27,13 @@ ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o
ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o
-ztestl1o = c_zblas1.o
+ztestl1o = c_zblas1.o
ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o
ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o
-all :: all1 all2 all3
+all :: all1 all2 all3
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
ifeq ($(USE_OPENMP), 1)
@@ -75,10 +75,10 @@ else
endif
clean ::
- rm -f x*
+ rm -f x*
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
-CEXTRALIB =
+CEXTRALIB =
# Single real
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
@@ -94,7 +94,7 @@ xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
-
+
# Single complex
xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
@@ -103,12 +103,12 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
-# Double complex
+# Double complex
xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
-
+
include $(TOPDIR)/Makefile.tail
diff --git a/ctest/c_c2chke.c b/ctest/c_c2chke.c
index 611cc21..eb5b990 100644
--- a/ctest/c_c2chke.c
+++ b/ctest/c_c2chke.c
@@ -26,11 +26,11 @@ void chkxer(void) {
void F77_c2chke(char *rout) {
char *sf = ( rout ) ;
- float A[2] = {0.0,0.0},
- X[2] = {0.0,0.0},
- Y[2] = {0.0,0.0},
+ float A[2] = {0.0,0.0},
+ X[2] = {0.0,0.0},
+ Y[2] = {0.0,0.0},
ALPHA[2] = {0.0,0.0},
- BETA[2] = {0.0,0.0},
+ BETA[2] = {0.0,0.0},
RALPHA = 0.0;
extern int cblas_info, cblas_lerr, cblas_ok;
extern int RowMajorStrg;
@@ -48,588 +48,588 @@ void F77_c2chke(char *rout) {
if (strncmp( sf,"cblas_cgemv",11)==0) {
cblas_rout = "cblas_cgemv";
cblas_info = 1;
- cblas_cgemv(INVALID, CblasNoTrans, 0, 0,
+ cblas_cgemv(INVALID, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_cgemv(CblasColMajor, INVALID, 0, 0,
+ cblas_cgemv(CblasColMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_cgemv(CblasColMajor, CblasNoTrans, INVALID, 0,
+ cblas_cgemv(CblasColMajor, CblasNoTrans, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_cgemv(CblasColMajor, CblasNoTrans, 0, INVALID,
+ cblas_cgemv(CblasColMajor, CblasNoTrans, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_cgemv(CblasColMajor, CblasNoTrans, 2, 0,
+ cblas_cgemv(CblasColMajor, CblasNoTrans, 2, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0,
+ cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = FALSE;
- cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0,
+ cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE;
- cblas_cgemv(CblasRowMajor, INVALID, 0, 0,
+ cblas_cgemv(CblasRowMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_cgemv(CblasRowMajor, CblasNoTrans, INVALID, 0,
+ cblas_cgemv(CblasRowMajor, CblasNoTrans, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, INVALID,
+ cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 2,
+ cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0,
+ cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = TRUE;
- cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0,
+ cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_cgbmv",11)==0) {
cblas_rout = "cblas_cgbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_cgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0,
+ cblas_cgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_cgbmv(CblasColMajor, INVALID, 0, 0, 0, 0,
+ cblas_cgbmv(CblasColMajor, INVALID, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_cgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ cblas_cgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0,
+ cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0,
+ cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_cgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID,
+ cblas_cgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0,
+ cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
- cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
- cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_cgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0,
+ cblas_cgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_cgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ cblas_cgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0,
+ cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0,
+ cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_cgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID,
+ cblas_cgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0,
+ cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
- cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
- cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_chemv",11)==0) {
cblas_rout = "cblas_chemv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_chemv(INVALID, CblasUpper, 0,
+ cblas_chemv(INVALID, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_chemv(CblasColMajor, INVALID, 0,
+ cblas_chemv(CblasColMajor, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_chemv(CblasColMajor, CblasUpper, INVALID,
+ cblas_chemv(CblasColMajor, CblasUpper, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_chemv(CblasColMajor, CblasUpper, 2,
+ cblas_chemv(CblasColMajor, CblasUpper, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_chemv(CblasColMajor, CblasUpper, 0,
+ cblas_chemv(CblasColMajor, CblasUpper, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
- cblas_chemv(CblasColMajor, CblasUpper, 0,
+ cblas_chemv(CblasColMajor, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_chemv(CblasRowMajor, INVALID, 0,
+ cblas_chemv(CblasRowMajor, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_chemv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_chemv(CblasRowMajor, CblasUpper, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_chemv(CblasRowMajor, CblasUpper, 2,
+ cblas_chemv(CblasRowMajor, CblasUpper, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_chemv(CblasRowMajor, CblasUpper, 0,
+ cblas_chemv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
- cblas_chemv(CblasRowMajor, CblasUpper, 0,
+ cblas_chemv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_chbmv",11)==0) {
cblas_rout = "cblas_chbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_chbmv(INVALID, CblasUpper, 0, 0,
+ cblas_chbmv(INVALID, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_chbmv(CblasColMajor, INVALID, 0, 0,
+ cblas_chbmv(CblasColMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_chbmv(CblasColMajor, CblasUpper, INVALID, 0,
+ cblas_chbmv(CblasColMajor, CblasUpper, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_chbmv(CblasColMajor, CblasUpper, 0, INVALID,
+ cblas_chbmv(CblasColMajor, CblasUpper, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_chbmv(CblasColMajor, CblasUpper, 0, 1,
+ cblas_chbmv(CblasColMajor, CblasUpper, 0, 1,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_chbmv(CblasColMajor, CblasUpper, 0, 0,
+ cblas_chbmv(CblasColMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = FALSE;
- cblas_chbmv(CblasColMajor, CblasUpper, 0, 0,
+ cblas_chbmv(CblasColMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_chbmv(CblasRowMajor, INVALID, 0, 0,
+ cblas_chbmv(CblasRowMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_chbmv(CblasRowMajor, CblasUpper, INVALID, 0,
+ cblas_chbmv(CblasRowMajor, CblasUpper, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_chbmv(CblasRowMajor, CblasUpper, 0, INVALID,
+ cblas_chbmv(CblasRowMajor, CblasUpper, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_chbmv(CblasRowMajor, CblasUpper, 0, 1,
+ cblas_chbmv(CblasRowMajor, CblasUpper, 0, 1,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0,
+ cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = TRUE;
- cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0,
+ cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_chpmv",11)==0) {
cblas_rout = "cblas_chpmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_chpmv(INVALID, CblasUpper, 0,
+ cblas_chpmv(INVALID, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_chpmv(CblasColMajor, INVALID, 0,
+ cblas_chpmv(CblasColMajor, INVALID, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_chpmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_chpmv(CblasColMajor, CblasUpper, INVALID,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_chpmv(CblasColMajor, CblasUpper, 0,
+ cblas_chpmv(CblasColMajor, CblasUpper, 0,
ALPHA, A, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_chpmv(CblasColMajor, CblasUpper, 0,
+ cblas_chpmv(CblasColMajor, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_chpmv(CblasRowMajor, INVALID, 0,
+ cblas_chpmv(CblasRowMajor, INVALID, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_chpmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_chpmv(CblasRowMajor, CblasUpper, INVALID,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_chpmv(CblasRowMajor, CblasUpper, 0,
+ cblas_chpmv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_chpmv(CblasRowMajor, CblasUpper, 0,
+ cblas_chpmv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ctrmv",11)==0) {
cblas_rout = "cblas_ctrmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ctrmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ctrmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ctrmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ctrmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ctrmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ctrmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ctrmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ctrmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ctrmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ctbmv",11)==0) {
cblas_rout = "cblas_ctbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ctbmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ctbmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ctbmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ctbmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ctbmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ctbmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ctbmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ctbmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ctbmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ctpmv",11)==0) {
cblas_rout = "cblas_ctpmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ctpmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ctpmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ctpmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ctpmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ctpmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ctpmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ctpmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ctpmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ctpmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ctpmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ctrsv",11)==0) {
cblas_rout = "cblas_ctrsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ctrsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ctrsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ctrsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ctrsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ctrsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ctrsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ctrsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ctrsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ctrsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ctbsv",11)==0) {
cblas_rout = "cblas_ctbsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ctbsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ctbsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ctbsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ctbsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ctbsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ctbsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ctbsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ctbsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ctbsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ctpsv",11)==0) {
cblas_rout = "cblas_ctpsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ctpsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ctpsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ctpsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ctpsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ctpsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ctpsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ctpsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ctpsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ctpsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ctpsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_cgeru",10)==0) {
@@ -818,7 +818,7 @@ void F77_c2chke(char *rout) {
cblas_info = 6; RowMajorStrg = FALSE;
cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A );
chkxer();
- }
+ }
if (cblas_ok == TRUE)
printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout);
else
diff --git a/ctest/c_c3chke.c b/ctest/c_c3chke.c
index 2951552..1c133fb 100644
--- a/ctest/c_c3chke.c
+++ b/ctest/c_c3chke.c
@@ -30,7 +30,7 @@ void F77_c3chke(char * rout) {
B[4] = {0.0,0.0,0.0,0.0},
C[4] = {0.0,0.0,0.0,0.0},
ALPHA[2] = {0.0,0.0},
- BETA[2] = {0.0,0.0},
+ BETA[2] = {0.0,0.0},
RALPHA = 0.0, RBETA = 0.0;
extern int cblas_info, cblas_lerr, cblas_ok;
extern int RowMajorStrg;
@@ -49,15 +49,15 @@ void F77_c3chke(char * rout) {
cblas_rout = "cblas_cgemm" ;
cblas_info = 1;
- cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
+ cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
- cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
+ cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
- cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
+ cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
@@ -272,7 +272,7 @@ void F77_c3chke(char * rout) {
cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
-
+
} else if (strncmp( sf,"cblas_chemm" ,11)==0) {
cblas_rout = "cblas_chemm" ;
@@ -1696,7 +1696,7 @@ void F77_c3chke(char * rout) {
cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
-
+
}
if (cblas_ok == 1 )
diff --git a/ctest/c_cblas1.c b/ctest/c_cblas1.c
index f5ffc14..d723fd6 100644
--- a/ctest/c_cblas1.c
+++ b/ctest/c_cblas1.c
@@ -16,21 +16,21 @@ void F77_caxpy(const int *N, const void *alpha, void *X,
return;
}
-void F77_ccopy(const int *N, void *X, const int *incX,
+void F77_ccopy(const int *N, void *X, const int *incX,
void *Y, const int *incY)
{
cblas_ccopy(*N, X, *incX, Y, *incY);
return;
}
-void F77_cdotc(const int *N, void *X, const int *incX,
+void F77_cdotc(const int *N, void *X, const int *incX,
void *Y, const int *incY, void *dotc)
{
cblas_cdotc_sub(*N, X, *incX, Y, *incY, dotc);
return;
}
-void F77_cdotu(const int *N, void *X, const int *incX,
+void F77_cdotu(const int *N, void *X, const int *incX,
void *Y, const int *incY,void *dotu)
{
cblas_cdotu_sub(*N, X, *incX, Y, *incY, dotu);
diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c
index 7a886ac..8fbe3b0 100644
--- a/ctest/c_cblas2.c
+++ b/ctest/c_cblas2.c
@@ -8,9 +8,9 @@
#include "common.h"
#include "cblas_test.h"
-void F77_cgemv(int *order, char *transp, int *m, int *n,
+void F77_cgemv(int *order, char *transp, int *m, int *n,
const void *alpha,
- CBLAS_TEST_COMPLEX *a, int *lda, const void *x, int *incx,
+ CBLAS_TEST_COMPLEX *a, int *lda, const void *x, int *incx,
const void *beta, void *y, int *incy) {
CBLAS_TEST_COMPLEX *A;
@@ -38,9 +38,9 @@ void F77_cgemv(int *order, char *transp, int *m, int *n,
*m, *n, alpha, a, *lda, x, *incx, beta, y, *incy );
}
-void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
- CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
- CBLAS_TEST_COMPLEX *x, int *incx,
+void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
+ CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
+ CBLAS_TEST_COMPLEX *x, int *incx,
CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy) {
CBLAS_TEST_COMPLEX *A;
@@ -85,8 +85,8 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
*incx, beta, y, *incy );
}
-void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
- CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy,
+void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
+ CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy,
CBLAS_TEST_COMPLEX *a, int *lda){
CBLAS_TEST_COMPLEX *A;
@@ -114,8 +114,8 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
cblas_cgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda );
}
-void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
- CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy,
+void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
+ CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy,
CBLAS_TEST_COMPLEX *a, int *lda) {
CBLAS_TEST_COMPLEX *A;
int i,j,LDA;
@@ -165,7 +165,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
free(A);
}
else if (*order == TEST_COL_MJR)
- cblas_chemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx,
+ cblas_chemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx,
beta, y, *incy );
else
cblas_chemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx,
@@ -173,7 +173,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
}
void F77_chbmv(int *order, char *uplow, int *n, int *k,
- CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
+ CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta,
CBLAS_TEST_COMPLEX *y, int *incy){
@@ -186,7 +186,7 @@ int i,irow,j,jcol,LDA;
if (*order == TEST_ROW_MJR) {
if (uplo != CblasUpper && uplo != CblasLower )
- cblas_chbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x,
+ cblas_chbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x,
*incx, beta, y, *incy );
else {
LDA = *k+2;
@@ -237,7 +237,7 @@ int i,irow,j,jcol,LDA;
}
void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
- CBLAS_TEST_COMPLEX *ap, CBLAS_TEST_COMPLEX *x, int *incx,
+ CBLAS_TEST_COMPLEX *ap, CBLAS_TEST_COMPLEX *x, int *incx,
CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){
CBLAS_TEST_COMPLEX *A, *AP;
@@ -247,7 +247,7 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
get_uplo_type(uplow,&uplo);
if (*order == TEST_ROW_MJR) {
if (uplo != CblasUpper && uplo != CblasLower )
- cblas_chpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx,
+ cblas_chpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx,
beta, y, *incy);
else {
LDA = *n;
@@ -344,7 +344,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn,
}
}
}
- cblas_ctbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x,
+ cblas_ctbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x,
*incx);
free(A);
}
@@ -371,7 +371,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
if (uplo != CblasUpper && uplo != CblasLower )
- cblas_ctbsv(CblasRowMajor, UNDEFINED, trans, diag, *n, *k, a, *lda, x,
+ cblas_ctbsv(CblasRowMajor, UNDEFINED, trans, diag, *n, *k, a, *lda, x,
*incx);
else {
LDA = *k+2;
@@ -408,7 +408,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn,
}
}
}
- cblas_ctbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA,
+ cblas_ctbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA,
x, *incx);
free(A);
}
@@ -674,7 +674,7 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
if (uplo != CblasUpper && uplo != CblasLower )
- cblas_chpr2( CblasRowMajor, UNDEFINED, *n, alpha, x, *incx, y,
+ cblas_chpr2( CblasRowMajor, UNDEFINED, *n, alpha, x, *incx, y,
*incy, ap );
else {
LDA = *n;
@@ -752,7 +752,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha,
LDA = *n+1;
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX ));
- for( i=0; i<*n; i++ )
+ for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag;
@@ -786,7 +786,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
LDA = *n+1;
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
- for( i=0; i<*n; i++ )
+ for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag;
diff --git a/ctest/c_cblas3.c b/ctest/c_cblas3.c
index 9f0da6c..0b2f6b9 100644
--- a/ctest/c_cblas3.c
+++ b/ctest/c_cblas3.c
@@ -12,9 +12,9 @@
#define TEST_ROW_MJR 1
#define UNDEFINED -1
-void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n,
+void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n,
int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
- CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
+ CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
CBLAS_TEST_COMPLEX *c, int *ldc ) {
CBLAS_TEST_COMPLEX *A, *B, *C;
@@ -134,7 +134,7 @@ void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB,
+ cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB,
beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
@@ -146,10 +146,10 @@ void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
+ cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
beta, c, *ldc );
else
- cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
+ cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
beta, c, *ldc );
}
void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n,
@@ -190,7 +190,7 @@ void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n,
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
- cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB,
+ cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB,
beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
@@ -200,15 +200,15 @@ void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
+ cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
beta, c, *ldc );
else
- cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
+ cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
beta, c, *ldc );
}
void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k,
- float *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
+ float *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) {
int i,j,LDA,LDC;
@@ -245,7 +245,7 @@ void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta,
+ cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta,
C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ ) {
@@ -256,15 +256,15 @@ void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
+ cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
c, *ldc );
else
- cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
+ cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
c, *ldc );
}
void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k,
- CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
+ CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) {
int i,j,LDA,LDC;
@@ -301,7 +301,7 @@ void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta,
+ cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta,
C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ ) {
@@ -312,10 +312,10 @@ void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta,
+ cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta,
c, *ldc );
else
- cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta,
+ cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta,
c, *ldc );
}
void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k,
@@ -364,7 +364,7 @@ void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA,
+ cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA,
B, LDB, *beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ ) {
@@ -376,10 +376,10 @@ void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
+ cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
b, *ldb, *beta, c, *ldc );
else
- cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
+ cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
b, *ldb, *beta, c, *ldc );
}
void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k,
@@ -428,7 +428,7 @@ void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA,
+ cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA,
B, LDB, beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ ) {
@@ -440,14 +440,14 @@ void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
+ cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
else
- cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
+ cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
}
void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
- int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a,
+ int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a,
int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) {
int i,j,LDA,LDB;
CBLAS_TEST_COMPLEX *A, *B;
@@ -487,7 +487,7 @@ void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
- cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
A, LDA, B, LDB );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
@@ -498,15 +498,15 @@ void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
free(B);
}
else if (*order == TEST_COL_MJR)
- cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
else
- cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
}
void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
- int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a,
+ int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a,
int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) {
int i,j,LDA,LDB;
CBLAS_TEST_COMPLEX *A, *B;
@@ -546,7 +546,7 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
- cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
A, LDA, B, LDB );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
@@ -557,9 +557,9 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
free(B);
}
else if (*order == TEST_COL_MJR)
- cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
else
- cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
}
diff --git a/ctest/c_cblat2.f b/ctest/c_cblat2.f
index 545ba4b..d934ebb 100644
--- a/ctest/c_cblat2.f
+++ b/ctest/c_cblat2.f
@@ -348,13 +348,13 @@
160 IF (CORDER) THEN
CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
$ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC,
- $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z,
+ $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z,
$ 0 )
END IF
IF (RORDER) THEN
CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
$ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC,
- $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z,
+ $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z,
$ 1 )
END IF
GO TO 200
@@ -581,7 +581,7 @@
CTRANS = ' CblasNoTrans'
ELSE IF (TRANS.EQ.'T')THEN
CTRANS = ' CblasTrans'
- ELSE
+ ELSE
CTRANS = 'CblasConjTrans'
END IF
TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
@@ -684,7 +684,7 @@
*
* See what data changed inside subroutines.
*
-* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN
+* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN
ISAME( 1 ) = TRANS.EQ.TRANSS
ISAME( 2 ) = MS.EQ.M
ISAME( 3 ) = NS.EQ.N
@@ -925,7 +925,7 @@
UPLO = ICH( IC: IC )
IF (UPLO.EQ.'U')THEN
CUPLO = ' CblasUpper'
- ELSE
+ ELSE
CUPLO = ' CblasLower'
END IF
*
@@ -1284,7 +1284,7 @@
UPLO = ICHU( ICU: ICU )
IF (UPLO.EQ.'U')THEN
CUPLO = ' CblasUpper'
- ELSE
+ ELSE
CUPLO = ' CblasLower'
END IF
*
@@ -1294,7 +1294,7 @@
CTRANS = ' CblasNoTrans'
ELSE IF (TRANS.EQ.'T')THEN
CTRANS = ' CblasTrans'
- ELSE
+ ELSE
CTRANS = 'CblasConjTrans'
END IF
*
diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f
index b03d479..7d1743b 100644
--- a/ctest/c_cblat3.f
+++ b/ctest/c_cblat3.f
@@ -424,7 +424,7 @@
END
SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
$ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
- $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
$ IORDER )
*
* Tests CGEMM.
@@ -600,7 +600,7 @@
IF( REWI )
$ REWIND NTRA
CALL CCGEMM( IORDER, TRANSA, TRANSB, M, N,
- $ K, ALPHA, AA, LDA, BB, LDB,
+ $ K, ALPHA, AA, LDA, BB, LDB,
$ BETA, CC, LDC )
*
* Check if error-exit was taken incorrectly.
@@ -688,7 +688,7 @@
*
120 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
- CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
+ CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
$ M, N, K, ALPHA, LDA, LDB, BETA, LDC)
*
130 CONTINUE
@@ -724,24 +724,24 @@
CHARACTER*1 TRANSA, TRANSB
CHARACTER*12 SNAME
CHARACTER*14 CRC, CTA,CTB
-
+
IF (TRANSA.EQ.'N')THEN
CTA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CTA = ' CblasTrans'
- ELSE
+ ELSE
CTA = 'CblasConjTrans'
END IF
IF (TRANSB.EQ.'N')THEN
CTB = ' CblasNoTrans'
ELSE IF (TRANSB.EQ.'T')THEN
CTB = ' CblasTrans'
- ELSE
+ ELSE
CTB = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB
@@ -754,7 +754,7 @@
*
SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
$ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
- $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
$ IORDER )
*
* Tests CHEMM and CSYMM.
@@ -910,9 +910,9 @@
* Call the subroutine.
*
IF( TRACE )
- $ CALL CPRCN2(NTRA, NC, SNAME, IORDER,
- $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
- $ BETA, LDC)
+ $ CALL CPRCN2(NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
+ $ BETA, LDC)
IF( REWI )
$ REWIND NTRA
IF( CONJ )THEN
@@ -1015,7 +1015,7 @@
110 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA,
- $ LDB, BETA, LDC)
+ $ LDB, BETA, LDC)
*
120 CONTINUE
RETURN
@@ -1050,20 +1050,20 @@
CHARACTER*1 SIDE, UPLO
CHARACTER*12 SNAME
CHARACTER*14 CRC, CS,CU
-
+
IF (SIDE.EQ.'L')THEN
CS = ' CblasLeft'
- ELSE
+ ELSE
CS = ' CblasRight'
END IF
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
@@ -1401,22 +1401,22 @@
CHARACTER*1 SIDE, UPLO, TRANSA, DIAG
CHARACTER*12 SNAME
CHARACTER*14 CRC, CS, CU, CA, CD
-
+
IF (SIDE.EQ.'L')THEN
CS = ' CblasLeft'
- ELSE
+ ELSE
CS = ' CblasRight'
END IF
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (DIAG.EQ.'N')THEN
@@ -1426,7 +1426,7 @@
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
@@ -1787,22 +1787,22 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
@@ -1821,29 +1821,29 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC
9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') )
- 9994 FORMAT( 10X, 2( I3, ',' ),
+ 9994 FORMAT( 10X, 2( I3, ',' ),
$ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' )
END
*
@@ -2040,7 +2040,7 @@
IF( REWI )
$ REWIND NTRA
CALL CCSYR2K( IORDER, UPLO, TRANS, N, K,
- $ ALPHA, AA, LDA, BB, LDB, BETA,
+ $ ALPHA, AA, LDA, BB, LDB, BETA,
$ CC, LDC )
END IF
*
@@ -2240,22 +2240,22 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
@@ -2275,22 +2275,22 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
diff --git a/ctest/c_d2chke.c b/ctest/c_d2chke.c
index 23de9a4..7cdd041 100644
--- a/ctest/c_d2chke.c
+++ b/ctest/c_d2chke.c
@@ -26,9 +26,9 @@ void chkxer(void) {
void F77_d2chke(char *rout) {
char *sf = ( rout ) ;
- double A[2] = {0.0,0.0},
- X[2] = {0.0,0.0},
- Y[2] = {0.0,0.0},
+ double A[2] = {0.0,0.0},
+ X[2] = {0.0,0.0},
+ Y[2] = {0.0,0.0},
ALPHA=0.0, BETA=0.0;
extern int cblas_info, cblas_lerr, cblas_ok;
extern int RowMajorStrg;
@@ -46,588 +46,588 @@ void F77_d2chke(char *rout) {
if (strncmp( sf,"cblas_dgemv",11)==0) {
cblas_rout = "cblas_dgemv";
cblas_info = 1;
- cblas_dgemv(INVALID, CblasNoTrans, 0, 0,
+ cblas_dgemv(INVALID, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dgemv(CblasColMajor, INVALID, 0, 0,
+ cblas_dgemv(CblasColMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dgemv(CblasColMajor, CblasNoTrans, INVALID, 0,
+ cblas_dgemv(CblasColMajor, CblasNoTrans, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dgemv(CblasColMajor, CblasNoTrans, 0, INVALID,
+ cblas_dgemv(CblasColMajor, CblasNoTrans, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 0,
+ cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0,
+ cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = FALSE;
- cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0,
+ cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE;
- cblas_dgemv(CblasRowMajor, INVALID, 0, 0,
+ cblas_dgemv(CblasRowMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dgemv(CblasRowMajor, CblasNoTrans, INVALID, 0,
+ cblas_dgemv(CblasRowMajor, CblasNoTrans, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, INVALID,
+ cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 2,
+ cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0,
+ cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = TRUE;
- cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0,
+ cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dgbmv",11)==0) {
cblas_rout = "cblas_dgbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0,
+ cblas_dgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dgbmv(CblasColMajor, INVALID, 0, 0, 0, 0,
+ cblas_dgbmv(CblasColMajor, INVALID, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ cblas_dgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0,
+ cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0,
+ cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_dgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID,
+ cblas_dgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0,
+ cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
- cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
- cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0,
+ cblas_dgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ cblas_dgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0,
+ cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0,
+ cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_dgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID,
+ cblas_dgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0,
+ cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
- cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
- cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dsymv",11)==0) {
cblas_rout = "cblas_dsymv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dsymv(INVALID, CblasUpper, 0,
+ cblas_dsymv(INVALID, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dsymv(CblasColMajor, INVALID, 0,
+ cblas_dsymv(CblasColMajor, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dsymv(CblasColMajor, CblasUpper, INVALID,
+ cblas_dsymv(CblasColMajor, CblasUpper, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_dsymv(CblasColMajor, CblasUpper, 2,
+ cblas_dsymv(CblasColMajor, CblasUpper, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_dsymv(CblasColMajor, CblasUpper, 0,
+ cblas_dsymv(CblasColMajor, CblasUpper, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
- cblas_dsymv(CblasColMajor, CblasUpper, 0,
+ cblas_dsymv(CblasColMajor, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dsymv(CblasRowMajor, INVALID, 0,
+ cblas_dsymv(CblasRowMajor, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dsymv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_dsymv(CblasRowMajor, CblasUpper, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_dsymv(CblasRowMajor, CblasUpper, 2,
+ cblas_dsymv(CblasRowMajor, CblasUpper, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_dsymv(CblasRowMajor, CblasUpper, 0,
+ cblas_dsymv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
- cblas_dsymv(CblasRowMajor, CblasUpper, 0,
+ cblas_dsymv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dsbmv",11)==0) {
cblas_rout = "cblas_dsbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dsbmv(INVALID, CblasUpper, 0, 0,
+ cblas_dsbmv(INVALID, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dsbmv(CblasColMajor, INVALID, 0, 0,
+ cblas_dsbmv(CblasColMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dsbmv(CblasColMajor, CblasUpper, INVALID, 0,
+ cblas_dsbmv(CblasColMajor, CblasUpper, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dsbmv(CblasColMajor, CblasUpper, 0, INVALID,
+ cblas_dsbmv(CblasColMajor, CblasUpper, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_dsbmv(CblasColMajor, CblasUpper, 0, 1,
+ cblas_dsbmv(CblasColMajor, CblasUpper, 0, 1,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0,
+ cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = FALSE;
- cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0,
+ cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dsbmv(CblasRowMajor, INVALID, 0, 0,
+ cblas_dsbmv(CblasRowMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dsbmv(CblasRowMajor, CblasUpper, INVALID, 0,
+ cblas_dsbmv(CblasRowMajor, CblasUpper, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dsbmv(CblasRowMajor, CblasUpper, 0, INVALID,
+ cblas_dsbmv(CblasRowMajor, CblasUpper, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 1,
+ cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 1,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0,
+ cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = TRUE;
- cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0,
+ cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dspmv",11)==0) {
cblas_rout = "cblas_dspmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dspmv(INVALID, CblasUpper, 0,
+ cblas_dspmv(INVALID, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dspmv(CblasColMajor, INVALID, 0,
+ cblas_dspmv(CblasColMajor, INVALID, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dspmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_dspmv(CblasColMajor, CblasUpper, INVALID,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_dspmv(CblasColMajor, CblasUpper, 0,
+ cblas_dspmv(CblasColMajor, CblasUpper, 0,
ALPHA, A, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_dspmv(CblasColMajor, CblasUpper, 0,
+ cblas_dspmv(CblasColMajor, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dspmv(CblasRowMajor, INVALID, 0,
+ cblas_dspmv(CblasRowMajor, INVALID, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dspmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_dspmv(CblasRowMajor, CblasUpper, INVALID,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_dspmv(CblasRowMajor, CblasUpper, 0,
+ cblas_dspmv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_dspmv(CblasRowMajor, CblasUpper, 0,
+ cblas_dspmv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dtrmv",11)==0) {
cblas_rout = "cblas_dtrmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dtrmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dtrmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_dtrmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dtrmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_dtrmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dtrmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_dtrmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dtrmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_dtrmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dtbmv",11)==0) {
cblas_rout = "cblas_dtbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dtbmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dtbmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_dtbmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dtbmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_dtbmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dtbmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_dtbmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dtbmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_dtbmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dtpmv",11)==0) {
cblas_rout = "cblas_dtpmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dtpmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_dtpmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dtpmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_dtpmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dtpmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_dtpmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dtpmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_dtpmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dtpmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_dtpmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dtrsv",11)==0) {
cblas_rout = "cblas_dtrsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dtrsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dtrsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_dtrsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dtrsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_dtrsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dtrsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_dtrsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dtrsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_dtrsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dtbsv",11)==0) {
cblas_rout = "cblas_dtbsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dtbsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dtbsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_dtbsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dtbsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_dtbsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dtbsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_dtbsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dtbsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_dtbsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dtpsv",11)==0) {
cblas_rout = "cblas_dtpsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_dtpsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_dtpsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_dtpsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_dtpsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_dtpsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_dtpsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_dtpsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_dtpsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_dtpsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_dtpsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_dger",10)==0) {
@@ -781,7 +781,7 @@ void F77_d2chke(char *rout) {
cblas_info = 6; RowMajorStrg = FALSE;
cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A );
chkxer();
- }
+ }
if (cblas_ok == TRUE)
printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout);
else
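[Editor's note, not part of the patch: the c_d2chke.c hunks above only strip trailing whitespace; the test pattern they touch is unchanged. Each case sets cblas_info to the argument position that should be reported, calls the routine with exactly one invalid argument, and chkxer() verifies the recorded position before the final PASSED/FAILED summary. The following stand-alone C sketch reproduces that pattern with a toy routine and stub names (toy_trmv, xerbla_stub); it is illustrative only and does not use the real ctest sources or link against OpenBLAS.]

#include <stdio.h>

static int cblas_info;          /* argument position the test expects   */
static int reported_info;       /* what the (stub) error handler saw    */
static int cblas_ok = 1;

static void xerbla_stub(int info) { reported_info = info; }

/* toy stand-in for a BLAS routine: flags bad uplo as arg 1, n < 0 as arg 2 */
static void toy_trmv(int uplo, int n) {
    if (uplo != 0 && uplo != 1) { xerbla_stub(1); return; }
    if (n < 0)                  { xerbla_stub(2); return; }
}

static void chkxer(void) {
    if (reported_info != cblas_info) {
        printf("expected arg %d to be flagged, got %d\n",
               cblas_info, reported_info);
        cblas_ok = 0;
    }
}

int main(void) {
    cblas_info = 2; toy_trmv(0, -1); chkxer();   /* invalid n    */
    cblas_info = 1; toy_trmv(7,  0); chkxer();   /* invalid uplo */
    puts(cblas_ok ? "PASSED THE TESTS OF ERROR-EXITS" : "FAILED");
    return !cblas_ok;
}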
diff --git a/ctest/c_d3chke.c b/ctest/c_d3chke.c
index 1149475..700cff2 100644
--- a/ctest/c_d3chke.c
+++ b/ctest/c_d3chke.c
@@ -26,9 +26,9 @@ void chkxer(void) {
void F77_d3chke(char *rout) {
char *sf = ( rout ) ;
- double A[2] = {0.0,0.0},
- B[2] = {0.0,0.0},
- C[2] = {0.0,0.0},
+ double A[2] = {0.0,0.0},
+ B[2] = {0.0,0.0},
+ C[2] = {0.0,0.0},
ALPHA=0.0, BETA=0.0;
extern int cblas_info, cblas_lerr, cblas_ok;
extern int RowMajorStrg;
@@ -39,7 +39,7 @@ void F77_d3chke(char *rout) {
cblas_xerbla(cblas_info,cblas_rout,"");
F77_xerbla(cblas_rout,&cblas_info);
}
-
+
cblas_ok = TRUE ;
cblas_lerr = PASSED ;
@@ -47,15 +47,15 @@ void F77_d3chke(char *rout) {
cblas_rout = "cblas_dgemm" ;
cblas_info = 1;
- cblas_dgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
+ cblas_dgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
- cblas_dgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
+ cblas_dgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
- cblas_dgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
+ cblas_dgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
diff --git a/ctest/c_dblas1.c b/ctest/c_dblas1.c
index 2371d33..764a75c 100644
--- a/ctest/c_dblas1.c
+++ b/ctest/c_dblas1.c
@@ -21,7 +21,7 @@ void F77_daxpy(const int *N, const double *alpha, const double *X,
return;
}
-void F77_dcopy(const int *N, double *X, const int *incX,
+void F77_dcopy(const int *N, double *X, const int *incX,
double *Y, const int *incY)
{
cblas_dcopy(*N, X, *incX, Y, *incY);
diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c
index ed68402..423a587 100644
--- a/ctest/c_dblas2.c
+++ b/ctest/c_dblas2.c
@@ -8,8 +8,8 @@
#include "common.h"
#include "cblas_test.h"
-void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha,
- double *a, int *lda, double *x, int *incx, double *beta,
+void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha,
+ double *a, int *lda, double *x, int *incx, double *beta,
double *y, int *incy ) {
double *A;
@@ -23,7 +23,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha,
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
- cblas_dgemv( CblasRowMajor, trans,
+ cblas_dgemv( CblasRowMajor, trans,
*m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy );
free(A);
}
@@ -68,9 +68,9 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
@@ -88,7 +88,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn,
}
}
-void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn,
+void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn,
int *n, double *a, int *lda, double *x, int *incx ) {
double *A;
int i,j,LDA;
@@ -112,7 +112,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn,
else
cblas_dtrsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx );
}
-void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a,
+void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a,
int *lda, double *x, int *incx, double *beta, double *y,
int *incy) {
double *A;
@@ -136,7 +136,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a,
*beta, y, *incy );
}
-void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x,
+void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x,
int *incx, double *a, int *lda) {
double *A;
int i,j,LDA;
@@ -160,7 +160,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x,
cblas_dsyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda);
}
-void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x,
+void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x,
int *incx, double *y, int *incy, double *a, int *lda) {
double *A;
int i,j,LDA;
@@ -185,7 +185,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x,
}
void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
- double *alpha, double *a, int *lda, double *x, int *incx,
+ double *alpha, double *a, int *lda, double *x, int *incx,
double *beta, double *y, int *incy ) {
double *A;
@@ -213,7 +213,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
for( j=jcol; j<(*n+*kl); j++ )
A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ];
}
- cblas_dgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha,
+ cblas_dgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha,
A, LDA, x, *incx, *beta, y, *incy );
free(A);
}
@@ -230,9 +230,9 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
@@ -276,9 +276,9 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
@@ -315,7 +315,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn,
}
void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha,
- double *a, int *lda, double *x, int *incx, double *beta,
+ double *a, int *lda, double *x, int *incx, double *beta,
double *y, int *incy) {
double *A;
int i,j,irow,jcol,LDA;
@@ -387,13 +387,13 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap,
for( j=0; j<i+1; j++, k++ )
AP[ k ]=A[ LDA*i+j ];
}
- cblas_dspmv( CblasRowMajor, uplo, *n, *alpha, AP, x, *incx, *beta, y,
+ cblas_dspmv( CblasRowMajor, uplo, *n, *alpha, AP, x, *incx, *beta, y,
*incy );
free(A);
free(AP);
}
else
- cblas_dspmv( CblasColMajor, uplo, *n, *alpha, ap, x, *incx, *beta, y,
+ cblas_dspmv( CblasColMajor, uplo, *n, *alpha, ap, x, *incx, *beta, y,
*incy );
}
@@ -405,9 +405,9 @@ void F77_dtpmv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *n;
@@ -445,9 +445,9 @@ void F77_dtpsv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *n;
@@ -478,7 +478,7 @@ void F77_dtpsv(int *order, char *uplow, char *transp, char *diagn,
cblas_dtpsv( CblasColMajor, uplo, trans, diag, *n, ap, x, *incx );
}
-void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x,
+void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x,
int *incx, double *ap ){
double *A, *AP;
int i,j,k,LDA;
@@ -530,7 +530,7 @@ void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x,
cblas_dspr( CblasColMajor, uplo, *n, *alpha, x, *incx, ap );
}
-void F77_dspr2(int *order, char *uplow, int *n, double *alpha, double *x,
+void F77_dspr2(int *order, char *uplow, int *n, double *alpha, double *x,
int *incx, double *y, int *incy, double *ap ){
double *A, *AP;
int i,j,k,LDA;
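[Editor's note, not part of the patch: the c_dblas2.c wrappers above repack the Fortran column-major input into a freshly allocated row-major buffer (A[LDA*i+j] = a[(*lda)*j+i]) before calling the CblasRowMajor entry point. This minimal sketch shows that repacking step in isolation for a toy 2x3 matrix; LDA is set to n here for simplicity, whereas the real wrappers deliberately over-allocate (e.g. *n+1) to exercise leading-dimension handling.]

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    int m = 2, n = 3, lda = 2;           /* column-major input, lda >= m */
    double a[] = {1, 4,   2, 5,   3, 6}; /* columns of [[1 2 3],[4 5 6]] */

    int LDA = n;                          /* row-major copy               */
    double *A = malloc((size_t)m * LDA * sizeof *A);
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            A[LDA*i + j] = a[lda*j + i];  /* same indexing as the wrappers */

    for (int i = 0; i < m; i++, puts(""))
        for (int j = 0; j < n; j++)
            printf("%4.1f ", A[LDA*i + j]);
    free(A);
    return 0;
}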
diff --git a/ctest/c_dblas3.c b/ctest/c_dblas3.c
index 4558e41..85d7913 100644
--- a/ctest/c_dblas3.c
+++ b/ctest/c_dblas3.c
@@ -12,7 +12,7 @@
#define TEST_ROW_MJR 1
#define UNDEFINED -1
-void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n,
+void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n,
int *k, double *alpha, double *a, int *lda, double *b, int *ldb,
double *beta, double *c, int *ldc ) {
@@ -111,7 +111,7 @@ void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
- cblas_dsymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB,
+ cblas_dsymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB,
*beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
@@ -121,15 +121,15 @@ void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_dsymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb,
+ cblas_dsymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb,
*beta, c, *ldc );
else
- cblas_dsymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb,
+ cblas_dsymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb,
*beta, c, *ldc );
}
void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k,
- double *alpha, double *a, int *lda,
+ double *alpha, double *a, int *lda,
double *beta, double *c, int *ldc ) {
int i,j,LDA,LDC;
@@ -160,7 +160,7 @@ void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k,
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
C[i*LDC+j]=c[j*(*ldc)+i];
- cblas_dsyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta,
+ cblas_dsyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta,
C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ )
@@ -169,10 +169,10 @@ void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_dsyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
+ cblas_dsyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
c, *ldc );
else
- cblas_dsyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
+ cblas_dsyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
c, *ldc );
}
@@ -215,7 +215,7 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
C[i*LDC+j]=c[j*(*ldc)+i];
- cblas_dsyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA,
+ cblas_dsyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA,
B, LDB, *beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ )
@@ -225,14 +225,14 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_dsyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda,
+ cblas_dsyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda,
b, *ldb, *beta, c, *ldc );
else
- cblas_dsyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda,
+ cblas_dsyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda,
b, *ldb, *beta, c, *ldc );
}
void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
- int *m, int *n, double *alpha, double *a, int *lda, double *b,
+ int *m, int *n, double *alpha, double *a, int *lda, double *b,
int *ldb) {
int i,j,LDA,LDB;
double *A, *B;
@@ -266,7 +266,7 @@ void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
- cblas_dtrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_dtrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha,
A, LDA, B, LDB );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
@@ -275,10 +275,10 @@ void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
free(B);
}
else if (*order == TEST_COL_MJR)
- cblas_dtrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_dtrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha,
a, *lda, b, *ldb);
else
- cblas_dtrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_dtrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha,
a, *lda, b, *ldb);
}
@@ -317,7 +317,7 @@ void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
- cblas_dtrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_dtrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha,
A, LDA, B, LDB );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
@@ -326,9 +326,9 @@ void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
free(B);
}
else if (*order == TEST_COL_MJR)
- cblas_dtrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_dtrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha,
a, *lda, b, *ldb);
else
- cblas_dtrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_dtrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha,
a, *lda, b, *ldb);
}
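[Editor's note, not part of the patch: the level-3 wrappers in c_dblas3.c above dispatch on the order flag -- repack and call the CblasRowMajor entry for TEST_ROW_MJR, forward the Fortran arrays unchanged for TEST_COL_MJR, and pass UNDEFINED on purpose to exercise the order-argument check. The column-major pass-through is just an ordinary CBLAS call; this usage sketch shows it for cblas_dgemm with 2x2 data. It assumes an OpenBLAS installation providing <cblas.h>; build with something like `gcc gemm_demo.c -lopenblas`.]

#include <stdio.h>
#include <cblas.h>

int main(void) {
    /* column-major 2x2 matrices: A = [[1 2],[3 4]], B = identity */
    double A[] = {1, 3, 2, 4};
    double B[] = {1, 0, 0, 1};
    double C[] = {0, 0, 0, 0};

    /* C <- 1.0 * A * B + 0.0 * C, all arrays column-major with ld = 2 */
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);

    printf("C = [[%g %g],[%g %g]]\n", C[0], C[2], C[1], C[3]);
    return 0;
}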
diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f
index 63e1ed8..0aeba45 100644
--- a/ctest/c_dblat1.f
+++ b/ctest/c_dblat1.f
@@ -506,7 +506,7 @@
80 CONTINUE
MWPS(1) = 0.0
DO 100 I = 2, 6
- MWPS(I) = 1.0
+ MWPS(I) = 1.0
100 CONTINUE
DO 120 I = 7, 11
MWPS(I) = -1.0
@@ -589,7 +589,7 @@
MWPSTX(K) = MWPTX(I,K)
MWPSTY(K) = MWPTY(I,K)
180 CONTINUE
- CALL DROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I))
+ CALL DROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I))
CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC)
CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC)
200 CONTINUE
diff --git a/ctest/c_dblat2.f b/ctest/c_dblat2.f
index 357816b..27ceda6 100644
--- a/ctest/c_dblat2.f
+++ b/ctest/c_dblat2.f
@@ -572,7 +572,7 @@
CTRANS = ' CblasNoTrans'
ELSE IF (TRANS.EQ.'T')THEN
CTRANS = ' CblasTrans'
- ELSE
+ ELSE
CTRANS = 'CblasConjTrans'
END IF
TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
@@ -921,7 +921,7 @@
UPLO = ICH( IC: IC )
IF (UPLO.EQ.'U')THEN
CUPLO = ' CblasUpper'
- ELSE
+ ELSE
CUPLO = ' CblasLower'
END IF
*
@@ -1215,7 +1215,7 @@
LOGICAL LDE, LDERES
EXTERNAL LDE, LDERES
* .. External Subroutines ..
- EXTERNAL DMAKE, DMVCH, CDTBMV, CDTBSV, CDTPMV,
+ EXTERNAL DMAKE, DMVCH, CDTBMV, CDTBSV, CDTPMV,
$ CDTPSV, CDTRMV, CDTRSV
* .. Intrinsic Functions ..
INTRINSIC ABS, MAX
@@ -1283,7 +1283,7 @@
UPLO = ICHU( ICU: ICU )
IF (UPLO.EQ.'U')THEN
CUPLO = ' CblasUpper'
- ELSE
+ ELSE
CUPLO = ' CblasLower'
END IF
*
@@ -1293,7 +1293,7 @@
CTRANS = ' CblasNoTrans'
ELSE IF (TRANS.EQ.'T')THEN
CTRANS = ' CblasTrans'
- ELSE
+ ELSE
CTRANS = 'CblasConjTrans'
END IF
*
@@ -1972,7 +1972,7 @@
$ ALPHA, INCX, LDA
IF( REWI )
$ REWIND NTRA
- CALL CDSYR( IORDER, UPLO, N, ALPHA, XX, INCX,
+ CALL CDSYR( IORDER, UPLO, N, ALPHA, XX, INCX,
$ AA, LDA )
ELSE IF( PACKED )THEN
IF( TRACE )
@@ -2737,7 +2737,7 @@
WRITE( NOUT, FMT = 9998 )I, YT( I ),
$ YY( 1 + ( I - 1 )*ABS( INCY ) )
ELSE
- WRITE( NOUT, FMT = 9998 )I,
+ WRITE( NOUT, FMT = 9998 )I,
$ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I)
END IF
60 CONTINUE
diff --git a/ctest/c_dblat3.f b/ctest/c_dblat3.f
index fb9acbb..24befdc 100644
--- a/ctest/c_dblat3.f
+++ b/ctest/c_dblat3.f
@@ -675,7 +675,7 @@
*
120 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
- CALL DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
+ CALL DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
$ M, N, K, ALPHA, LDA, LDB, BETA, LDC)
*
130 CONTINUE
@@ -710,24 +710,24 @@
CHARACTER*1 TRANSA, TRANSB
CHARACTER*12 SNAME
CHARACTER*14 CRC, CTA,CTB
-
+
IF (TRANSA.EQ.'N')THEN
CTA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CTA = ' CblasTrans'
- ELSE
+ ELSE
CTA = 'CblasConjTrans'
END IF
IF (TRANSB.EQ.'N')THEN
CTB = ' CblasNoTrans'
ELSE IF (TRANSB.EQ.'T')THEN
CTB = ' CblasTrans'
- ELSE
+ ELSE
CTB = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB
@@ -891,9 +891,9 @@
* Call the subroutine.
*
IF( TRACE )
- $ CALL DPRCN2(NTRA, NC, SNAME, IORDER,
- $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
- $ BETA, LDC)
+ $ CALL DPRCN2(NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
+ $ BETA, LDC)
IF( REWI )
$ REWIND NTRA
CALL CDSYMM( IORDER, SIDE, UPLO, M, N, ALPHA,
@@ -989,7 +989,7 @@
110 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA,
- $ LDB, BETA, LDC)
+ $ LDB, BETA, LDC)
*
120 CONTINUE
RETURN
@@ -1024,20 +1024,20 @@
CHARACTER*1 SIDE, UPLO
CHARACTER*12 SNAME
CHARACTER*14 CRC, CS,CU
-
+
IF (SIDE.EQ.'L')THEN
CS = ' CblasLeft'
- ELSE
+ ELSE
CS = ' CblasRight'
END IF
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
@@ -1210,7 +1210,7 @@
IF( REWI )
$ REWIND NTRA
CALL CDTRSM( IORDER, SIDE, UPLO, TRANSA,
- $ DIAG, M, N, ALPHA, AA, LDA,
+ $ DIAG, M, N, ALPHA, AA, LDA,
$ BB, LDB )
END IF
*
@@ -1370,22 +1370,22 @@
CHARACTER*1 SIDE, UPLO, TRANSA, DIAG
CHARACTER*12 SNAME
CHARACTER*14 CRC, CS, CU, CA, CD
-
+
IF (SIDE.EQ.'L')THEN
CS = ' CblasLeft'
- ELSE
+ ELSE
CS = ' CblasRight'
END IF
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (DIAG.EQ.'N')THEN
@@ -1395,14 +1395,14 @@
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB
9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',')
- 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ),
+ 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ),
$ F4.1, ', A,', I3, ', B,', I3, ').' )
END
*
@@ -1696,36 +1696,36 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC
9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') )
- 9994 FORMAT( 20X, 2( I3, ',' ),
+ 9994 FORMAT( 20X, 2( I3, ',' ),
$ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' )
END
*
SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
$ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
- $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W,
- $ IORDER )
+ $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W,
+ $ IORDER )
*
* Tests DSYR2K.
*
@@ -2053,29 +2053,29 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC
9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') )
- 9994 FORMAT( 20X, 2( I3, ',' ),
+ 9994 FORMAT( 20X, 2( I3, ',' ),
$ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' )
END
*
diff --git a/ctest/c_s2chke.c b/ctest/c_s2chke.c
index b0a48a6..1455153 100644
--- a/ctest/c_s2chke.c
+++ b/ctest/c_s2chke.c
@@ -26,9 +26,9 @@ void chkxer(void) {
void F77_s2chke(char *rout) {
char *sf = ( rout ) ;
- float A[2] = {0.0,0.0},
- X[2] = {0.0,0.0},
- Y[2] = {0.0,0.0},
+ float A[2] = {0.0,0.0},
+ X[2] = {0.0,0.0},
+ Y[2] = {0.0,0.0},
ALPHA=0.0, BETA=0.0;
extern int cblas_info, cblas_lerr, cblas_ok;
extern int RowMajorStrg;
@@ -46,588 +46,588 @@ void F77_s2chke(char *rout) {
if (strncmp( sf,"cblas_sgemv",11)==0) {
cblas_rout = "cblas_sgemv";
cblas_info = 1;
- cblas_sgemv(INVALID, CblasNoTrans, 0, 0,
+ cblas_sgemv(INVALID, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_sgemv(CblasColMajor, INVALID, 0, 0,
+ cblas_sgemv(CblasColMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_sgemv(CblasColMajor, CblasNoTrans, INVALID, 0,
+ cblas_sgemv(CblasColMajor, CblasNoTrans, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_sgemv(CblasColMajor, CblasNoTrans, 0, INVALID,
+ cblas_sgemv(CblasColMajor, CblasNoTrans, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_sgemv(CblasColMajor, CblasNoTrans, 2, 0,
+ cblas_sgemv(CblasColMajor, CblasNoTrans, 2, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0,
+ cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = FALSE;
- cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0,
+ cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE;
- cblas_sgemv(CblasRowMajor, INVALID, 0, 0,
+ cblas_sgemv(CblasRowMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_sgemv(CblasRowMajor, CblasNoTrans, INVALID, 0,
+ cblas_sgemv(CblasRowMajor, CblasNoTrans, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, INVALID,
+ cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 2,
+ cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0,
+ cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = TRUE;
- cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0,
+ cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_sgbmv",11)==0) {
cblas_rout = "cblas_sgbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_sgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0,
+ cblas_sgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_sgbmv(CblasColMajor, INVALID, 0, 0, 0, 0,
+ cblas_sgbmv(CblasColMajor, INVALID, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_sgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ cblas_sgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0,
+ cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0,
+ cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_sgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID,
+ cblas_sgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0,
+ cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
- cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
- cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_sgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0,
+ cblas_sgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_sgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ cblas_sgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0,
+ cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0,
+ cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_sgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID,
+ cblas_sgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0,
+ cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
- cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
- cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ssymv",11)==0) {
cblas_rout = "cblas_ssymv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ssymv(INVALID, CblasUpper, 0,
+ cblas_ssymv(INVALID, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ssymv(CblasColMajor, INVALID, 0,
+ cblas_ssymv(CblasColMajor, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ssymv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ssymv(CblasColMajor, CblasUpper, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_ssymv(CblasColMajor, CblasUpper, 2,
+ cblas_ssymv(CblasColMajor, CblasUpper, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ssymv(CblasColMajor, CblasUpper, 0,
+ cblas_ssymv(CblasColMajor, CblasUpper, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
- cblas_ssymv(CblasColMajor, CblasUpper, 0,
+ cblas_ssymv(CblasColMajor, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ssymv(CblasRowMajor, INVALID, 0,
+ cblas_ssymv(CblasRowMajor, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ssymv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ssymv(CblasRowMajor, CblasUpper, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_ssymv(CblasRowMajor, CblasUpper, 2,
+ cblas_ssymv(CblasRowMajor, CblasUpper, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ssymv(CblasRowMajor, CblasUpper, 0,
+ cblas_ssymv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
- cblas_ssymv(CblasRowMajor, CblasUpper, 0,
+ cblas_ssymv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ssbmv",11)==0) {
cblas_rout = "cblas_ssbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ssbmv(INVALID, CblasUpper, 0, 0,
+ cblas_ssbmv(INVALID, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ssbmv(CblasColMajor, INVALID, 0, 0,
+ cblas_ssbmv(CblasColMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ssbmv(CblasColMajor, CblasUpper, INVALID, 0,
+ cblas_ssbmv(CblasColMajor, CblasUpper, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ssbmv(CblasColMajor, CblasUpper, 0, INVALID,
+ cblas_ssbmv(CblasColMajor, CblasUpper, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_ssbmv(CblasColMajor, CblasUpper, 0, 1,
+ cblas_ssbmv(CblasColMajor, CblasUpper, 0, 1,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0,
+ cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = FALSE;
- cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0,
+ cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ssbmv(CblasRowMajor, INVALID, 0, 0,
+ cblas_ssbmv(CblasRowMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ssbmv(CblasRowMajor, CblasUpper, INVALID, 0,
+ cblas_ssbmv(CblasRowMajor, CblasUpper, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ssbmv(CblasRowMajor, CblasUpper, 0, INVALID,
+ cblas_ssbmv(CblasRowMajor, CblasUpper, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 1,
+ cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 1,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0,
+ cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = TRUE;
- cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0,
+ cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_sspmv",11)==0) {
cblas_rout = "cblas_sspmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_sspmv(INVALID, CblasUpper, 0,
+ cblas_sspmv(INVALID, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_sspmv(CblasColMajor, INVALID, 0,
+ cblas_sspmv(CblasColMajor, INVALID, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_sspmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_sspmv(CblasColMajor, CblasUpper, INVALID,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_sspmv(CblasColMajor, CblasUpper, 0,
+ cblas_sspmv(CblasColMajor, CblasUpper, 0,
ALPHA, A, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_sspmv(CblasColMajor, CblasUpper, 0,
+ cblas_sspmv(CblasColMajor, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_sspmv(CblasRowMajor, INVALID, 0,
+ cblas_sspmv(CblasRowMajor, INVALID, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_sspmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_sspmv(CblasRowMajor, CblasUpper, INVALID,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_sspmv(CblasRowMajor, CblasUpper, 0,
+ cblas_sspmv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_sspmv(CblasRowMajor, CblasUpper, 0,
+ cblas_sspmv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_strmv",11)==0) {
cblas_rout = "cblas_strmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_strmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_strmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_strmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_strmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_strmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_strmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_strmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_strmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_strmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_strmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_stbmv",11)==0) {
cblas_rout = "cblas_stbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_stbmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_stbmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_stbmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_stbmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_stbmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_stbmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_stbmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_stbmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_stbmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_stbmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_stpmv",11)==0) {
cblas_rout = "cblas_stpmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_stpmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_stpmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_stpmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_stpmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_stpmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_stpmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_stpmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_stpmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_stpmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_stpmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_strsv",11)==0) {
cblas_rout = "cblas_strsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_strsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_strsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_strsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_strsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_strsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_strsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_strsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_strsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_strsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_strsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_stbsv",11)==0) {
cblas_rout = "cblas_stbsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_stbsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_stbsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_stbsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_stbsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_stbsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_stbsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_stbsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_stbsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_stbsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_stbsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_stpsv",11)==0) {
cblas_rout = "cblas_stpsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_stpsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_stpsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_stpsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_stpsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_stpsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_stpsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_stpsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_stpsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_stpsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_stpsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_sger",10)==0) {
@@ -781,7 +781,7 @@ void F77_s2chke(char *rout) {
cblas_info = 6; RowMajorStrg = FALSE;
cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A );
chkxer();
- }
+ }
if (cblas_ok == TRUE)
printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout);
else
diff --git a/ctest/c_s3chke.c b/ctest/c_s3chke.c
index 7c832c1..632eaae 100644
--- a/ctest/c_s3chke.c
+++ b/ctest/c_s3chke.c
@@ -26,9 +26,9 @@ void chkxer(void) {
void F77_s3chke(char *rout) {
char *sf = ( rout ) ;
- float A[2] = {0.0,0.0},
- B[2] = {0.0,0.0},
- C[2] = {0.0,0.0},
+ float A[2] = {0.0,0.0},
+ B[2] = {0.0,0.0},
+ C[2] = {0.0,0.0},
ALPHA=0.0, BETA=0.0;
extern int cblas_info, cblas_lerr, cblas_ok;
extern int RowMajorStrg;
@@ -39,22 +39,22 @@ void F77_s3chke(char *rout) {
cblas_xerbla(cblas_info,cblas_rout,"");
F77_xerbla(cblas_rout,&cblas_info);
}
-
+
cblas_ok = TRUE ;
cblas_lerr = PASSED ;
if (strncmp( sf,"cblas_sgemm" ,11)==0) {
cblas_rout = "cblas_sgemm" ;
cblas_info = 1;
- cblas_sgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
+ cblas_sgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
- cblas_sgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
+ cblas_sgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
- cblas_sgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
+ cblas_sgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
diff --git a/ctest/c_sblas1.c b/ctest/c_sblas1.c
index 5ccb2d3..f1b18b6 100644
--- a/ctest/c_sblas1.c
+++ b/ctest/c_sblas1.c
@@ -31,14 +31,14 @@ float F77_scnrm2(blasint *N, const float *X, blasint *incX)
return cblas_scnrm2(*N, X, *incX);
}
-void F77_scopy(blasint *N, const float *X, blasint *incX,
+void F77_scopy(blasint *N, const float *X, blasint *incX,
float *Y, blasint *incY)
{
cblas_scopy(*N, X, *incX, Y, *incY);
return;
}
-float F77_sdot(blasint *N, const float *X, blasint *incX,
+float F77_sdot(blasint *N, const float *X, blasint *incX,
const float *Y, blasint *incY)
{
return cblas_sdot(*N, X, *incX, Y, *incY);
diff --git a/ctest/c_sblas2.c b/ctest/c_sblas2.c
index 3059525..6cbc074 100644
--- a/ctest/c_sblas2.c
+++ b/ctest/c_sblas2.c
@@ -8,8 +8,8 @@
#include "common.h"
#include "cblas_test.h"
-void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha,
- float *a, int *lda, float *x, int *incx, float *beta,
+void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha,
+ float *a, int *lda, float *x, int *incx, float *beta,
float *y, int *incy ) {
float *A;
@@ -23,7 +23,7 @@ void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha,
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
- cblas_sgemv( CblasRowMajor, trans,
+ cblas_sgemv( CblasRowMajor, trans,
*m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy );
free(A);
}
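The F77_sgemv wrapper above shows the harness's general trick for exercising row-major paths from the Fortran drivers: the column-major array handed in by Fortran is repacked into a row-major buffer, and only then is cblas_sgemv called with CblasRowMajor. A minimal standalone sketch of the same repacking idea (sizes and values are invented for illustration and are not taken from the test suite):

    #include <stdio.h>
    #include <stdlib.h>
    #include <cblas.h>

    int main(void) {
        const int m = 2, n = 3, lda = 2;
        /* column-major 2x3 matrix whose rows are [1 2 3] and [4 5 6] */
        float a[6] = {1, 4, 2, 5, 3, 6};
        float x[3] = {1, 1, 1}, y[2] = {0, 0};
        float *A = malloc((size_t)(m * n) * sizeof(float));
        int i, j;

        /* repack: row-major A[i][j] = column-major a(i,j) */
        for (i = 0; i < m; i++)
            for (j = 0; j < n; j++)
                A[n * i + j] = a[lda * j + i];

        cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0f,
                    A, n, x, 1, 0.0f, y, 1);
        printf("y = [%g %g]\n", y[0], y[1]);   /* expect [6 15] */
        free(A);
        return 0;
    }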
@@ -68,9 +68,9 @@ void F77_strmv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
@@ -88,7 +88,7 @@ void F77_strmv(int *order, char *uplow, char *transp, char *diagn,
}
}
-void F77_strsv(int *order, char *uplow, char *transp, char *diagn,
+void F77_strsv(int *order, char *uplow, char *transp, char *diagn,
int *n, float *a, int *lda, float *x, int *incx ) {
float *A;
int i,j,LDA;
@@ -112,7 +112,7 @@ void F77_strsv(int *order, char *uplow, char *transp, char *diagn,
else
cblas_strsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx );
}
-void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a,
+void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a,
int *lda, float *x, int *incx, float *beta, float *y,
int *incy) {
float *A;
@@ -136,7 +136,7 @@ void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a,
*beta, y, *incy );
}
-void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x,
+void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x,
int *incx, float *a, int *lda) {
float *A;
int i,j,LDA;
@@ -160,7 +160,7 @@ void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x,
cblas_ssyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda);
}
-void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x,
+void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x,
int *incx, float *y, int *incy, float *a, int *lda) {
float *A;
int i,j,LDA;
@@ -185,7 +185,7 @@ void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x,
}
void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
- float *alpha, float *a, int *lda, float *x, int *incx,
+ float *alpha, float *a, int *lda, float *x, int *incx,
float *beta, float *y, int *incy ) {
float *A;
@@ -213,7 +213,7 @@ void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
for( j=jcol; j<(*n+*kl); j++ )
A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ];
}
- cblas_sgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha,
+ cblas_sgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha,
A, LDA, x, *incx, *beta, y, *incy );
free(A);
}
@@ -230,9 +230,9 @@ void F77_stbmv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
@@ -276,9 +276,9 @@ void F77_stbsv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
@@ -315,7 +315,7 @@ void F77_stbsv(int *order, char *uplow, char *transp, char *diagn,
}
void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha,
- float *a, int *lda, float *x, int *incx, float *beta,
+ float *a, int *lda, float *x, int *incx, float *beta,
float *y, int *incy) {
float *A;
int i,j,irow,jcol,LDA;
@@ -387,12 +387,12 @@ void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap,
for( j=0; j<i+1; j++, k++ )
AP[ k ]=A[ LDA*i+j ];
}
- cblas_sspmv( CblasRowMajor, uplo, *n, *alpha, AP, x, *incx, *beta, y,
+ cblas_sspmv( CblasRowMajor, uplo, *n, *alpha, AP, x, *incx, *beta, y,
*incy );
free(A); free(AP);
}
else
- cblas_sspmv( CblasColMajor, uplo, *n, *alpha, ap, x, *incx, *beta, y,
+ cblas_sspmv( CblasColMajor, uplo, *n, *alpha, ap, x, *incx, *beta, y,
*incy );
}
@@ -404,9 +404,9 @@ void F77_stpmv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *n;
@@ -443,9 +443,9 @@ void F77_stpsv(int *order, char *uplow, char *transp, char *diagn,
enum CBLAS_UPLO uplo;
enum CBLAS_DIAG diag;
- get_transpose_type(transp,&trans);
- get_uplo_type(uplow,&uplo);
- get_diag_type(diagn,&diag);
+ get_transpose_type(transp,&trans);
+ get_uplo_type(uplow,&uplo);
+ get_diag_type(diagn,&diag);
if (*order == TEST_ROW_MJR) {
LDA = *n;
@@ -475,7 +475,7 @@ void F77_stpsv(int *order, char *uplow, char *transp, char *diagn,
cblas_stpsv( CblasColMajor, uplo, trans, diag, *n, ap, x, *incx );
}
-void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x,
+void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x,
int *incx, float *ap ){
float *A, *AP;
int i,j,k,LDA;
@@ -526,7 +526,7 @@ void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x,
cblas_sspr( CblasColMajor, uplo, *n, *alpha, x, *incx, ap );
}
-void F77_sspr2(int *order, char *uplow, int *n, float *alpha, float *x,
+void F77_sspr2(int *order, char *uplow, int *n, float *alpha, float *x,
int *incx, float *y, int *incy, float *ap ){
float *A, *AP;
int i,j,k,LDA;
diff --git a/ctest/c_sblas3.c b/ctest/c_sblas3.c
index 03c6895..e3977d0 100644
--- a/ctest/c_sblas3.c
+++ b/ctest/c_sblas3.c
@@ -9,7 +9,7 @@
#include "common.h"
#include "cblas_test.h"
-void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n,
+void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n,
int *k, float *alpha, float *a, int *lda, float *b, int *ldb,
float *beta, float *c, int *ldc ) {
@@ -107,7 +107,7 @@ void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n,
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
- cblas_ssymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB,
+ cblas_ssymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB,
*beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
@@ -117,15 +117,15 @@ void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_ssymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb,
+ cblas_ssymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb,
*beta, c, *ldc );
else
- cblas_ssymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb,
+ cblas_ssymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb,
*beta, c, *ldc );
}
void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k,
- float *alpha, float *a, int *lda,
+ float *alpha, float *a, int *lda,
float *beta, float *c, int *ldc ) {
int i,j,LDA,LDC;
@@ -156,7 +156,7 @@ void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k,
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
C[i*LDC+j]=c[j*(*ldc)+i];
- cblas_ssyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta,
+ cblas_ssyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta,
C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ )
@@ -165,10 +165,10 @@ void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_ssyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
+ cblas_ssyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
c, *ldc );
else
- cblas_ssyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
+ cblas_ssyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
c, *ldc );
}
@@ -211,7 +211,7 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
C[i*LDC+j]=c[j*(*ldc)+i];
- cblas_ssyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA,
+ cblas_ssyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA,
B, LDB, *beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ )
@@ -221,14 +221,14 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_ssyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda,
+ cblas_ssyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda,
b, *ldb, *beta, c, *ldc );
else
- cblas_ssyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda,
+ cblas_ssyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda,
b, *ldb, *beta, c, *ldc );
}
void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
- int *m, int *n, float *alpha, float *a, int *lda, float *b,
+ int *m, int *n, float *alpha, float *a, int *lda, float *b,
int *ldb) {
int i,j,LDA,LDB;
float *A, *B;
@@ -262,7 +262,7 @@ void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
- cblas_strmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_strmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha,
A, LDA, B, LDB );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
@@ -271,10 +271,10 @@ void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
free(B);
}
else if (*order == TEST_COL_MJR)
- cblas_strmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_strmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha,
a, *lda, b, *ldb);
else
- cblas_strmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_strmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha,
a, *lda, b, *ldb);
}
@@ -313,7 +313,7 @@ void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
- cblas_strsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_strsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha,
A, LDA, B, LDB );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
@@ -322,9 +322,9 @@ void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
free(B);
}
else if (*order == TEST_COL_MJR)
- cblas_strsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_strsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha,
a, *lda, b, *ldb);
else
- cblas_strsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha,
+ cblas_strsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha,
a, *lda, b, *ldb);
}
diff --git a/ctest/c_sblat2.f b/ctest/c_sblat2.f
index bf6f3e4..8bd23c3 100644
--- a/ctest/c_sblat2.f
+++ b/ctest/c_sblat2.f
@@ -572,7 +572,7 @@
CTRANS = ' CblasNoTrans'
ELSE IF (TRANS.EQ.'T')THEN
CTRANS = ' CblasTrans'
- ELSE
+ ELSE
CTRANS = 'CblasConjTrans'
END IF
TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
@@ -921,7 +921,7 @@
UPLO = ICH( IC: IC )
IF (UPLO.EQ.'U')THEN
CUPLO = ' CblasUpper'
- ELSE
+ ELSE
CUPLO = ' CblasLower'
END IF
*
@@ -1215,7 +1215,7 @@
LOGICAL LSE, LSERES
EXTERNAL LSE, LSERES
* .. External Subroutines ..
- EXTERNAL SMAKE, SMVCH, CSTBMV, CSTBSV, CSTPMV,
+ EXTERNAL SMAKE, SMVCH, CSTBMV, CSTBSV, CSTPMV,
$ CSTPSV, CSTRMV, CSTRSV
* .. Intrinsic Functions ..
INTRINSIC ABS, MAX
@@ -1283,7 +1283,7 @@
UPLO = ICHU( ICU: ICU )
IF (UPLO.EQ.'U')THEN
CUPLO = ' CblasUpper'
- ELSE
+ ELSE
CUPLO = ' CblasLower'
END IF
*
@@ -1293,7 +1293,7 @@
CTRANS = ' CblasNoTrans'
ELSE IF (TRANS.EQ.'T')THEN
CTRANS = ' CblasTrans'
- ELSE
+ ELSE
CTRANS = 'CblasConjTrans'
END IF
*
@@ -1972,7 +1972,7 @@
$ ALPHA, INCX, LDA
IF( REWI )
$ REWIND NTRA
- CALL CSSYR( IORDER, UPLO, N, ALPHA, XX, INCX,
+ CALL CSSYR( IORDER, UPLO, N, ALPHA, XX, INCX,
$ AA, LDA )
ELSE IF( PACKED )THEN
IF( TRACE )
@@ -2737,7 +2737,7 @@
WRITE( NOUT, FMT = 9998 )I, YT( I ),
$ YY( 1 + ( I - 1 )*ABS( INCY ) )
ELSE
- WRITE( NOUT, FMT = 9998 )I,
+ WRITE( NOUT, FMT = 9998 )I,
$ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I)
END IF
60 CONTINUE
diff --git a/ctest/c_sblat3.f b/ctest/c_sblat3.f
index 948fd6e..606f83a 100644
--- a/ctest/c_sblat3.f
+++ b/ctest/c_sblat3.f
@@ -587,7 +587,7 @@
IF( REWI )
$ REWIND NTRA
CALL CSGEMM( IORDER, TRANSA, TRANSB, M, N,
- $ K, ALPHA, AA, LDA, BB, LDB,
+ $ K, ALPHA, AA, LDA, BB, LDB,
$ BETA, CC, LDC )
*
* Check if error-exit was taken incorrectly.
@@ -675,7 +675,7 @@
*
120 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
- CALL SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
+ CALL SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
$ M, N, K, ALPHA, LDA, LDB, BETA, LDC)
*
130 CONTINUE
@@ -713,24 +713,24 @@
CHARACTER*1 TRANSA, TRANSB
CHARACTER*12 SNAME
CHARACTER*14 CRC, CTA,CTB
-
+
IF (TRANSA.EQ.'N')THEN
CTA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CTA = ' CblasTrans'
- ELSE
+ ELSE
CTA = 'CblasConjTrans'
END IF
IF (TRANSB.EQ.'N')THEN
CTB = ' CblasNoTrans'
ELSE IF (TRANSB.EQ.'T')THEN
CTB = ' CblasTrans'
- ELSE
+ ELSE
CTB = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB
@@ -743,7 +743,7 @@
*
SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
$ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
- $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
$ IORDER )
*
* Tests SSYMM.
@@ -895,9 +895,9 @@
* Call the subroutine.
*
IF( TRACE )
- $ CALL SPRCN2(NTRA, NC, SNAME, IORDER,
- $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
- $ BETA, LDC)
+ $ CALL SPRCN2(NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
+ $ BETA, LDC)
IF( REWI )
$ REWIND NTRA
CALL CSSYMM( IORDER, SIDE, UPLO, M, N, ALPHA,
@@ -993,7 +993,7 @@
110 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA,
- $ LDB, BETA, LDC)
+ $ LDB, BETA, LDC)
*
120 CONTINUE
RETURN
@@ -1028,20 +1028,20 @@
CHARACTER*1 SIDE, UPLO
CHARACTER*12 SNAME
CHARACTER*14 CRC, CS,CU
-
+
IF (SIDE.EQ.'L')THEN
CS = ' CblasLeft'
- ELSE
+ ELSE
CS = ' CblasRight'
END IF
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
@@ -1351,9 +1351,9 @@
10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
$ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
$ 'RATIO ', F8.2, ' - SUSPECT *******' )
-10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
$ ' (', I6, ' CALL', 'S)' )
-10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
$ ' (', I6, ' CALL', 'S)' )
9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
$ 'ANGED INCORRECTLY *******' )
@@ -1374,22 +1374,22 @@
CHARACTER*1 SIDE, UPLO, TRANSA, DIAG
CHARACTER*12 SNAME
CHARACTER*14 CRC, CS, CU, CA, CD
-
+
IF (SIDE.EQ.'L')THEN
CS = ' CblasLeft'
- ELSE
+ ELSE
CS = ' CblasRight'
END IF
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (DIAG.EQ.'N')THEN
@@ -1399,14 +1399,14 @@
END IF
IF (IORDER.EQ.1)THEN
CRC = 'CblasRowMajor'
- ELSE
+ ELSE
CRC = 'CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB
9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',')
- 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ),
+ 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ),
$ F4.1, ', A,', I3, ', B,', I3, ').' )
END
*
@@ -1701,29 +1701,29 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC
9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') )
- 9994 FORMAT( 20X, 2( I3, ',' ),
+ 9994 FORMAT( 20X, 2( I3, ',' ),
$ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' )
END
*
@@ -2057,29 +2057,29 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC
9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') )
- 9994 FORMAT( 20X, 2( I3, ',' ),
+ 9994 FORMAT( 20X, 2( I3, ',' ),
$ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' )
END
*
diff --git a/ctest/c_xerbla.c b/ctest/c_xerbla.c
index 3402460..dd23a49 100644
--- a/ctest/c_xerbla.c
+++ b/ctest/c_xerbla.c
@@ -11,10 +11,10 @@ void cblas_xerbla(blasint info, char *rout, char *form, ...)
extern int link_xerbla;
extern int RowMajorStrg;
extern char *cblas_rout;
-
- /* Initially, c__3chke will call this routine with
- * global variable link_xerbla=1, and F77_xerbla will set link_xerbla=0.
- * This is done to fool the linker into loading these subroutines first
+
+ /* Initially, c__3chke will call this routine with
+ * global variable link_xerbla=1, and F77_xerbla will set link_xerbla=0.
+ * This is done to fool the linker into loading these subroutines first
* instead of ones in the CBLAS or the legacy BLAS library.
*/
if (link_xerbla) return;
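The comment above spells out the linker trick: the very first call into cblas_xerbla is made with link_xerbla set and does nothing, purely so that the test object's own xerbla routines are resolved ahead of the ones in the CBLAS or reference BLAS library. Roughly, the recording stub has the following shape (a hedged sketch, not the imported file; the real harness uses its TRUE/FALSE/PASSED macros and a varargs format argument):

    #include <string.h>

    int  cblas_ok = 1, cblas_lerr = 1, cblas_info = 0, link_xerbla = 1;
    char *cblas_rout = NULL;

    void cblas_xerbla_sketch(int info, char *rout, char *form, ...)
    {
        if (link_xerbla) return;      /* priming call: only resolve the symbol */
        if (cblas_rout != NULL && strcmp(cblas_rout, rout) != 0)
            cblas_ok = 0;             /* wrong routine reported the error      */
        if (info != cblas_info)
            cblas_ok = 0;             /* wrong argument number reported        */
        cblas_lerr = 0;               /* record that an error exit was taken   */
        (void)form;
    }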
@@ -26,11 +26,11 @@ void cblas_xerbla(blasint info, char *rout, char *form, ...)
if (RowMajorStrg)
{
- /* To properly check leading dimension problems in cblas__gemm, we
- * need to do the following trick. When cblas__gemm is called with
- * CblasRowMajor, the arguments A and B switch places in the call to
- * f77__gemm. Thus when we test for bad leading dimension problems
- * for A and B, lda is in position 11 instead of 9, and ldb is in
+ /* To properly check leading dimension problems in cblas__gemm, we
+ * need to do the following trick. When cblas__gemm is called with
+ * CblasRowMajor, the arguments A and B switch places in the call to
+ * f77__gemm. Thus when we test for bad leading dimension problems
+ * for A and B, lda is in position 11 instead of 9, and ldb is in
* position 9 instead of 11.
*/
if (strstr(rout,"gemm") != 0)
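The block above documents the one genuinely tricky case: with CblasRowMajor, cblas_?gemm swaps A and B before calling the column-major kernel, so a bad lda surfaces as argument 11 instead of 9, and a bad ldb as 9 instead of 11. In spirit, the adjustment the stub applies looks like this (illustrative sketch only; the mapping in the imported file may cover further positions):

    #include <string.h>

    /* Map a reported argument position between the row-major call and the
     * underlying column-major ?gemm call, in which A and B trade places. */
    static int map_gemm_ld_position(int info, const char *rout, int row_major)
    {
        if (row_major && strstr(rout, "gemm") != NULL) {
            if (info == 9)  return 11;
            if (info == 11) return 9;
        }
        return info;
    }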
@@ -117,7 +117,7 @@ void F77_xerbla(char *srname, void *vinfo)
}
for(i=0; i < 6; i++) rout[i+6] = tolower(srname[i]);
for(i=11; i >= 9; i--) if (rout[i] == ' ') rout[i] = '\0';
-
+
/* We increment *info by 1 since the CBLAS interface adds one more
* argument to all level 2 and 3 routines.
*/
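The last hunk of this file touches the comment explaining why F77_xerbla bumps *info by one: every CBLAS level-2/3 routine takes the extra leading order argument, so each Fortran argument position shifts up by one. Concretely, LDA is argument 6 of Fortran SGEMV but argument 7 of cblas_sgemv, which is why the error-exit checks expect cblas_info = 7 for a bad lda. A one-line sketch of that shift (illustrative, not the imported code):

    /* The position reported by the Fortran BLAS is shifted by the extra
     * 'order' argument before being compared with cblas_info.           */
    static int f77_to_cblas_position(int f77_info)
    {
        return f77_info + 1;   /* e.g. LDA: argument 6 in SGEMV, 7 in cblas_sgemv */
    }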
diff --git a/ctest/c_z2chke.c b/ctest/c_z2chke.c
index ac60971..8767b52 100644
--- a/ctest/c_z2chke.c
+++ b/ctest/c_z2chke.c
@@ -26,11 +26,11 @@ void chkxer(void) {
void F77_z2chke(char *rout) {
char *sf = ( rout ) ;
- double A[2] = {0.0,0.0},
- X[2] = {0.0,0.0},
- Y[2] = {0.0,0.0},
+ double A[2] = {0.0,0.0},
+ X[2] = {0.0,0.0},
+ Y[2] = {0.0,0.0},
ALPHA[2] = {0.0,0.0},
- BETA[2] = {0.0,0.0},
+ BETA[2] = {0.0,0.0},
RALPHA = 0.0;
extern int cblas_info, cblas_lerr, cblas_ok;
extern int RowMajorStrg;
@@ -48,588 +48,588 @@ void F77_z2chke(char *rout) {
if (strncmp( sf,"cblas_zgemv",11)==0) {
cblas_rout = "cblas_zgemv";
cblas_info = 1;
- cblas_zgemv(INVALID, CblasNoTrans, 0, 0,
+ cblas_zgemv(INVALID, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_zgemv(CblasColMajor, INVALID, 0, 0,
+ cblas_zgemv(CblasColMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_zgemv(CblasColMajor, CblasNoTrans, INVALID, 0,
+ cblas_zgemv(CblasColMajor, CblasNoTrans, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_zgemv(CblasColMajor, CblasNoTrans, 0, INVALID,
+ cblas_zgemv(CblasColMajor, CblasNoTrans, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0,
+ cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0,
+ cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = FALSE;
- cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0,
+ cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE;
- cblas_zgemv(CblasRowMajor, INVALID, 0, 0,
+ cblas_zgemv(CblasRowMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_zgemv(CblasRowMajor, CblasNoTrans, INVALID, 0,
+ cblas_zgemv(CblasRowMajor, CblasNoTrans, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID,
+ cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 2,
+ cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0,
+ cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = TRUE;
- cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0,
+ cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_zgbmv",11)==0) {
cblas_rout = "cblas_zgbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_zgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0,
+ cblas_zgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_zgbmv(CblasColMajor, INVALID, 0, 0, 0, 0,
+ cblas_zgbmv(CblasColMajor, INVALID, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_zgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ cblas_zgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0,
+ cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0,
+ cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_zgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID,
+ cblas_zgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0,
+ cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
- cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
- cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_zgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0,
+ cblas_zgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_zgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ cblas_zgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0,
+ cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0,
+ cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_zgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID,
+ cblas_zgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0,
+ cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
- cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
- cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
+ cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_zhemv",11)==0) {
cblas_rout = "cblas_zhemv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_zhemv(INVALID, CblasUpper, 0,
+ cblas_zhemv(INVALID, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_zhemv(CblasColMajor, INVALID, 0,
+ cblas_zhemv(CblasColMajor, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_zhemv(CblasColMajor, CblasUpper, INVALID,
+ cblas_zhemv(CblasColMajor, CblasUpper, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_zhemv(CblasColMajor, CblasUpper, 2,
+ cblas_zhemv(CblasColMajor, CblasUpper, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_zhemv(CblasColMajor, CblasUpper, 0,
+ cblas_zhemv(CblasColMajor, CblasUpper, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
- cblas_zhemv(CblasColMajor, CblasUpper, 0,
+ cblas_zhemv(CblasColMajor, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_zhemv(CblasRowMajor, INVALID, 0,
+ cblas_zhemv(CblasRowMajor, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_zhemv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_zhemv(CblasRowMajor, CblasUpper, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_zhemv(CblasRowMajor, CblasUpper, 2,
+ cblas_zhemv(CblasRowMajor, CblasUpper, 2,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_zhemv(CblasRowMajor, CblasUpper, 0,
+ cblas_zhemv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
- cblas_zhemv(CblasRowMajor, CblasUpper, 0,
+ cblas_zhemv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_zhbmv",11)==0) {
cblas_rout = "cblas_zhbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_zhbmv(INVALID, CblasUpper, 0, 0,
+ cblas_zhbmv(INVALID, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_zhbmv(CblasColMajor, INVALID, 0, 0,
+ cblas_zhbmv(CblasColMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_zhbmv(CblasColMajor, CblasUpper, INVALID, 0,
+ cblas_zhbmv(CblasColMajor, CblasUpper, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_zhbmv(CblasColMajor, CblasUpper, 0, INVALID,
+ cblas_zhbmv(CblasColMajor, CblasUpper, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_zhbmv(CblasColMajor, CblasUpper, 0, 1,
+ cblas_zhbmv(CblasColMajor, CblasUpper, 0, 1,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0,
+ cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = FALSE;
- cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0,
+ cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_zhbmv(CblasRowMajor, INVALID, 0, 0,
+ cblas_zhbmv(CblasRowMajor, INVALID, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_zhbmv(CblasRowMajor, CblasUpper, INVALID, 0,
+ cblas_zhbmv(CblasRowMajor, CblasUpper, INVALID, 0,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_zhbmv(CblasRowMajor, CblasUpper, 0, INVALID,
+ cblas_zhbmv(CblasRowMajor, CblasUpper, 0, INVALID,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 1,
+ cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 1,
ALPHA, A, 1, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0,
+ cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 12; RowMajorStrg = TRUE;
- cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0,
+ cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0,
ALPHA, A, 1, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_zhpmv",11)==0) {
cblas_rout = "cblas_zhpmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_zhpmv(INVALID, CblasUpper, 0,
+ cblas_zhpmv(INVALID, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_zhpmv(CblasColMajor, INVALID, 0,
+ cblas_zhpmv(CblasColMajor, INVALID, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_zhpmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_zhpmv(CblasColMajor, CblasUpper, INVALID,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_zhpmv(CblasColMajor, CblasUpper, 0,
+ cblas_zhpmv(CblasColMajor, CblasUpper, 0,
ALPHA, A, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_zhpmv(CblasColMajor, CblasUpper, 0,
+ cblas_zhpmv(CblasColMajor, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_zhpmv(CblasRowMajor, INVALID, 0,
+ cblas_zhpmv(CblasRowMajor, INVALID, 0,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_zhpmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_zhpmv(CblasRowMajor, CblasUpper, INVALID,
ALPHA, A, X, 1, BETA, Y, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_zhpmv(CblasRowMajor, CblasUpper, 0,
+ cblas_zhpmv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, X, 0, BETA, Y, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_zhpmv(CblasRowMajor, CblasUpper, 0,
+ cblas_zhpmv(CblasRowMajor, CblasUpper, 0,
ALPHA, A, X, 1, BETA, Y, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ztrmv",11)==0) {
cblas_rout = "cblas_ztrmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ztrmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ztrmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ztrmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ztrmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ztrmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ztrmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ztrmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ztrmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ztrmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ztbmv",11)==0) {
cblas_rout = "cblas_ztbmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ztbmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ztbmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ztbmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ztbmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ztbmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ztbmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ztbmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ztbmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ztbmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ztpmv",11)==0) {
cblas_rout = "cblas_ztpmv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ztpmv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ztpmv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ztpmv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ztpmv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ztpmv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ztpmv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ztpmv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ztpmv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ztpmv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ztpmv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ztrsv",11)==0) {
cblas_rout = "cblas_ztrsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ztrsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ztrsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ztrsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ztrsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ztrsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = FALSE;
- cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
- cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ztrsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ztrsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ztrsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ztrsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 7; RowMajorStrg = TRUE;
- cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 2, A, 1, X, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
- cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ztbsv",11)==0) {
cblas_rout = "cblas_ztbsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ztbsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ztbsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ztbsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ztbsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ztbsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
- cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = FALSE;
- cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ztbsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ztbsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ztbsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ztbsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, 0, A, 1, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, 0, A, 1, X, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
- cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, INVALID, A, 1, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 1, A, 1, X, 1 );
chkxer();
cblas_info = 10; RowMajorStrg = TRUE;
- cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, 0, A, 1, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_ztpsv",11)==0) {
cblas_rout = "cblas_ztpsv";
cblas_info = 1; RowMajorStrg = FALSE;
- cblas_ztpsv(INVALID, CblasUpper, CblasNoTrans,
+ cblas_ztpsv(INVALID, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
- cblas_ztpsv(CblasColMajor, INVALID, CblasNoTrans,
+ cblas_ztpsv(CblasColMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
- cblas_ztpsv(CblasColMajor, CblasUpper, INVALID,
+ cblas_ztpsv(CblasColMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
- cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
- cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = FALSE;
- cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
cblas_info = 2; RowMajorStrg = TRUE;
- cblas_ztpsv(CblasRowMajor, INVALID, CblasNoTrans,
+ cblas_ztpsv(CblasRowMajor, INVALID, CblasNoTrans,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = TRUE;
- cblas_ztpsv(CblasRowMajor, CblasUpper, INVALID,
+ cblas_ztpsv(CblasRowMajor, CblasUpper, INVALID,
CblasNonUnit, 0, A, X, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
- cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
INVALID, 0, A, X, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
- cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, INVALID, A, X, 1 );
chkxer();
cblas_info = 8; RowMajorStrg = TRUE;
- cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
+ cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans,
CblasNonUnit, 0, A, X, 0 );
chkxer();
} else if (strncmp( sf,"cblas_zgeru",10)==0) {
@@ -818,7 +818,7 @@ void F77_z2chke(char *rout) {
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A );
chkxer();
- }
+ }
if (cblas_ok == TRUE)
printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout);
else
diff --git a/ctest/c_z3chke.c b/ctest/c_z3chke.c
index b58cb62..df25135 100644
--- a/ctest/c_z3chke.c
+++ b/ctest/c_z3chke.c
@@ -30,7 +30,7 @@ void F77_z3chke(char * rout) {
B[4] = {0.0,0.0,0.0,0.0},
C[4] = {0.0,0.0,0.0,0.0},
ALPHA[2] = {0.0,0.0},
- BETA[2] = {0.0,0.0},
+ BETA[2] = {0.0,0.0},
RALPHA = 0.0, RBETA = 0.0;
extern int cblas_info, cblas_lerr, cblas_ok;
extern int RowMajorStrg;
@@ -49,15 +49,15 @@ void F77_z3chke(char * rout) {
cblas_rout = "cblas_zgemm" ;
cblas_info = 1;
- cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
+ cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
- cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
+ cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
- cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
+ cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
@@ -272,7 +272,7 @@ void F77_z3chke(char * rout) {
cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
-
+
} else if (strncmp( sf,"cblas_zhemm" ,11)==0) {
cblas_rout = "cblas_zhemm" ;
@@ -1696,7 +1696,7 @@ void F77_z3chke(char * rout) {
cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
-
+
}
if (cblas_ok == 1 )
diff --git a/ctest/c_zblas1.c b/ctest/c_zblas1.c
index 0a36f33..160ef4b 100644
--- a/ctest/c_zblas1.c
+++ b/ctest/c_zblas1.c
@@ -16,21 +16,21 @@ void F77_zaxpy(const int *N, const void *alpha, void *X,
return;
}
-void F77_zcopy(const int *N, void *X, const int *incX,
+void F77_zcopy(const int *N, void *X, const int *incX,
void *Y, const int *incY)
{
cblas_zcopy(*N, X, *incX, Y, *incY);
return;
}
-void F77_zdotc(const int *N, const void *X, const int *incX,
+void F77_zdotc(const int *N, const void *X, const int *incX,
const void *Y, const int *incY,void *dotc)
{
cblas_zdotc_sub(*N, X, *incX, Y, *incY, dotc);
return;
}
-void F77_zdotu(const int *N, void *X, const int *incX,
+void F77_zdotu(const int *N, void *X, const int *incX,
void *Y, const int *incY,void *dotu)
{
cblas_zdotu_sub(*N, X, *incX, Y, *incY, dotu);
diff --git a/ctest/c_zblas2.c b/ctest/c_zblas2.c
index 6291abe..ab1bd79 100644
--- a/ctest/c_zblas2.c
+++ b/ctest/c_zblas2.c
@@ -8,9 +8,9 @@
#include "common.h"
#include "cblas_test.h"
-void F77_zgemv(int *order, char *transp, int *m, int *n,
+void F77_zgemv(int *order, char *transp, int *m, int *n,
const void *alpha,
- CBLAS_TEST_ZOMPLEX *a, int *lda, const void *x, int *incx,
+ CBLAS_TEST_ZOMPLEX *a, int *lda, const void *x, int *incx,
const void *beta, void *y, int *incy) {
CBLAS_TEST_ZOMPLEX *A;
@@ -38,9 +38,9 @@ void F77_zgemv(int *order, char *transp, int *m, int *n,
*m, *n, alpha, a, *lda, x, *incx, beta, y, *incy );
}
-void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
- CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
- CBLAS_TEST_ZOMPLEX *x, int *incx,
+void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
+ CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
+ CBLAS_TEST_ZOMPLEX *x, int *incx,
CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy) {
CBLAS_TEST_ZOMPLEX *A;
@@ -85,8 +85,8 @@ void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
*incx, beta, y, *incy );
}
-void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
- CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy,
+void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
+ CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy,
CBLAS_TEST_ZOMPLEX *a, int *lda){
CBLAS_TEST_ZOMPLEX *A;
@@ -114,8 +114,8 @@ void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
cblas_zgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda );
}
-void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
- CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy,
+void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
+ CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy,
CBLAS_TEST_ZOMPLEX *a, int *lda) {
CBLAS_TEST_ZOMPLEX *A;
int i,j,LDA;
@@ -165,7 +165,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
free(A);
}
else if (*order == TEST_COL_MJR)
- cblas_zhemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx,
+ cblas_zhemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx,
beta, y, *incy );
else
cblas_zhemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx,
@@ -173,7 +173,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
}
void F77_zhbmv(int *order, char *uplow, int *n, int *k,
- CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
+ CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta,
CBLAS_TEST_ZOMPLEX *y, int *incy){
@@ -186,7 +186,7 @@ int i,irow,j,jcol,LDA;
if (*order == TEST_ROW_MJR) {
if (uplo != CblasUpper && uplo != CblasLower )
- cblas_zhbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x,
+ cblas_zhbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x,
*incx, beta, y, *incy );
else {
LDA = *k+2;
@@ -237,7 +237,7 @@ int i,irow,j,jcol,LDA;
}
void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
- CBLAS_TEST_ZOMPLEX *ap, CBLAS_TEST_ZOMPLEX *x, int *incx,
+ CBLAS_TEST_ZOMPLEX *ap, CBLAS_TEST_ZOMPLEX *x, int *incx,
CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){
CBLAS_TEST_ZOMPLEX *A, *AP;
@@ -247,7 +247,7 @@ void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
get_uplo_type(uplow,&uplo);
if (*order == TEST_ROW_MJR) {
if (uplo != CblasUpper && uplo != CblasLower )
- cblas_zhpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx,
+ cblas_zhpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx,
beta, y, *incy);
else {
LDA = *n;
@@ -344,7 +344,7 @@ void F77_ztbmv(int *order, char *uplow, char *transp, char *diagn,
}
}
}
- cblas_ztbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x,
+ cblas_ztbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x,
*incx);
free(A);
}
@@ -371,7 +371,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
if (uplo != CblasUpper && uplo != CblasLower )
- cblas_ztbsv(CblasRowMajor, UNDEFINED, trans, diag, *n, *k, a, *lda, x,
+ cblas_ztbsv(CblasRowMajor, UNDEFINED, trans, diag, *n, *k, a, *lda, x,
*incx);
else {
LDA = *k+2;
@@ -408,7 +408,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn,
}
}
}
- cblas_ztbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA,
+ cblas_ztbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA,
x, *incx);
free(A);
}
@@ -674,7 +674,7 @@ void F77_zhpr2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
if (uplo != CblasUpper && uplo != CblasLower )
- cblas_zhpr2( CblasRowMajor, UNDEFINED, *n, alpha, x, *incx, y,
+ cblas_zhpr2( CblasRowMajor, UNDEFINED, *n, alpha, x, *incx, y,
*incy, ap );
else {
LDA = *n;
@@ -752,7 +752,7 @@ void F77_zher(int *order, char *uplow, int *n, double *alpha,
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
- for( i=0; i<*n; i++ )
+ for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag;
@@ -786,7 +786,7 @@ void F77_zher2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
LDA = *n+1;
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
- for( i=0; i<*n; i++ )
+ for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag;
diff --git a/ctest/c_zblas3.c b/ctest/c_zblas3.c
index 7f46365..ad74411 100644
--- a/ctest/c_zblas3.c
+++ b/ctest/c_zblas3.c
@@ -11,9 +11,9 @@
#define TEST_ROW_MJR 1
#define UNDEFINED -1
-void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
+void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
- CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
+ CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
CBLAS_TEST_ZOMPLEX *A, *B, *C;
@@ -133,7 +133,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB,
+ cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB,
beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
@@ -145,10 +145,10 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
+ cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
beta, c, *ldc );
else
- cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
+ cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
beta, c, *ldc );
}
void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
@@ -189,7 +189,7 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
- cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB,
+ cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB,
beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
@@ -199,15 +199,15 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
+ cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
beta, c, *ldc );
else
- cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
+ cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb,
beta, c, *ldc );
}
void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
- double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
+ double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
int i,j,LDA,LDC;
@@ -244,7 +244,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta,
+ cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta,
C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ ) {
@@ -255,15 +255,15 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
+ cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
c, *ldc );
else
- cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
+ cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta,
c, *ldc );
}
void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
- CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
+ CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
int i,j,LDA,LDC;
@@ -300,7 +300,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta,
+ cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta,
C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ ) {
@@ -311,10 +311,10 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta,
+ cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta,
c, *ldc );
else
- cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta,
+ cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta,
c, *ldc );
}
void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
@@ -363,7 +363,7 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA,
+ cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA,
B, LDB, *beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ ) {
@@ -375,10 +375,10 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
+ cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
b, *ldb, *beta, c, *ldc );
else
- cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
+ cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
b, *ldb, *beta, c, *ldc );
}
void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
@@ -427,7 +427,7 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
C[i*LDC+j].real=c[j*(*ldc)+i].real;
C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
}
- cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA,
+ cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA,
B, LDB, beta, C, LDC );
for( j=0; j<*n; j++ )
for( i=0; i<*n; i++ ) {
@@ -439,14 +439,14 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
free(C);
}
else if (*order == TEST_COL_MJR)
- cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
+ cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
else
- cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
+ cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
}
void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
- int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a,
+ int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a,
int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) {
int i,j,LDA,LDB;
CBLAS_TEST_ZOMPLEX *A, *B;
@@ -486,7 +486,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
- cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
A, LDA, B, LDB );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
@@ -497,15 +497,15 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
free(B);
}
else if (*order == TEST_COL_MJR)
- cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
else
- cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
}
void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
- int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a,
+ int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a,
int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) {
int i,j,LDA,LDB;
CBLAS_TEST_ZOMPLEX *A, *B;
@@ -545,7 +545,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
- cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
A, LDA, B, LDB );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
@@ -556,9 +556,9 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
free(B);
}
else if (*order == TEST_COL_MJR)
- cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
else
- cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
+ cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
}
diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f
index 236088f..5a7d83f 100644
--- a/ctest/c_zblat2.f
+++ b/ctest/c_zblat2.f
@@ -69,7 +69,7 @@
INTEGER NSUBS
PARAMETER ( NSUBS = 17 )
COMPLEX*16 ZERO, ONE
- PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
$ ONE = ( 1.0D0, 0.0D0 ) )
DOUBLE PRECISION RZERO, RHALF, RONE
PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
@@ -348,13 +348,13 @@
160 IF (CORDER) THEN
CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
$ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC,
- $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z,
+ $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z,
$ 0 )
END IF
IF (RORDER) THEN
CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
$ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC,
- $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z,
+ $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z,
$ 1 )
END IF
GO TO 200
@@ -474,7 +474,7 @@
*
* .. Parameters ..
COMPLEX*16 ZERO, HALF
- PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
$ HALF = ( 0.5D0, 0.0D0 ) )
DOUBLE PRECISION RZERO
PARAMETER ( RZERO = 0.0D0 )
@@ -582,7 +582,7 @@
CTRANS = ' CblasNoTrans'
ELSE IF (TRANS.EQ.'T')THEN
CTRANS = ' CblasTrans'
- ELSE
+ ELSE
CTRANS = 'CblasConjTrans'
END IF
TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C'
@@ -685,7 +685,7 @@
*
* See what data changed inside subroutines.
*
-* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN
+* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN
ISAME( 1 ) = TRANS.EQ.TRANSS
ISAME( 2 ) = MS.EQ.M
ISAME( 3 ) = NS.EQ.N
@@ -927,7 +927,7 @@
UPLO = ICH( IC: IC )
IF (UPLO.EQ.'U')THEN
CUPLO = ' CblasUpper'
- ELSE
+ ELSE
CUPLO = ' CblasLower'
END IF
*
@@ -1287,7 +1287,7 @@
UPLO = ICHU( ICU: ICU )
IF (UPLO.EQ.'U')THEN
CUPLO = ' CblasUpper'
- ELSE
+ ELSE
CUPLO = ' CblasLower'
END IF
*
@@ -1297,7 +1297,7 @@
CTRANS = ' CblasNoTrans'
ELSE IF (TRANS.EQ.'T')THEN
CTRANS = ' CblasTrans'
- ELSE
+ ELSE
CTRANS = 'CblasConjTrans'
END IF
*
@@ -1569,7 +1569,7 @@
*
* .. Parameters ..
COMPLEX*16 ZERO, HALF, ONE
- PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
$ HALF = ( 0.5D0, 0.0D0 ),
$ ONE = ( 1.0D0, 0.0D0 ) )
DOUBLE PRECISION RZERO
@@ -1847,7 +1847,7 @@
*
* .. Parameters ..
COMPLEX*16 ZERO, HALF, ONE
- PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
$ HALF = ( 0.5D0, 0.0D0 ),
$ ONE = ( 1.0D0, 0.0D0 ) )
DOUBLE PRECISION RZERO
@@ -2141,7 +2141,7 @@
*
* .. Parameters ..
COMPLEX*16 ZERO, HALF, ONE
- PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
$ HALF = ( 0.5D0, 0.0D0 ),
$ ONE = ( 1.0D0, 0.0D0 ) )
DOUBLE PRECISION RZERO
@@ -2762,7 +2762,7 @@
*
* .. Parameters ..
COMPLEX*16 ZERO, ONE
- PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
$ ONE = ( 1.0D0, 0.0D0 ) )
COMPLEX*16 ROGUE
PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) )
diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f
index 6e9dbbd..93b2b77 100644
--- a/ctest/c_zblat3.f
+++ b/ctest/c_zblat3.f
@@ -51,7 +51,7 @@
INTEGER NSUBS
PARAMETER ( NSUBS = 9 )
COMPLEX*16 ZERO, ONE
- PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
$ ONE = ( 1.0D0, 0.0D0 ) )
DOUBLE PRECISION RZERO, RHALF, RONE
PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
@@ -425,7 +425,7 @@
END
SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
$ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
- $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
$ IORDER )
*
* Tests ZGEMM.
@@ -601,7 +601,7 @@
IF( REWI )
$ REWIND NTRA
CALL CZGEMM( IORDER, TRANSA, TRANSB, M, N,
- $ K, ALPHA, AA, LDA, BB, LDB,
+ $ K, ALPHA, AA, LDA, BB, LDB,
$ BETA, CC, LDC )
*
* Check if error-exit was taken incorrectly.
@@ -689,7 +689,7 @@
*
120 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
- CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
+ CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
$ M, N, K, ALPHA, LDA, LDB, BETA, LDC)
*
130 CONTINUE
@@ -725,24 +725,24 @@
CHARACTER*1 TRANSA, TRANSB
CHARACTER*12 SNAME
CHARACTER*14 CRC, CTA,CTB
-
+
IF (TRANSA.EQ.'N')THEN
CTA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CTA = ' CblasTrans'
- ELSE
+ ELSE
CTA = 'CblasConjTrans'
END IF
IF (TRANSB.EQ.'N')THEN
CTB = ' CblasNoTrans'
ELSE IF (TRANSB.EQ.'T')THEN
CTB = ' CblasTrans'
- ELSE
+ ELSE
CTB = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB
@@ -755,7 +755,7 @@
*
SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
$ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
- $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
$ IORDER )
*
* Tests ZHEMM and ZSYMM.
@@ -911,9 +911,9 @@
* Call the subroutine.
*
IF( TRACE )
- $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER,
- $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
- $ BETA, LDC)
+ $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
+ $ BETA, LDC)
IF( REWI )
$ REWIND NTRA
IF( CONJ )THEN
@@ -1016,7 +1016,7 @@
110 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA,
- $ LDB, BETA, LDC)
+ $ LDB, BETA, LDC)
*
120 CONTINUE
RETURN
@@ -1051,20 +1051,20 @@
CHARACTER*1 SIDE, UPLO
CHARACTER*12 SNAME
CHARACTER*14 CRC, CS,CU
-
+
IF (SIDE.EQ.'L')THEN
CS = ' CblasLeft'
- ELSE
+ ELSE
CS = ' CblasRight'
END IF
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
@@ -1402,22 +1402,22 @@
CHARACTER*1 SIDE, UPLO, TRANSA, DIAG
CHARACTER*12 SNAME
CHARACTER*14 CRC, CS, CU, CA, CD
-
+
IF (SIDE.EQ.'L')THEN
CS = ' CblasLeft'
- ELSE
+ ELSE
CS = ' CblasRight'
END IF
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (DIAG.EQ.'N')THEN
@@ -1427,7 +1427,7 @@
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
@@ -1788,22 +1788,22 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
@@ -1822,29 +1822,29 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC
9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') )
- 9994 FORMAT( 10X, 2( I3, ',' ),
+ 9994 FORMAT( 10X, 2( I3, ',' ),
$ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' )
END
*
@@ -2041,7 +2041,7 @@
IF( REWI )
$ REWIND NTRA
CALL CZSYR2K( IORDER, UPLO, TRANS, N, K,
- $ ALPHA, AA, LDA, BB, LDB, BETA,
+ $ ALPHA, AA, LDA, BB, LDB, BETA,
$ CC, LDC )
END IF
*
@@ -2241,22 +2241,22 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
@@ -2276,22 +2276,22 @@
CHARACTER*1 UPLO, TRANSA
CHARACTER*12 SNAME
CHARACTER*14 CRC, CU, CA
-
+
IF (UPLO.EQ.'U')THEN
CU = ' CblasUpper'
- ELSE
+ ELSE
CU = ' CblasLower'
END IF
IF (TRANSA.EQ.'N')THEN
CA = ' CblasNoTrans'
ELSE IF (TRANSA.EQ.'T')THEN
CA = ' CblasTrans'
- ELSE
+ ELSE
CA = 'CblasConjTrans'
END IF
IF (IORDER.EQ.1)THEN
CRC = ' CblasRowMajor'
- ELSE
+ ELSE
CRC = ' CblasColMajor'
END IF
WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
diff --git a/driver/level2/Makefile b/driver/level2/Makefile
index 7043e52..79c4ca1 100644
--- a/driver/level2/Makefile
+++ b/driver/level2/Makefile
@@ -419,3200 +419,3200 @@ endif
all ::
-sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c
+sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c
$(CC) -c -UCOMPLEX -UDOUBLE -UTRANS $(CFLAGS) -o $(@F) $<
-sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c
+sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c
$(CC) -c -UCOMPLEX -UDOUBLE -DTRANS $(CFLAGS) -o $(@F) $<
-dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c
+dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c
$(CC) -c -UCOMPLEX -DDOUBLE -UTRANS $(CFLAGS) -o $(@F) $<
-dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c
+dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c
$(CC) -c -UCOMPLEX -DDOUBLE -DTRANS $(CFLAGS) -o $(@F) $<
-qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c
+qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c
$(CC) -c -UCOMPLEX -DXDOUBLE -UTRANS $(CFLAGS) -o $(@F) $<
-qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) : gbmv_k.c
+qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) : gbmv_k.c
$(CC) -c -UCOMPLEX -DXDOUBLE -DTRANS $(CFLAGS) -o $(@F) $<
-cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c
+cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_t.$(SUFFIX) cgbmv_t.$(PSUFFIX) : zgbmv_k.c
+cgbmv_t.$(SUFFIX) cgbmv_t.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_r.$(SUFFIX) cgbmv_r.$(PSUFFIX) : zgbmv_k.c
+cgbmv_r.$(SUFFIX) cgbmv_r.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_c.$(SUFFIX) cgbmv_c.$(PSUFFIX) : zgbmv_k.c
+cgbmv_c.$(SUFFIX) cgbmv_c.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_o.$(SUFFIX) cgbmv_o.$(PSUFFIX) : zgbmv_k.c
+cgbmv_o.$(SUFFIX) cgbmv_o.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_u.$(SUFFIX) cgbmv_u.$(PSUFFIX) : zgbmv_k.c
+cgbmv_u.$(SUFFIX) cgbmv_u.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c
+cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c
+cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c
+zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_t.$(SUFFIX) zgbmv_t.$(PSUFFIX) : zgbmv_k.c
+zgbmv_t.$(SUFFIX) zgbmv_t.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_r.$(SUFFIX) zgbmv_r.$(PSUFFIX) : zgbmv_k.c
+zgbmv_r.$(SUFFIX) zgbmv_r.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_c.$(SUFFIX) zgbmv_c.$(PSUFFIX) : zgbmv_k.c
+zgbmv_c.$(SUFFIX) zgbmv_c.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_o.$(SUFFIX) zgbmv_o.$(PSUFFIX) : zgbmv_k.c
+zgbmv_o.$(SUFFIX) zgbmv_o.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_u.$(SUFFIX) zgbmv_u.$(PSUFFIX) : zgbmv_k.c
+zgbmv_u.$(SUFFIX) zgbmv_u.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c
+zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c
+zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c
+xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_t.$(SUFFIX) xgbmv_t.$(PSUFFIX) : zgbmv_k.c
+xgbmv_t.$(SUFFIX) xgbmv_t.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_r.$(SUFFIX) xgbmv_r.$(PSUFFIX) : zgbmv_k.c
+xgbmv_r.$(SUFFIX) xgbmv_r.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_c.$(SUFFIX) xgbmv_c.$(PSUFFIX) : zgbmv_k.c
+xgbmv_c.$(SUFFIX) xgbmv_c.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_o.$(SUFFIX) xgbmv_o.$(PSUFFIX) : zgbmv_k.c
+xgbmv_o.$(SUFFIX) xgbmv_o.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_u.$(SUFFIX) xgbmv_u.$(PSUFFIX) : zgbmv_k.c
+xgbmv_u.$(SUFFIX) xgbmv_u.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : zgbmv_k.c
+xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c
+xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c
$(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
+sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -UCOMPLEX -UDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $<
-sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
+sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -UCOMPLEX -UDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $<
-dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
+dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -UCOMPLEX -DDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $<
-dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
+dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -UCOMPLEX -DDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $<
-qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
+qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -UCOMPLEX -DXDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $<
-qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
+qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -UCOMPLEX -DXDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $<
-cgbmv_thread_n.$(SUFFIX) cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
+cgbmv_thread_n.$(SUFFIX) cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_thread_t.$(SUFFIX) cgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
+cgbmv_thread_t.$(SUFFIX) cgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_thread_r.$(SUFFIX) cgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c
+cgbmv_thread_r.$(SUFFIX) cgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_thread_c.$(SUFFIX) cgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c
+cgbmv_thread_c.$(SUFFIX) cgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_thread_o.$(SUFFIX) cgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c
+cgbmv_thread_o.$(SUFFIX) cgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_thread_u.$(SUFFIX) cgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c
+cgbmv_thread_u.$(SUFFIX) cgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c
+cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c
+cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
+zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_thread_t.$(SUFFIX) zgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
+zgbmv_thread_t.$(SUFFIX) zgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_thread_r.$(SUFFIX) zgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c
+zgbmv_thread_r.$(SUFFIX) zgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_thread_c.$(SUFFIX) zgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c
+zgbmv_thread_c.$(SUFFIX) zgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_thread_o.$(SUFFIX) zgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c
+zgbmv_thread_o.$(SUFFIX) zgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_thread_u.$(SUFFIX) zgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c
+zgbmv_thread_u.$(SUFFIX) zgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c
+zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c
+zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
+xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_thread_t.$(SUFFIX) xgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
+xgbmv_thread_t.$(SUFFIX) xgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_thread_r.$(SUFFIX) xgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c
+xgbmv_thread_r.$(SUFFIX) xgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_thread_c.$(SUFFIX) xgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c
+xgbmv_thread_c.$(SUFFIX) xgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_thread_o.$(SUFFIX) xgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c
+xgbmv_thread_o.$(SUFFIX) xgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_thread_u.$(SUFFIX) xgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c
+xgbmv_thread_u.$(SUFFIX) xgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c
+xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c
+xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c
$(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $<
-sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
+sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
-sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
+sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
-dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
+dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
-dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
+dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
-qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
+qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
-qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
+qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
-cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
+cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
-cgemv_thread_t.$(SUFFIX) cgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
+cgemv_thread_t.$(SUFFIX) cgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
-cgemv_thread_r.$(SUFFIX) cgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h
+cgemv_thread_r.$(SUFFIX) cgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F)
-cgemv_thread_c.$(SUFFIX) cgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h
+cgemv_thread_c.$(SUFFIX) cgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F)
-cgemv_thread_o.$(SUFFIX) cgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h
+cgemv_thread_o.$(SUFFIX) cgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F)
-cgemv_thread_u.$(SUFFIX) cgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h
+cgemv_thread_u.$(SUFFIX) cgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F)
-cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h
+cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F)
-cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h
+cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F)
-zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
+zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
-zgemv_thread_t.$(SUFFIX) zgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
+zgemv_thread_t.$(SUFFIX) zgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
-zgemv_thread_r.$(SUFFIX) zgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h
+zgemv_thread_r.$(SUFFIX) zgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F)
-zgemv_thread_c.$(SUFFIX) zgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h
+zgemv_thread_c.$(SUFFIX) zgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F)
-zgemv_thread_o.$(SUFFIX) zgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h
+zgemv_thread_o.$(SUFFIX) zgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F)
-zgemv_thread_u.$(SUFFIX) zgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h
+zgemv_thread_u.$(SUFFIX) zgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F)
-zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h
+zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F)
-zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h
+zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F)
-xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
+xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
-xgemv_thread_t.$(SUFFIX) xgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
+xgemv_thread_t.$(SUFFIX) xgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
-xgemv_thread_r.$(SUFFIX) xgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h
+xgemv_thread_r.$(SUFFIX) xgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F)
-xgemv_thread_c.$(SUFFIX) xgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h
+xgemv_thread_c.$(SUFFIX) xgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F)
-xgemv_thread_o.$(SUFFIX) xgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h
+xgemv_thread_o.$(SUFFIX) xgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F)
-xgemv_thread_u.$(SUFFIX) xgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h
+xgemv_thread_u.$(SUFFIX) xgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F)
-xgemv_thread_s.$(SUFFIX) xgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h
+xgemv_thread_s.$(SUFFIX) xgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F)
-xgemv_thread_d.$(SUFFIX) xgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h
+xgemv_thread_d.$(SUFFIX) xgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F)
-sger_thread.$(SUFFIX) sger_thread.$(PSUFFIX) : ger_thread.c ../../common.h
+sger_thread.$(SUFFIX) sger_thread.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F)
-dger_thread.$(SUFFIX) dger_thread.$(PSUFFIX) : ger_thread.c ../../common.h
+dger_thread.$(SUFFIX) dger_thread.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F)
-qger_thread.$(SUFFIX) qger_thread.$(PSUFFIX) : ger_thread.c ../../common.h
+qger_thread.$(SUFFIX) qger_thread.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F)
-cger_thread_U.$(SUFFIX) cger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h
+cger_thread_U.$(SUFFIX) cger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F)
-cger_thread_C.$(SUFFIX) cger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h
+cger_thread_C.$(SUFFIX) cger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -UXCONJ $< -o $(@F)
-cger_thread_V.$(SUFFIX) cger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h
+cger_thread_V.$(SUFFIX) cger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -DXCONJ $< -o $(@F)
-cger_thread_D.$(SUFFIX) cger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h
+cger_thread_D.$(SUFFIX) cger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -DXCONJ $< -o $(@F)
-zger_thread_U.$(SUFFIX) zger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h
+zger_thread_U.$(SUFFIX) zger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F)
-zger_thread_C.$(SUFFIX) zger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h
+zger_thread_C.$(SUFFIX) zger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -UXCONJ $< -o $(@F)
-zger_thread_V.$(SUFFIX) zger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h
+zger_thread_V.$(SUFFIX) zger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -DXCONJ $< -o $(@F)
-zger_thread_D.$(SUFFIX) zger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h
+zger_thread_D.$(SUFFIX) zger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -DXCONJ $< -o $(@F)
-xger_thread_U.$(SUFFIX) xger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h
+xger_thread_U.$(SUFFIX) xger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F)
-xger_thread_C.$(SUFFIX) xger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h
+xger_thread_C.$(SUFFIX) xger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -UXCONJ $< -o $(@F)
-xger_thread_V.$(SUFFIX) xger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h
+xger_thread_V.$(SUFFIX) xger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -DXCONJ $< -o $(@F)
-xger_thread_D.$(SUFFIX) xger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h
+xger_thread_D.$(SUFFIX) xger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -DXCONJ $< -o $(@F)
-ssymv_thread_U.$(SUFFIX) ssymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+ssymv_thread_U.$(SUFFIX) ssymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-ssymv_thread_L.$(SUFFIX) ssymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+ssymv_thread_L.$(SUFFIX) ssymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dsymv_thread_U.$(SUFFIX) dsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+dsymv_thread_U.$(SUFFIX) dsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dsymv_thread_L.$(SUFFIX) dsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+dsymv_thread_L.$(SUFFIX) dsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qsymv_thread_U.$(SUFFIX) qsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+qsymv_thread_U.$(SUFFIX) qsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qsymv_thread_L.$(SUFFIX) qsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+qsymv_thread_L.$(SUFFIX) qsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-csymv_thread_U.$(SUFFIX) csymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+csymv_thread_U.$(SUFFIX) csymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-csymv_thread_L.$(SUFFIX) csymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+csymv_thread_L.$(SUFFIX) csymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zsymv_thread_U.$(SUFFIX) zsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+zsymv_thread_U.$(SUFFIX) zsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zsymv_thread_L.$(SUFFIX) zsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+zsymv_thread_L.$(SUFFIX) zsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xsymv_thread_U.$(SUFFIX) xsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+xsymv_thread_U.$(SUFFIX) xsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xsymv_thread_L.$(SUFFIX) xsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+xsymv_thread_L.$(SUFFIX) xsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-chemv_thread_U.$(SUFFIX) chemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+chemv_thread_U.$(SUFFIX) chemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F)
-chemv_thread_L.$(SUFFIX) chemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+chemv_thread_L.$(SUFFIX) chemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F)
-chemv_thread_V.$(SUFFIX) chemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h
+chemv_thread_V.$(SUFFIX) chemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-chemv_thread_M.$(SUFFIX) chemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h
+chemv_thread_M.$(SUFFIX) chemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-zhemv_thread_U.$(SUFFIX) zhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+zhemv_thread_U.$(SUFFIX) zhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F)
-zhemv_thread_L.$(SUFFIX) zhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+zhemv_thread_L.$(SUFFIX) zhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F)
-zhemv_thread_V.$(SUFFIX) zhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h
+zhemv_thread_V.$(SUFFIX) zhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-zhemv_thread_M.$(SUFFIX) zhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h
+zhemv_thread_M.$(SUFFIX) zhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-xhemv_thread_U.$(SUFFIX) xhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
+xhemv_thread_U.$(SUFFIX) xhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F)
-xhemv_thread_L.$(SUFFIX) xhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
+xhemv_thread_L.$(SUFFIX) xhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F)
-xhemv_thread_V.$(SUFFIX) xhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h
+xhemv_thread_V.$(SUFFIX) xhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-xhemv_thread_M.$(SUFFIX) xhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h
+xhemv_thread_M.$(SUFFIX) xhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-ssyr_thread_U.$(SUFFIX) ssyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+ssyr_thread_U.$(SUFFIX) ssyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-ssyr_thread_L.$(SUFFIX) ssyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+ssyr_thread_L.$(SUFFIX) ssyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dsyr_thread_U.$(SUFFIX) dsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+dsyr_thread_U.$(SUFFIX) dsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dsyr_thread_L.$(SUFFIX) dsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+dsyr_thread_L.$(SUFFIX) dsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qsyr_thread_U.$(SUFFIX) qsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+qsyr_thread_U.$(SUFFIX) qsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qsyr_thread_L.$(SUFFIX) qsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+qsyr_thread_L.$(SUFFIX) qsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-csyr_thread_U.$(SUFFIX) csyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+csyr_thread_U.$(SUFFIX) csyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-csyr_thread_L.$(SUFFIX) csyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+csyr_thread_L.$(SUFFIX) csyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zsyr_thread_U.$(SUFFIX) zsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+zsyr_thread_U.$(SUFFIX) zsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zsyr_thread_L.$(SUFFIX) zsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+zsyr_thread_L.$(SUFFIX) zsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xsyr_thread_U.$(SUFFIX) xsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+xsyr_thread_U.$(SUFFIX) xsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xsyr_thread_L.$(SUFFIX) xsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+xsyr_thread_L.$(SUFFIX) xsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-cher_thread_U.$(SUFFIX) cher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+cher_thread_U.$(SUFFIX) cher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F)
-cher_thread_L.$(SUFFIX) cher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+cher_thread_L.$(SUFFIX) cher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F)
-cher_thread_V.$(SUFFIX) cher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h
+cher_thread_V.$(SUFFIX) cher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F)
-cher_thread_M.$(SUFFIX) cher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h
+cher_thread_M.$(SUFFIX) cher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F)
-zher_thread_U.$(SUFFIX) zher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+zher_thread_U.$(SUFFIX) zher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F)
-zher_thread_L.$(SUFFIX) zher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+zher_thread_L.$(SUFFIX) zher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F)
-zher_thread_V.$(SUFFIX) zher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h
+zher_thread_V.$(SUFFIX) zher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F)
-zher_thread_M.$(SUFFIX) zher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h
+zher_thread_M.$(SUFFIX) zher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F)
-xher_thread_U.$(SUFFIX) xher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
+xher_thread_U.$(SUFFIX) xher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F)
-xher_thread_L.$(SUFFIX) xher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
+xher_thread_L.$(SUFFIX) xher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F)
-xher_thread_V.$(SUFFIX) xher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h
+xher_thread_V.$(SUFFIX) xher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F)
-xher_thread_M.$(SUFFIX) xher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h
+xher_thread_M.$(SUFFIX) xher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F)
-ssyr2_thread_U.$(SUFFIX) ssyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+ssyr2_thread_U.$(SUFFIX) ssyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-ssyr2_thread_L.$(SUFFIX) ssyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+ssyr2_thread_L.$(SUFFIX) ssyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dsyr2_thread_U.$(SUFFIX) dsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+dsyr2_thread_U.$(SUFFIX) dsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dsyr2_thread_L.$(SUFFIX) dsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+dsyr2_thread_L.$(SUFFIX) dsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qsyr2_thread_U.$(SUFFIX) qsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+qsyr2_thread_U.$(SUFFIX) qsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qsyr2_thread_L.$(SUFFIX) qsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+qsyr2_thread_L.$(SUFFIX) qsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-csyr2_thread_U.$(SUFFIX) csyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+csyr2_thread_U.$(SUFFIX) csyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-csyr2_thread_L.$(SUFFIX) csyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+csyr2_thread_L.$(SUFFIX) csyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zsyr2_thread_U.$(SUFFIX) zsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+zsyr2_thread_U.$(SUFFIX) zsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zsyr2_thread_L.$(SUFFIX) zsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+zsyr2_thread_L.$(SUFFIX) zsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xsyr2_thread_U.$(SUFFIX) xsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+xsyr2_thread_U.$(SUFFIX) xsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xsyr2_thread_L.$(SUFFIX) xsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+xsyr2_thread_L.$(SUFFIX) xsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-cher2_thread_U.$(SUFFIX) cher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+cher2_thread_U.$(SUFFIX) cher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F)
-cher2_thread_L.$(SUFFIX) cher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+cher2_thread_L.$(SUFFIX) cher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F)
-cher2_thread_V.$(SUFFIX) cher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h
+cher2_thread_V.$(SUFFIX) cher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F)
-cher2_thread_M.$(SUFFIX) cher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h
+cher2_thread_M.$(SUFFIX) cher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F)
-zher2_thread_U.$(SUFFIX) zher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+zher2_thread_U.$(SUFFIX) zher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F)
-zher2_thread_L.$(SUFFIX) zher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+zher2_thread_L.$(SUFFIX) zher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F)
-zher2_thread_V.$(SUFFIX) zher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h
+zher2_thread_V.$(SUFFIX) zher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F)
-zher2_thread_M.$(SUFFIX) zher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h
+zher2_thread_M.$(SUFFIX) zher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F)
-xher2_thread_U.$(SUFFIX) xher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
+xher2_thread_U.$(SUFFIX) xher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F)
-xher2_thread_L.$(SUFFIX) xher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
+xher2_thread_L.$(SUFFIX) xher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F)
-xher2_thread_V.$(SUFFIX) xher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h
+xher2_thread_V.$(SUFFIX) xher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F)
-xher2_thread_M.$(SUFFIX) xher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h
+xher2_thread_M.$(SUFFIX) xher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F)
-chbmv_U.$(SUFFIX) chbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h
+chbmv_U.$(SUFFIX) chbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-chbmv_L.$(SUFFIX) chbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h
+chbmv_L.$(SUFFIX) chbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-chbmv_V.$(SUFFIX) chbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h
+chbmv_V.$(SUFFIX) chbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-chbmv_M.$(SUFFIX) chbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h
+chbmv_M.$(SUFFIX) chbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-zhbmv_U.$(SUFFIX) zhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h
+zhbmv_U.$(SUFFIX) zhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zhbmv_L.$(SUFFIX) zhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h
+zhbmv_L.$(SUFFIX) zhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-zhbmv_V.$(SUFFIX) zhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h
+zhbmv_V.$(SUFFIX) zhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-zhbmv_M.$(SUFFIX) zhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h
+zhbmv_M.$(SUFFIX) zhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-xhbmv_U.$(SUFFIX) xhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h
+xhbmv_U.$(SUFFIX) xhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xhbmv_L.$(SUFFIX) xhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h
+xhbmv_L.$(SUFFIX) xhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-xhbmv_V.$(SUFFIX) xhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h
+xhbmv_V.$(SUFFIX) xhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-xhbmv_M.$(SUFFIX) xhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h
+xhbmv_M.$(SUFFIX) xhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-chbmv_thread_U.$(SUFFIX) chbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+chbmv_thread_U.$(SUFFIX) chbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F)
-chbmv_thread_L.$(SUFFIX) chbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+chbmv_thread_L.$(SUFFIX) chbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F)
-chbmv_thread_V.$(SUFFIX) chbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h
+chbmv_thread_V.$(SUFFIX) chbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-chbmv_thread_M.$(SUFFIX) chbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h
+chbmv_thread_M.$(SUFFIX) chbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-zhbmv_thread_U.$(SUFFIX) zhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+zhbmv_thread_U.$(SUFFIX) zhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F)
-zhbmv_thread_L.$(SUFFIX) zhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+zhbmv_thread_L.$(SUFFIX) zhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F)
-zhbmv_thread_V.$(SUFFIX) zhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h
+zhbmv_thread_V.$(SUFFIX) zhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-zhbmv_thread_M.$(SUFFIX) zhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h
+zhbmv_thread_M.$(SUFFIX) zhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-xhbmv_thread_U.$(SUFFIX) xhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+xhbmv_thread_U.$(SUFFIX) xhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F)
-xhbmv_thread_L.$(SUFFIX) xhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+xhbmv_thread_L.$(SUFFIX) xhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F)
-xhbmv_thread_V.$(SUFFIX) xhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h
+xhbmv_thread_V.$(SUFFIX) xhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-xhbmv_thread_M.$(SUFFIX) xhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h
+xhbmv_thread_M.$(SUFFIX) xhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-cher_U.$(SUFFIX) cher_U.$(PSUFFIX) : zher_k.c ../../common.h
+cher_U.$(SUFFIX) cher_U.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F)
-cher_L.$(SUFFIX) cher_L.$(PSUFFIX) : zher_k.c ../../common.h
+cher_L.$(SUFFIX) cher_L.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F)
-cher_V.$(SUFFIX) cher_V.$(PSUFFIX) : zher_k.c ../../common.h
+cher_V.$(SUFFIX) cher_V.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-cher_M.$(SUFFIX) cher_M.$(PSUFFIX) : zher_k.c ../../common.h
+cher_M.$(SUFFIX) cher_M.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-zher_U.$(SUFFIX) zher_U.$(PSUFFIX) : zher_k.c ../../common.h
+zher_U.$(SUFFIX) zher_U.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F)
-zher_L.$(SUFFIX) zher_L.$(PSUFFIX) : zher_k.c ../../common.h
+zher_L.$(SUFFIX) zher_L.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F)
-zher_V.$(SUFFIX) zher_V.$(PSUFFIX) : zher_k.c ../../common.h
+zher_V.$(SUFFIX) zher_V.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-zher_M.$(SUFFIX) zher_M.$(PSUFFIX) : zher_k.c ../../common.h
+zher_M.$(SUFFIX) zher_M.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-xher_U.$(SUFFIX) xher_U.$(PSUFFIX) : zher_k.c ../../common.h
+xher_U.$(SUFFIX) xher_U.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F)
-xher_L.$(SUFFIX) xher_L.$(PSUFFIX) : zher_k.c ../../common.h
+xher_L.$(SUFFIX) xher_L.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F)
-xher_V.$(SUFFIX) xher_V.$(PSUFFIX) : zher_k.c ../../common.h
+xher_V.$(SUFFIX) xher_V.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-xher_M.$(SUFFIX) xher_M.$(PSUFFIX) : zher_k.c ../../common.h
+xher_M.$(SUFFIX) xher_M.$(PSUFFIX) : zher_k.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-cher2_U.$(SUFFIX) cher2_U.$(PSUFFIX) : zher2_k.c ../../param.h
+cher2_U.$(SUFFIX) cher2_U.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F)
-cher2_L.$(SUFFIX) cher2_L.$(PSUFFIX) : zher2_k.c ../../param.h
+cher2_L.$(SUFFIX) cher2_L.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F)
-cher2_V.$(SUFFIX) cher2_V.$(PSUFFIX) : zher2_k.c ../../param.h
+cher2_V.$(SUFFIX) cher2_V.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F)
-cher2_M.$(SUFFIX) cher2_M.$(PSUFFIX) : zher2_k.c ../../param.h
+cher2_M.$(SUFFIX) cher2_M.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F)
-zher2_U.$(SUFFIX) zher2_U.$(PSUFFIX) : zher2_k.c ../../param.h
+zher2_U.$(SUFFIX) zher2_U.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F)
-zher2_L.$(SUFFIX) zher2_L.$(PSUFFIX) : zher2_k.c ../../param.h
+zher2_L.$(SUFFIX) zher2_L.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F)
-zher2_V.$(SUFFIX) zher2_V.$(PSUFFIX) : zher2_k.c ../../param.h
+zher2_V.$(SUFFIX) zher2_V.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F)
-zher2_M.$(SUFFIX) zher2_M.$(PSUFFIX) : zher2_k.c ../../param.h
+zher2_M.$(SUFFIX) zher2_M.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F)
-xher2_U.$(SUFFIX) xher2_U.$(PSUFFIX) : zher2_k.c ../../param.h
+xher2_U.$(SUFFIX) xher2_U.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F)
-xher2_L.$(SUFFIX) xher2_L.$(PSUFFIX) : zher2_k.c ../../param.h
+xher2_L.$(SUFFIX) xher2_L.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F)
-xher2_V.$(SUFFIX) xher2_V.$(PSUFFIX) : zher2_k.c ../../param.h
+xher2_V.$(SUFFIX) xher2_V.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -DHEMVREV -o $(@F)
-xher2_M.$(SUFFIX) xher2_M.$(PSUFFIX) : zher2_k.c ../../param.h
+xher2_M.$(SUFFIX) xher2_M.$(PSUFFIX) : zher2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F)
-chpmv_U.$(SUFFIX) chpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h
+chpmv_U.$(SUFFIX) chpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-chpmv_L.$(SUFFIX) chpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h
+chpmv_L.$(SUFFIX) chpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-chpmv_V.$(SUFFIX) chpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h
+chpmv_V.$(SUFFIX) chpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-chpmv_M.$(SUFFIX) chpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h
+chpmv_M.$(SUFFIX) chpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-zhpmv_U.$(SUFFIX) zhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h
+zhpmv_U.$(SUFFIX) zhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zhpmv_L.$(SUFFIX) zhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h
+zhpmv_L.$(SUFFIX) zhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-zhpmv_V.$(SUFFIX) zhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h
+zhpmv_V.$(SUFFIX) zhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-zhpmv_M.$(SUFFIX) zhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h
+zhpmv_M.$(SUFFIX) zhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-xhpmv_U.$(SUFFIX) xhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h
+xhpmv_U.$(SUFFIX) xhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xhpmv_L.$(SUFFIX) xhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h
+xhpmv_L.$(SUFFIX) xhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-xhpmv_V.$(SUFFIX) xhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h
+xhpmv_V.$(SUFFIX) xhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-xhpmv_M.$(SUFFIX) xhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h
+xhpmv_M.$(SUFFIX) xhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-chpmv_thread_U.$(SUFFIX) chpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+chpmv_thread_U.$(SUFFIX) chpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F)
-chpmv_thread_L.$(SUFFIX) chpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+chpmv_thread_L.$(SUFFIX) chpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F)
-chpmv_thread_V.$(SUFFIX) chpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h
+chpmv_thread_V.$(SUFFIX) chpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-chpmv_thread_M.$(SUFFIX) chpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h
+chpmv_thread_M.$(SUFFIX) chpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-zhpmv_thread_U.$(SUFFIX) zhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+zhpmv_thread_U.$(SUFFIX) zhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F)
-zhpmv_thread_L.$(SUFFIX) zhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+zhpmv_thread_L.$(SUFFIX) zhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F)
-zhpmv_thread_V.$(SUFFIX) zhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h
+zhpmv_thread_V.$(SUFFIX) zhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-zhpmv_thread_M.$(SUFFIX) zhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h
+zhpmv_thread_M.$(SUFFIX) zhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-xhpmv_thread_U.$(SUFFIX) xhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+xhpmv_thread_U.$(SUFFIX) xhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F)
-xhpmv_thread_L.$(SUFFIX) xhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+xhpmv_thread_L.$(SUFFIX) xhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F)
-xhpmv_thread_V.$(SUFFIX) xhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h
+xhpmv_thread_V.$(SUFFIX) xhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-xhpmv_thread_M.$(SUFFIX) xhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h
+xhpmv_thread_M.$(SUFFIX) xhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-chpr_U.$(SUFFIX) chpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h
+chpr_U.$(SUFFIX) chpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F)
-chpr_L.$(SUFFIX) chpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h
+chpr_L.$(SUFFIX) chpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F)
-chpr_V.$(SUFFIX) chpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h
+chpr_V.$(SUFFIX) chpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-chpr_M.$(SUFFIX) chpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h
+chpr_M.$(SUFFIX) chpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-zhpr_U.$(SUFFIX) zhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h
+zhpr_U.$(SUFFIX) zhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F)
-zhpr_L.$(SUFFIX) zhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h
+zhpr_L.$(SUFFIX) zhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F)
-zhpr_V.$(SUFFIX) zhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h
+zhpr_V.$(SUFFIX) zhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-zhpr_M.$(SUFFIX) zhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h
+zhpr_M.$(SUFFIX) zhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-xhpr_U.$(SUFFIX) xhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h
+xhpr_U.$(SUFFIX) xhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F)
-xhpr_L.$(SUFFIX) xhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h
+xhpr_L.$(SUFFIX) xhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F)
-xhpr_V.$(SUFFIX) xhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h
+xhpr_V.$(SUFFIX) xhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-xhpr_M.$(SUFFIX) xhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h
+xhpr_M.$(SUFFIX) xhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-chpr_thread_U.$(SUFFIX) chpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h
+chpr_thread_U.$(SUFFIX) chpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMV $< -o $(@F)
-chpr_thread_L.$(SUFFIX) chpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h
+chpr_thread_L.$(SUFFIX) chpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMV $< -o $(@F)
-chpr_thread_V.$(SUFFIX) chpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h
+chpr_thread_V.$(SUFFIX) chpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-chpr_thread_M.$(SUFFIX) chpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h
+chpr_thread_M.$(SUFFIX) chpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-zhpr_thread_U.$(SUFFIX) zhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h
+zhpr_thread_U.$(SUFFIX) zhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMV $< -o $(@F)
-zhpr_thread_L.$(SUFFIX) zhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h
+zhpr_thread_L.$(SUFFIX) zhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMV $< -o $(@F)
-zhpr_thread_V.$(SUFFIX) zhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h
+zhpr_thread_V.$(SUFFIX) zhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-zhpr_thread_M.$(SUFFIX) zhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h
+zhpr_thread_M.$(SUFFIX) zhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-xhpr_thread_U.$(SUFFIX) xhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h
+xhpr_thread_U.$(SUFFIX) xhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMV $< -o $(@F)
-xhpr_thread_L.$(SUFFIX) xhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h
+xhpr_thread_L.$(SUFFIX) xhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMV $< -o $(@F)
-xhpr_thread_V.$(SUFFIX) xhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h
+xhpr_thread_V.$(SUFFIX) xhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F)
-xhpr_thread_M.$(SUFFIX) xhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h
+xhpr_thread_M.$(SUFFIX) xhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F)
-chpr2_U.$(SUFFIX) chpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h
+chpr2_U.$(SUFFIX) chpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F)
-chpr2_L.$(SUFFIX) chpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h
+chpr2_L.$(SUFFIX) chpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F)
-chpr2_V.$(SUFFIX) chpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h
+chpr2_V.$(SUFFIX) chpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F)
-chpr2_M.$(SUFFIX) chpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h
+chpr2_M.$(SUFFIX) chpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F)
-zhpr2_U.$(SUFFIX) zhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h
+zhpr2_U.$(SUFFIX) zhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F)
-zhpr2_L.$(SUFFIX) zhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h
+zhpr2_L.$(SUFFIX) zhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F)
-zhpr2_V.$(SUFFIX) zhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h
+zhpr2_V.$(SUFFIX) zhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F)
-zhpr2_M.$(SUFFIX) zhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h
+zhpr2_M.$(SUFFIX) zhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F)
-xhpr2_U.$(SUFFIX) xhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h
+xhpr2_U.$(SUFFIX) xhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F)
-xhpr2_L.$(SUFFIX) xhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h
+xhpr2_L.$(SUFFIX) xhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F)
-xhpr2_V.$(SUFFIX) xhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h
+xhpr2_V.$(SUFFIX) xhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F)
-xhpr2_M.$(SUFFIX) xhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h
+xhpr2_M.$(SUFFIX) xhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F)
-chpr2_thread_U.$(SUFFIX) chpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+chpr2_thread_U.$(SUFFIX) chpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F)
-chpr2_thread_L.$(SUFFIX) chpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+chpr2_thread_L.$(SUFFIX) chpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F)
-chpr2_thread_V.$(SUFFIX) chpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h
+chpr2_thread_V.$(SUFFIX) chpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F)
-chpr2_thread_M.$(SUFFIX) chpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h
+chpr2_thread_M.$(SUFFIX) chpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F)
-zhpr2_thread_U.$(SUFFIX) zhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+zhpr2_thread_U.$(SUFFIX) zhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F)
-zhpr2_thread_L.$(SUFFIX) zhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+zhpr2_thread_L.$(SUFFIX) zhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F)
-zhpr2_thread_V.$(SUFFIX) zhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h
+zhpr2_thread_V.$(SUFFIX) zhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F)
-zhpr2_thread_M.$(SUFFIX) zhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h
+zhpr2_thread_M.$(SUFFIX) zhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F)
-xhpr2_thread_U.$(SUFFIX) xhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+xhpr2_thread_U.$(SUFFIX) xhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F)
-xhpr2_thread_L.$(SUFFIX) xhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+xhpr2_thread_L.$(SUFFIX) xhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F)
-xhpr2_thread_V.$(SUFFIX) xhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h
+xhpr2_thread_V.$(SUFFIX) xhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F)
-xhpr2_thread_M.$(SUFFIX) xhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h
+xhpr2_thread_M.$(SUFFIX) xhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F)
-ssbmv_U.$(SUFFIX) ssbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h
+ssbmv_U.$(SUFFIX) ssbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-ssbmv_L.$(SUFFIX) ssbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h
+ssbmv_L.$(SUFFIX) ssbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dsbmv_U.$(SUFFIX) dsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h
+dsbmv_U.$(SUFFIX) dsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dsbmv_L.$(SUFFIX) dsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h
+dsbmv_L.$(SUFFIX) dsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qsbmv_U.$(SUFFIX) qsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h
+qsbmv_U.$(SUFFIX) qsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qsbmv_L.$(SUFFIX) qsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h
+qsbmv_L.$(SUFFIX) qsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-csbmv_U.$(SUFFIX) csbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h
+csbmv_U.$(SUFFIX) csbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-csbmv_L.$(SUFFIX) csbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h
+csbmv_L.$(SUFFIX) csbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zsbmv_U.$(SUFFIX) zsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h
+zsbmv_U.$(SUFFIX) zsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zsbmv_L.$(SUFFIX) zsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h
+zsbmv_L.$(SUFFIX) zsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xsbmv_U.$(SUFFIX) xsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h
+xsbmv_U.$(SUFFIX) xsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xsbmv_L.$(SUFFIX) xsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h
+xsbmv_L.$(SUFFIX) xsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-ssbmv_thread_U.$(SUFFIX) ssbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+ssbmv_thread_U.$(SUFFIX) ssbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-ssbmv_thread_L.$(SUFFIX) ssbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+ssbmv_thread_L.$(SUFFIX) ssbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dsbmv_thread_U.$(SUFFIX) dsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+dsbmv_thread_U.$(SUFFIX) dsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dsbmv_thread_L.$(SUFFIX) dsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+dsbmv_thread_L.$(SUFFIX) dsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qsbmv_thread_U.$(SUFFIX) qsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+qsbmv_thread_U.$(SUFFIX) qsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qsbmv_thread_L.$(SUFFIX) qsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+qsbmv_thread_L.$(SUFFIX) qsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-csbmv_thread_U.$(SUFFIX) csbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+csbmv_thread_U.$(SUFFIX) csbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-csbmv_thread_L.$(SUFFIX) csbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+csbmv_thread_L.$(SUFFIX) csbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zsbmv_thread_U.$(SUFFIX) zsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+zsbmv_thread_U.$(SUFFIX) zsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zsbmv_thread_L.$(SUFFIX) zsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+zsbmv_thread_L.$(SUFFIX) zsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xsbmv_thread_U.$(SUFFIX) xsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
+xsbmv_thread_U.$(SUFFIX) xsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xsbmv_thread_L.$(SUFFIX) xsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
+xsbmv_thread_L.$(SUFFIX) xsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-sspmv_U.$(SUFFIX) sspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h
+sspmv_U.$(SUFFIX) sspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-sspmv_L.$(SUFFIX) sspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h
+sspmv_L.$(SUFFIX) sspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dspmv_U.$(SUFFIX) dspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h
+dspmv_U.$(SUFFIX) dspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dspmv_L.$(SUFFIX) dspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h
+dspmv_L.$(SUFFIX) dspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qspmv_U.$(SUFFIX) qspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h
+qspmv_U.$(SUFFIX) qspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qspmv_L.$(SUFFIX) qspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h
+qspmv_L.$(SUFFIX) qspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-cspmv_U.$(SUFFIX) cspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h
+cspmv_U.$(SUFFIX) cspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-cspmv_L.$(SUFFIX) cspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h
+cspmv_L.$(SUFFIX) cspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zspmv_U.$(SUFFIX) zspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h
+zspmv_U.$(SUFFIX) zspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zspmv_L.$(SUFFIX) zspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h
+zspmv_L.$(SUFFIX) zspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xspmv_U.$(SUFFIX) xspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h
+xspmv_U.$(SUFFIX) xspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xspmv_L.$(SUFFIX) xspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h
+xspmv_L.$(SUFFIX) xspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-sspmv_thread_U.$(SUFFIX) sspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+sspmv_thread_U.$(SUFFIX) sspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-sspmv_thread_L.$(SUFFIX) sspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+sspmv_thread_L.$(SUFFIX) sspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dspmv_thread_U.$(SUFFIX) dspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+dspmv_thread_U.$(SUFFIX) dspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dspmv_thread_L.$(SUFFIX) dspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+dspmv_thread_L.$(SUFFIX) dspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qspmv_thread_U.$(SUFFIX) qspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+qspmv_thread_U.$(SUFFIX) qspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qspmv_thread_L.$(SUFFIX) qspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+qspmv_thread_L.$(SUFFIX) qspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-cspmv_thread_U.$(SUFFIX) cspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+cspmv_thread_U.$(SUFFIX) cspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-cspmv_thread_L.$(SUFFIX) cspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+cspmv_thread_L.$(SUFFIX) cspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zspmv_thread_U.$(SUFFIX) zspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+zspmv_thread_U.$(SUFFIX) zspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zspmv_thread_L.$(SUFFIX) zspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+zspmv_thread_L.$(SUFFIX) zspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xspmv_thread_U.$(SUFFIX) xspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
+xspmv_thread_U.$(SUFFIX) xspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xspmv_thread_L.$(SUFFIX) xspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
+xspmv_thread_L.$(SUFFIX) xspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-sspr_U.$(SUFFIX) sspr_U.$(PSUFFIX) : spr_k.c ../../param.h
+sspr_U.$(SUFFIX) sspr_U.$(PSUFFIX) : spr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-sspr_L.$(SUFFIX) sspr_L.$(PSUFFIX) : spr_k.c ../../param.h
+sspr_L.$(SUFFIX) sspr_L.$(PSUFFIX) : spr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dspr_U.$(SUFFIX) dspr_U.$(PSUFFIX) : spr_k.c ../../param.h
+dspr_U.$(SUFFIX) dspr_U.$(PSUFFIX) : spr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dspr_L.$(SUFFIX) dspr_L.$(PSUFFIX) : spr_k.c ../../param.h
+dspr_L.$(SUFFIX) dspr_L.$(PSUFFIX) : spr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qspr_U.$(SUFFIX) qspr_U.$(PSUFFIX) : spr_k.c ../../param.h
+qspr_U.$(SUFFIX) qspr_U.$(PSUFFIX) : spr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qspr_L.$(SUFFIX) qspr_L.$(PSUFFIX) : spr_k.c ../../param.h
+qspr_L.$(SUFFIX) qspr_L.$(PSUFFIX) : spr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-cspr_U.$(SUFFIX) cspr_U.$(PSUFFIX) : zspr_k.c ../../param.h
+cspr_U.$(SUFFIX) cspr_U.$(PSUFFIX) : zspr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-cspr_L.$(SUFFIX) cspr_L.$(PSUFFIX) : zspr_k.c ../../param.h
+cspr_L.$(SUFFIX) cspr_L.$(PSUFFIX) : zspr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zspr_U.$(SUFFIX) zspr_U.$(PSUFFIX) : zspr_k.c ../../param.h
+zspr_U.$(SUFFIX) zspr_U.$(PSUFFIX) : zspr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zspr_L.$(SUFFIX) zspr_L.$(PSUFFIX) : zspr_k.c ../../param.h
+zspr_L.$(SUFFIX) zspr_L.$(PSUFFIX) : zspr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xspr_U.$(SUFFIX) xspr_U.$(PSUFFIX) : zspr_k.c ../../param.h
+xspr_U.$(SUFFIX) xspr_U.$(PSUFFIX) : zspr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xspr_L.$(SUFFIX) xspr_L.$(PSUFFIX) : zspr_k.c ../../param.h
+xspr_L.$(SUFFIX) xspr_L.$(PSUFFIX) : zspr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-sspr_thread_U.$(SUFFIX) sspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
+sspr_thread_U.$(SUFFIX) sspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-sspr_thread_L.$(SUFFIX) sspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
+sspr_thread_L.$(SUFFIX) sspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dspr_thread_U.$(SUFFIX) dspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
+dspr_thread_U.$(SUFFIX) dspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dspr_thread_L.$(SUFFIX) dspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
+dspr_thread_L.$(SUFFIX) dspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qspr_thread_U.$(SUFFIX) qspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
+qspr_thread_U.$(SUFFIX) qspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qspr_thread_L.$(SUFFIX) qspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
+qspr_thread_L.$(SUFFIX) qspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-cspr_thread_U.$(SUFFIX) cspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
+cspr_thread_U.$(SUFFIX) cspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-cspr_thread_L.$(SUFFIX) cspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
+cspr_thread_L.$(SUFFIX) cspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zspr_thread_U.$(SUFFIX) zspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
+zspr_thread_U.$(SUFFIX) zspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zspr_thread_L.$(SUFFIX) zspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
+zspr_thread_L.$(SUFFIX) zspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xspr_thread_U.$(SUFFIX) xspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
+xspr_thread_U.$(SUFFIX) xspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xspr_thread_L.$(SUFFIX) xspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
+xspr_thread_L.$(SUFFIX) xspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-sspr2_U.$(SUFFIX) sspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h
+sspr2_U.$(SUFFIX) sspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-sspr2_L.$(SUFFIX) sspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h
+sspr2_L.$(SUFFIX) sspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dspr2_U.$(SUFFIX) dspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h
+dspr2_U.$(SUFFIX) dspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dspr2_L.$(SUFFIX) dspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h
+dspr2_L.$(SUFFIX) dspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qspr2_U.$(SUFFIX) qspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h
+qspr2_U.$(SUFFIX) qspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qspr2_L.$(SUFFIX) qspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h
+qspr2_L.$(SUFFIX) qspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-cspr2_U.$(SUFFIX) cspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h
+cspr2_U.$(SUFFIX) cspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-cspr2_L.$(SUFFIX) cspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h
+cspr2_L.$(SUFFIX) cspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zspr2_U.$(SUFFIX) zspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h
+zspr2_U.$(SUFFIX) zspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zspr2_L.$(SUFFIX) zspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h
+zspr2_L.$(SUFFIX) zspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xspr2_U.$(SUFFIX) xspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h
+xspr2_U.$(SUFFIX) xspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xspr2_L.$(SUFFIX) xspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h
+xspr2_L.$(SUFFIX) xspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-sspr2_thread_U.$(SUFFIX) sspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+sspr2_thread_U.$(SUFFIX) sspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-sspr2_thread_L.$(SUFFIX) sspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+sspr2_thread_L.$(SUFFIX) sspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dspr2_thread_U.$(SUFFIX) dspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+dspr2_thread_U.$(SUFFIX) dspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dspr2_thread_L.$(SUFFIX) dspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+dspr2_thread_L.$(SUFFIX) dspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qspr2_thread_U.$(SUFFIX) qspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+qspr2_thread_U.$(SUFFIX) qspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qspr2_thread_L.$(SUFFIX) qspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+qspr2_thread_L.$(SUFFIX) qspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-cspr2_thread_U.$(SUFFIX) cspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+cspr2_thread_U.$(SUFFIX) cspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-cspr2_thread_L.$(SUFFIX) cspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+cspr2_thread_L.$(SUFFIX) cspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zspr2_thread_U.$(SUFFIX) zspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+zspr2_thread_U.$(SUFFIX) zspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zspr2_thread_L.$(SUFFIX) zspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+zspr2_thread_L.$(SUFFIX) zspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xspr2_thread_U.$(SUFFIX) xspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
+xspr2_thread_U.$(SUFFIX) xspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xspr2_thread_L.$(SUFFIX) xspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
+xspr2_thread_L.$(SUFFIX) xspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-ssyr_U.$(SUFFIX) ssyr_U.$(PSUFFIX) : syr_k.c ../../param.h
+ssyr_U.$(SUFFIX) ssyr_U.$(PSUFFIX) : syr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-ssyr_L.$(SUFFIX) ssyr_L.$(PSUFFIX) : syr_k.c ../../param.h
+ssyr_L.$(SUFFIX) ssyr_L.$(PSUFFIX) : syr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dsyr_U.$(SUFFIX) dsyr_U.$(PSUFFIX) : syr_k.c ../../param.h
+dsyr_U.$(SUFFIX) dsyr_U.$(PSUFFIX) : syr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dsyr_L.$(SUFFIX) dsyr_L.$(PSUFFIX) : syr_k.c ../../param.h
+dsyr_L.$(SUFFIX) dsyr_L.$(PSUFFIX) : syr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qsyr_U.$(SUFFIX) qsyr_U.$(PSUFFIX) : syr_k.c ../../param.h
+qsyr_U.$(SUFFIX) qsyr_U.$(PSUFFIX) : syr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qsyr_L.$(SUFFIX) qsyr_L.$(PSUFFIX) : syr_k.c ../../param.h
+qsyr_L.$(SUFFIX) qsyr_L.$(PSUFFIX) : syr_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-csyr_U.$(SUFFIX) csyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h
+csyr_U.$(SUFFIX) csyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-csyr_L.$(SUFFIX) csyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h
+csyr_L.$(SUFFIX) csyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zsyr_U.$(SUFFIX) zsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h
+zsyr_U.$(SUFFIX) zsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zsyr_L.$(SUFFIX) zsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h
+zsyr_L.$(SUFFIX) zsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xsyr_U.$(SUFFIX) xsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h
+xsyr_U.$(SUFFIX) xsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xsyr_L.$(SUFFIX) xsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h
+xsyr_L.$(SUFFIX) xsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-ssyr2_U.$(SUFFIX) ssyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h
+ssyr2_U.$(SUFFIX) ssyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-ssyr2_L.$(SUFFIX) ssyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h
+ssyr2_L.$(SUFFIX) ssyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-dsyr2_U.$(SUFFIX) dsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h
+dsyr2_U.$(SUFFIX) dsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-dsyr2_L.$(SUFFIX) dsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h
+dsyr2_L.$(SUFFIX) dsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-qsyr2_U.$(SUFFIX) qsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h
+qsyr2_U.$(SUFFIX) qsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-qsyr2_L.$(SUFFIX) qsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h
+qsyr2_L.$(SUFFIX) qsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-csyr2_U.$(SUFFIX) csyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h
+csyr2_U.$(SUFFIX) csyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F)
-csyr2_L.$(SUFFIX) csyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h
+csyr2_L.$(SUFFIX) csyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F)
-zsyr2_U.$(SUFFIX) zsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h
+zsyr2_U.$(SUFFIX) zsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F)
-zsyr2_L.$(SUFFIX) zsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h
+zsyr2_L.$(SUFFIX) zsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F)
-xsyr2_U.$(SUFFIX) xsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h
+xsyr2_U.$(SUFFIX) xsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F)
-xsyr2_L.$(SUFFIX) xsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h
+xsyr2_L.$(SUFFIX) xsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F)
-stbmv_NUU.$(SUFFIX) stbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h
+stbmv_NUU.$(SUFFIX) stbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-stbmv_NUN.$(SUFFIX) stbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h
+stbmv_NUN.$(SUFFIX) stbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-stbmv_TLU.$(SUFFIX) stbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h
+stbmv_TLU.$(SUFFIX) stbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-stbmv_TLN.$(SUFFIX) stbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h
+stbmv_TLN.$(SUFFIX) stbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-stbmv_NLU.$(SUFFIX) stbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h
+stbmv_NLU.$(SUFFIX) stbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-stbmv_NLN.$(SUFFIX) stbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h
+stbmv_NLN.$(SUFFIX) stbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-stbmv_TUU.$(SUFFIX) stbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h
+stbmv_TUU.$(SUFFIX) stbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-stbmv_TUN.$(SUFFIX) stbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h
+stbmv_TUN.$(SUFFIX) stbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtbmv_NUU.$(SUFFIX) dtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h
+dtbmv_NUU.$(SUFFIX) dtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtbmv_NUN.$(SUFFIX) dtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h
+dtbmv_NUN.$(SUFFIX) dtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtbmv_TLU.$(SUFFIX) dtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h
+dtbmv_TLU.$(SUFFIX) dtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtbmv_TLN.$(SUFFIX) dtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h
+dtbmv_TLN.$(SUFFIX) dtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtbmv_NLU.$(SUFFIX) dtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h
+dtbmv_NLU.$(SUFFIX) dtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtbmv_NLN.$(SUFFIX) dtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h
+dtbmv_NLN.$(SUFFIX) dtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtbmv_TUU.$(SUFFIX) dtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h
+dtbmv_TUU.$(SUFFIX) dtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtbmv_TUN.$(SUFFIX) dtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h
+dtbmv_TUN.$(SUFFIX) dtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtbmv_NUU.$(SUFFIX) qtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h
+qtbmv_NUU.$(SUFFIX) qtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtbmv_NUN.$(SUFFIX) qtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h
+qtbmv_NUN.$(SUFFIX) qtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtbmv_TLU.$(SUFFIX) qtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h
+qtbmv_TLU.$(SUFFIX) qtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtbmv_TLN.$(SUFFIX) qtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h
+qtbmv_TLN.$(SUFFIX) qtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtbmv_NLU.$(SUFFIX) qtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h
+qtbmv_NLU.$(SUFFIX) qtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtbmv_NLN.$(SUFFIX) qtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h
+qtbmv_NLN.$(SUFFIX) qtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtbmv_TUU.$(SUFFIX) qtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h
+qtbmv_TUU.$(SUFFIX) qtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtbmv_TUN.$(SUFFIX) qtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h
+qtbmv_TUN.$(SUFFIX) qtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-ctbmv_NUU.$(SUFFIX) ctbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ctbmv_NUU.$(SUFFIX) ctbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ctbmv_NUN.$(SUFFIX) ctbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ctbmv_NUN.$(SUFFIX) ctbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ctbmv_TLU.$(SUFFIX) ctbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ctbmv_TLU.$(SUFFIX) ctbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ctbmv_TLN.$(SUFFIX) ctbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ctbmv_TLN.$(SUFFIX) ctbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ctbmv_RLU.$(SUFFIX) ctbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ctbmv_RLU.$(SUFFIX) ctbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ctbmv_RLN.$(SUFFIX) ctbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ctbmv_RLN.$(SUFFIX) ctbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ctbmv_CLU.$(SUFFIX) ctbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ctbmv_CLU.$(SUFFIX) ctbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ctbmv_CLN.$(SUFFIX) ctbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ctbmv_CLN.$(SUFFIX) ctbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ctbmv_NLU.$(SUFFIX) ctbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ctbmv_NLU.$(SUFFIX) ctbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ctbmv_NLN.$(SUFFIX) ctbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ctbmv_NLN.$(SUFFIX) ctbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ctbmv_TUU.$(SUFFIX) ctbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ctbmv_TUU.$(SUFFIX) ctbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ctbmv_TUN.$(SUFFIX) ctbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ctbmv_TUN.$(SUFFIX) ctbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ctbmv_RUU.$(SUFFIX) ctbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ctbmv_RUU.$(SUFFIX) ctbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ctbmv_RUN.$(SUFFIX) ctbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ctbmv_RUN.$(SUFFIX) ctbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ctbmv_CUU.$(SUFFIX) ctbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ctbmv_CUU.$(SUFFIX) ctbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ctbmv_CUN.$(SUFFIX) ctbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ctbmv_CUN.$(SUFFIX) ctbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ztbmv_NUU.$(SUFFIX) ztbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ztbmv_NUU.$(SUFFIX) ztbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ztbmv_NUN.$(SUFFIX) ztbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ztbmv_NUN.$(SUFFIX) ztbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ztbmv_TLU.$(SUFFIX) ztbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ztbmv_TLU.$(SUFFIX) ztbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ztbmv_TLN.$(SUFFIX) ztbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ztbmv_TLN.$(SUFFIX) ztbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ztbmv_RLU.$(SUFFIX) ztbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ztbmv_RLU.$(SUFFIX) ztbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ztbmv_RLN.$(SUFFIX) ztbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ztbmv_RLN.$(SUFFIX) ztbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ztbmv_CLU.$(SUFFIX) ztbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ztbmv_CLU.$(SUFFIX) ztbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ztbmv_CLN.$(SUFFIX) ztbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ztbmv_CLN.$(SUFFIX) ztbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ztbmv_NLU.$(SUFFIX) ztbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ztbmv_NLU.$(SUFFIX) ztbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ztbmv_NLN.$(SUFFIX) ztbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ztbmv_NLN.$(SUFFIX) ztbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ztbmv_TUU.$(SUFFIX) ztbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ztbmv_TUU.$(SUFFIX) ztbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ztbmv_TUN.$(SUFFIX) ztbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ztbmv_TUN.$(SUFFIX) ztbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ztbmv_RUU.$(SUFFIX) ztbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ztbmv_RUU.$(SUFFIX) ztbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ztbmv_RUN.$(SUFFIX) ztbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+ztbmv_RUN.$(SUFFIX) ztbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ztbmv_CUU.$(SUFFIX) ztbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ztbmv_CUU.$(SUFFIX) ztbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ztbmv_CUN.$(SUFFIX) ztbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+ztbmv_CUN.$(SUFFIX) ztbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-xtbmv_NUU.$(SUFFIX) xtbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+xtbmv_NUU.$(SUFFIX) xtbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-xtbmv_NUN.$(SUFFIX) xtbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+xtbmv_NUN.$(SUFFIX) xtbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-xtbmv_TLU.$(SUFFIX) xtbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+xtbmv_TLU.$(SUFFIX) xtbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-xtbmv_TLN.$(SUFFIX) xtbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+xtbmv_TLN.$(SUFFIX) xtbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-xtbmv_RLU.$(SUFFIX) xtbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+xtbmv_RLU.$(SUFFIX) xtbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-xtbmv_RLN.$(SUFFIX) xtbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+xtbmv_RLN.$(SUFFIX) xtbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-xtbmv_CLU.$(SUFFIX) xtbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+xtbmv_CLU.$(SUFFIX) xtbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-xtbmv_CLN.$(SUFFIX) xtbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+xtbmv_CLN.$(SUFFIX) xtbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-xtbmv_NLU.$(SUFFIX) xtbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+xtbmv_NLU.$(SUFFIX) xtbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-xtbmv_NLN.$(SUFFIX) xtbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+xtbmv_NLN.$(SUFFIX) xtbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-xtbmv_TUU.$(SUFFIX) xtbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+xtbmv_TUU.$(SUFFIX) xtbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-xtbmv_TUN.$(SUFFIX) xtbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+xtbmv_TUN.$(SUFFIX) xtbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-xtbmv_RUU.$(SUFFIX) xtbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
+xtbmv_RUU.$(SUFFIX) xtbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-xtbmv_RUN.$(SUFFIX) xtbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
+xtbmv_RUN.$(SUFFIX) xtbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-xtbmv_CUU.$(SUFFIX) xtbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
+xtbmv_CUU.$(SUFFIX) xtbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-xtbmv_CUN.$(SUFFIX) xtbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
+xtbmv_CUN.$(SUFFIX) xtbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-stbmv_thread_NUN.$(SUFFIX) stbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+stbmv_thread_NUN.$(SUFFIX) stbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-stbmv_thread_TLN.$(SUFFIX) stbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+stbmv_thread_TLN.$(SUFFIX) stbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-stbmv_thread_NLN.$(SUFFIX) stbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+stbmv_thread_NLN.$(SUFFIX) stbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-stbmv_thread_TUN.$(SUFFIX) stbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+stbmv_thread_TUN.$(SUFFIX) stbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-dtbmv_thread_NUN.$(SUFFIX) dtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+dtbmv_thread_NUN.$(SUFFIX) dtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-dtbmv_thread_TLN.$(SUFFIX) dtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+dtbmv_thread_TLN.$(SUFFIX) dtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-dtbmv_thread_NLN.$(SUFFIX) dtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+dtbmv_thread_NLN.$(SUFFIX) dtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-dtbmv_thread_TUN.$(SUFFIX) dtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+dtbmv_thread_TUN.$(SUFFIX) dtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-qtbmv_thread_NUN.$(SUFFIX) qtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+qtbmv_thread_NUN.$(SUFFIX) qtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-qtbmv_thread_TLN.$(SUFFIX) qtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+qtbmv_thread_TLN.$(SUFFIX) qtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-qtbmv_thread_NLN.$(SUFFIX) qtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+qtbmv_thread_NLN.$(SUFFIX) qtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-qtbmv_thread_TUN.$(SUFFIX) qtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+qtbmv_thread_TUN.$(SUFFIX) qtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ctbmv_thread_NUN.$(SUFFIX) ctbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_NUN.$(SUFFIX) ctbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ctbmv_thread_TLN.$(SUFFIX) ctbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_TLN.$(SUFFIX) ctbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ctbmv_thread_RLN.$(SUFFIX) ctbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_RLN.$(SUFFIX) ctbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ctbmv_thread_CLN.$(SUFFIX) ctbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_CLN.$(SUFFIX) ctbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ctbmv_thread_NLN.$(SUFFIX) ctbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_NLN.$(SUFFIX) ctbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ctbmv_thread_TUN.$(SUFFIX) ctbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_TUN.$(SUFFIX) ctbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ctbmv_thread_RUN.$(SUFFIX) ctbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_RUN.$(SUFFIX) ctbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ctbmv_thread_CUN.$(SUFFIX) ctbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ctbmv_thread_CUN.$(SUFFIX) ctbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ztbmv_thread_NUN.$(SUFFIX) ztbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_NUN.$(SUFFIX) ztbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ztbmv_thread_TLN.$(SUFFIX) ztbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_TLN.$(SUFFIX) ztbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ztbmv_thread_RLN.$(SUFFIX) ztbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_RLN.$(SUFFIX) ztbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ztbmv_thread_CLN.$(SUFFIX) ztbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_CLN.$(SUFFIX) ztbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ztbmv_thread_NLN.$(SUFFIX) ztbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_NLN.$(SUFFIX) ztbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ztbmv_thread_TUN.$(SUFFIX) ztbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_TUN.$(SUFFIX) ztbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ztbmv_thread_RUN.$(SUFFIX) ztbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_RUN.$(SUFFIX) ztbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ztbmv_thread_CUN.$(SUFFIX) ztbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+ztbmv_thread_CUN.$(SUFFIX) ztbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-xtbmv_thread_NUN.$(SUFFIX) xtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_NUN.$(SUFFIX) xtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-xtbmv_thread_TLN.$(SUFFIX) xtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_TLN.$(SUFFIX) xtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-xtbmv_thread_RLN.$(SUFFIX) xtbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_RLN.$(SUFFIX) xtbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-xtbmv_thread_CLN.$(SUFFIX) xtbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_CLN.$(SUFFIX) xtbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-xtbmv_thread_NLN.$(SUFFIX) xtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_NLN.$(SUFFIX) xtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-xtbmv_thread_TUN.$(SUFFIX) xtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_TUN.$(SUFFIX) xtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-xtbmv_thread_RUN.$(SUFFIX) xtbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_RUN.$(SUFFIX) xtbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-xtbmv_thread_CUN.$(SUFFIX) xtbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
+xtbmv_thread_CUN.$(SUFFIX) xtbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-stbsv_NUU.$(SUFFIX) stbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h
+stbsv_NUU.$(SUFFIX) stbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-stbsv_NUN.$(SUFFIX) stbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h
+stbsv_NUN.$(SUFFIX) stbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-stbsv_TLU.$(SUFFIX) stbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h
+stbsv_TLU.$(SUFFIX) stbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-stbsv_TLN.$(SUFFIX) stbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h
+stbsv_TLN.$(SUFFIX) stbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-stbsv_NLU.$(SUFFIX) stbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h
+stbsv_NLU.$(SUFFIX) stbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-stbsv_NLN.$(SUFFIX) stbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h
+stbsv_NLN.$(SUFFIX) stbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-stbsv_TUU.$(SUFFIX) stbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h
+stbsv_TUU.$(SUFFIX) stbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-stbsv_TUN.$(SUFFIX) stbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h
+stbsv_TUN.$(SUFFIX) stbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtbsv_NUU.$(SUFFIX) dtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h
+dtbsv_NUU.$(SUFFIX) dtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtbsv_NUN.$(SUFFIX) dtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h
+dtbsv_NUN.$(SUFFIX) dtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtbsv_TLU.$(SUFFIX) dtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h
+dtbsv_TLU.$(SUFFIX) dtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtbsv_TLN.$(SUFFIX) dtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h
+dtbsv_TLN.$(SUFFIX) dtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtbsv_NLU.$(SUFFIX) dtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h
+dtbsv_NLU.$(SUFFIX) dtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtbsv_NLN.$(SUFFIX) dtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h
+dtbsv_NLN.$(SUFFIX) dtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtbsv_TUU.$(SUFFIX) dtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h
+dtbsv_TUU.$(SUFFIX) dtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtbsv_TUN.$(SUFFIX) dtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h
+dtbsv_TUN.$(SUFFIX) dtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtbsv_NUU.$(SUFFIX) qtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h
+qtbsv_NUU.$(SUFFIX) qtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtbsv_NUN.$(SUFFIX) qtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h
+qtbsv_NUN.$(SUFFIX) qtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtbsv_TLU.$(SUFFIX) qtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h
+qtbsv_TLU.$(SUFFIX) qtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtbsv_TLN.$(SUFFIX) qtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h
+qtbsv_TLN.$(SUFFIX) qtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtbsv_NLU.$(SUFFIX) qtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h
+qtbsv_NLU.$(SUFFIX) qtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtbsv_NLN.$(SUFFIX) qtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h
+qtbsv_NLN.$(SUFFIX) qtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtbsv_TUU.$(SUFFIX) qtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h
+qtbsv_TUU.$(SUFFIX) qtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtbsv_TUN.$(SUFFIX) qtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h
+qtbsv_TUN.$(SUFFIX) qtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-ctbsv_NUU.$(SUFFIX) ctbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ctbsv_NUU.$(SUFFIX) ctbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ctbsv_NUN.$(SUFFIX) ctbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ctbsv_NUN.$(SUFFIX) ctbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ctbsv_TLU.$(SUFFIX) ctbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ctbsv_TLU.$(SUFFIX) ctbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ctbsv_TLN.$(SUFFIX) ctbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ctbsv_TLN.$(SUFFIX) ctbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ctbsv_RLU.$(SUFFIX) ctbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ctbsv_RLU.$(SUFFIX) ctbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ctbsv_RLN.$(SUFFIX) ctbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ctbsv_RLN.$(SUFFIX) ctbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ctbsv_CLU.$(SUFFIX) ctbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ctbsv_CLU.$(SUFFIX) ctbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ctbsv_CLN.$(SUFFIX) ctbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ctbsv_CLN.$(SUFFIX) ctbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ctbsv_NLU.$(SUFFIX) ctbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ctbsv_NLU.$(SUFFIX) ctbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ctbsv_NLN.$(SUFFIX) ctbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ctbsv_NLN.$(SUFFIX) ctbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ctbsv_TUU.$(SUFFIX) ctbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ctbsv_TUU.$(SUFFIX) ctbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ctbsv_TUN.$(SUFFIX) ctbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ctbsv_TUN.$(SUFFIX) ctbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ctbsv_RUU.$(SUFFIX) ctbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ctbsv_RUU.$(SUFFIX) ctbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ctbsv_RUN.$(SUFFIX) ctbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ctbsv_RUN.$(SUFFIX) ctbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ctbsv_CUU.$(SUFFIX) ctbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ctbsv_CUU.$(SUFFIX) ctbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ctbsv_CUN.$(SUFFIX) ctbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ctbsv_CUN.$(SUFFIX) ctbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ztbsv_NUU.$(SUFFIX) ztbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ztbsv_NUU.$(SUFFIX) ztbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ztbsv_NUN.$(SUFFIX) ztbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ztbsv_NUN.$(SUFFIX) ztbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ztbsv_TLU.$(SUFFIX) ztbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ztbsv_TLU.$(SUFFIX) ztbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ztbsv_TLN.$(SUFFIX) ztbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ztbsv_TLN.$(SUFFIX) ztbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ztbsv_RLU.$(SUFFIX) ztbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ztbsv_RLU.$(SUFFIX) ztbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ztbsv_RLN.$(SUFFIX) ztbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ztbsv_RLN.$(SUFFIX) ztbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ztbsv_CLU.$(SUFFIX) ztbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ztbsv_CLU.$(SUFFIX) ztbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ztbsv_CLN.$(SUFFIX) ztbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ztbsv_CLN.$(SUFFIX) ztbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ztbsv_NLU.$(SUFFIX) ztbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ztbsv_NLU.$(SUFFIX) ztbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ztbsv_NLN.$(SUFFIX) ztbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ztbsv_NLN.$(SUFFIX) ztbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ztbsv_TUU.$(SUFFIX) ztbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ztbsv_TUU.$(SUFFIX) ztbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ztbsv_TUN.$(SUFFIX) ztbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ztbsv_TUN.$(SUFFIX) ztbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ztbsv_RUU.$(SUFFIX) ztbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ztbsv_RUU.$(SUFFIX) ztbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ztbsv_RUN.$(SUFFIX) ztbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+ztbsv_RUN.$(SUFFIX) ztbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ztbsv_CUU.$(SUFFIX) ztbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ztbsv_CUU.$(SUFFIX) ztbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ztbsv_CUN.$(SUFFIX) ztbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+ztbsv_CUN.$(SUFFIX) ztbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-xtbsv_NUU.$(SUFFIX) xtbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+xtbsv_NUU.$(SUFFIX) xtbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-xtbsv_NUN.$(SUFFIX) xtbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+xtbsv_NUN.$(SUFFIX) xtbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-xtbsv_TLU.$(SUFFIX) xtbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+xtbsv_TLU.$(SUFFIX) xtbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-xtbsv_TLN.$(SUFFIX) xtbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+xtbsv_TLN.$(SUFFIX) xtbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-xtbsv_RLU.$(SUFFIX) xtbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+xtbsv_RLU.$(SUFFIX) xtbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-xtbsv_RLN.$(SUFFIX) xtbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+xtbsv_RLN.$(SUFFIX) xtbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-xtbsv_CLU.$(SUFFIX) xtbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+xtbsv_CLU.$(SUFFIX) xtbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-xtbsv_CLN.$(SUFFIX) xtbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+xtbsv_CLN.$(SUFFIX) xtbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-xtbsv_NLU.$(SUFFIX) xtbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+xtbsv_NLU.$(SUFFIX) xtbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-xtbsv_NLN.$(SUFFIX) xtbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+xtbsv_NLN.$(SUFFIX) xtbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-xtbsv_TUU.$(SUFFIX) xtbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+xtbsv_TUU.$(SUFFIX) xtbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-xtbsv_TUN.$(SUFFIX) xtbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+xtbsv_TUN.$(SUFFIX) xtbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-xtbsv_RUU.$(SUFFIX) xtbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
+xtbsv_RUU.$(SUFFIX) xtbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-xtbsv_RUN.$(SUFFIX) xtbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
+xtbsv_RUN.$(SUFFIX) xtbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-xtbsv_CUU.$(SUFFIX) xtbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
+xtbsv_CUU.$(SUFFIX) xtbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-xtbsv_CUN.$(SUFFIX) xtbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
+xtbsv_CUN.$(SUFFIX) xtbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-stpmv_NUU.$(SUFFIX) stpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h
+stpmv_NUU.$(SUFFIX) stpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-stpmv_NUN.$(SUFFIX) stpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h
+stpmv_NUN.$(SUFFIX) stpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-stpmv_TLU.$(SUFFIX) stpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h
+stpmv_TLU.$(SUFFIX) stpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-stpmv_TLN.$(SUFFIX) stpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h
+stpmv_TLN.$(SUFFIX) stpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-stpmv_NLU.$(SUFFIX) stpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h
+stpmv_NLU.$(SUFFIX) stpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-stpmv_NLN.$(SUFFIX) stpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h
+stpmv_NLN.$(SUFFIX) stpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-stpmv_TUU.$(SUFFIX) stpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h
+stpmv_TUU.$(SUFFIX) stpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-stpmv_TUN.$(SUFFIX) stpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h
+stpmv_TUN.$(SUFFIX) stpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtpmv_NUU.$(SUFFIX) dtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h
+dtpmv_NUU.$(SUFFIX) dtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtpmv_NUN.$(SUFFIX) dtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h
+dtpmv_NUN.$(SUFFIX) dtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtpmv_TLU.$(SUFFIX) dtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h
+dtpmv_TLU.$(SUFFIX) dtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtpmv_TLN.$(SUFFIX) dtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h
+dtpmv_TLN.$(SUFFIX) dtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtpmv_NLU.$(SUFFIX) dtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h
+dtpmv_NLU.$(SUFFIX) dtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtpmv_NLN.$(SUFFIX) dtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h
+dtpmv_NLN.$(SUFFIX) dtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtpmv_TUU.$(SUFFIX) dtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h
+dtpmv_TUU.$(SUFFIX) dtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtpmv_TUN.$(SUFFIX) dtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h
+dtpmv_TUN.$(SUFFIX) dtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtpmv_NUU.$(SUFFIX) qtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h
+qtpmv_NUU.$(SUFFIX) qtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtpmv_NUN.$(SUFFIX) qtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h
+qtpmv_NUN.$(SUFFIX) qtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtpmv_TLU.$(SUFFIX) qtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h
+qtpmv_TLU.$(SUFFIX) qtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtpmv_TLN.$(SUFFIX) qtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h
+qtpmv_TLN.$(SUFFIX) qtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtpmv_NLU.$(SUFFIX) qtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h
+qtpmv_NLU.$(SUFFIX) qtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtpmv_NLN.$(SUFFIX) qtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h
+qtpmv_NLN.$(SUFFIX) qtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtpmv_TUU.$(SUFFIX) qtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h
+qtpmv_TUU.$(SUFFIX) qtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtpmv_TUN.$(SUFFIX) qtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h
+qtpmv_TUN.$(SUFFIX) qtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-ctpmv_NUU.$(SUFFIX) ctpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ctpmv_NUU.$(SUFFIX) ctpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ctpmv_NUN.$(SUFFIX) ctpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ctpmv_NUN.$(SUFFIX) ctpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ctpmv_TLU.$(SUFFIX) ctpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ctpmv_TLU.$(SUFFIX) ctpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ctpmv_TLN.$(SUFFIX) ctpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ctpmv_TLN.$(SUFFIX) ctpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ctpmv_RLU.$(SUFFIX) ctpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ctpmv_RLU.$(SUFFIX) ctpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ctpmv_RLN.$(SUFFIX) ctpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ctpmv_RLN.$(SUFFIX) ctpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ctpmv_CLU.$(SUFFIX) ctpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ctpmv_CLU.$(SUFFIX) ctpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ctpmv_CLN.$(SUFFIX) ctpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ctpmv_CLN.$(SUFFIX) ctpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ctpmv_NLU.$(SUFFIX) ctpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ctpmv_NLU.$(SUFFIX) ctpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ctpmv_NLN.$(SUFFIX) ctpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ctpmv_NLN.$(SUFFIX) ctpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ctpmv_TUU.$(SUFFIX) ctpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ctpmv_TUU.$(SUFFIX) ctpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ctpmv_TUN.$(SUFFIX) ctpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ctpmv_TUN.$(SUFFIX) ctpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ctpmv_RUU.$(SUFFIX) ctpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ctpmv_RUU.$(SUFFIX) ctpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ctpmv_RUN.$(SUFFIX) ctpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ctpmv_RUN.$(SUFFIX) ctpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ctpmv_CUU.$(SUFFIX) ctpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ctpmv_CUU.$(SUFFIX) ctpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ctpmv_CUN.$(SUFFIX) ctpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ctpmv_CUN.$(SUFFIX) ctpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ztpmv_NUU.$(SUFFIX) ztpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ztpmv_NUU.$(SUFFIX) ztpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ztpmv_NUN.$(SUFFIX) ztpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ztpmv_NUN.$(SUFFIX) ztpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ztpmv_TLU.$(SUFFIX) ztpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ztpmv_TLU.$(SUFFIX) ztpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ztpmv_TLN.$(SUFFIX) ztpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ztpmv_TLN.$(SUFFIX) ztpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ztpmv_RLU.$(SUFFIX) ztpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ztpmv_RLU.$(SUFFIX) ztpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ztpmv_RLN.$(SUFFIX) ztpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ztpmv_RLN.$(SUFFIX) ztpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ztpmv_CLU.$(SUFFIX) ztpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ztpmv_CLU.$(SUFFIX) ztpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ztpmv_CLN.$(SUFFIX) ztpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ztpmv_CLN.$(SUFFIX) ztpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ztpmv_NLU.$(SUFFIX) ztpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ztpmv_NLU.$(SUFFIX) ztpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ztpmv_NLN.$(SUFFIX) ztpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ztpmv_NLN.$(SUFFIX) ztpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ztpmv_TUU.$(SUFFIX) ztpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ztpmv_TUU.$(SUFFIX) ztpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ztpmv_TUN.$(SUFFIX) ztpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ztpmv_TUN.$(SUFFIX) ztpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ztpmv_RUU.$(SUFFIX) ztpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ztpmv_RUU.$(SUFFIX) ztpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ztpmv_RUN.$(SUFFIX) ztpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+ztpmv_RUN.$(SUFFIX) ztpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ztpmv_CUU.$(SUFFIX) ztpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ztpmv_CUU.$(SUFFIX) ztpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ztpmv_CUN.$(SUFFIX) ztpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+ztpmv_CUN.$(SUFFIX) ztpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-xtpmv_NUU.$(SUFFIX) xtpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+xtpmv_NUU.$(SUFFIX) xtpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-xtpmv_NUN.$(SUFFIX) xtpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+xtpmv_NUN.$(SUFFIX) xtpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-xtpmv_TLU.$(SUFFIX) xtpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+xtpmv_TLU.$(SUFFIX) xtpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-xtpmv_TLN.$(SUFFIX) xtpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+xtpmv_TLN.$(SUFFIX) xtpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-xtpmv_RLU.$(SUFFIX) xtpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+xtpmv_RLU.$(SUFFIX) xtpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-xtpmv_RLN.$(SUFFIX) xtpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+xtpmv_RLN.$(SUFFIX) xtpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-xtpmv_CLU.$(SUFFIX) xtpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+xtpmv_CLU.$(SUFFIX) xtpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-xtpmv_CLN.$(SUFFIX) xtpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+xtpmv_CLN.$(SUFFIX) xtpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-xtpmv_NLU.$(SUFFIX) xtpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+xtpmv_NLU.$(SUFFIX) xtpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-xtpmv_NLN.$(SUFFIX) xtpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+xtpmv_NLN.$(SUFFIX) xtpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-xtpmv_TUU.$(SUFFIX) xtpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+xtpmv_TUU.$(SUFFIX) xtpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-xtpmv_TUN.$(SUFFIX) xtpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+xtpmv_TUN.$(SUFFIX) xtpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-xtpmv_RUU.$(SUFFIX) xtpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
+xtpmv_RUU.$(SUFFIX) xtpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-xtpmv_RUN.$(SUFFIX) xtpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
+xtpmv_RUN.$(SUFFIX) xtpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-xtpmv_CUU.$(SUFFIX) xtpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
+xtpmv_CUU.$(SUFFIX) xtpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-xtpmv_CUN.$(SUFFIX) xtpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
+xtpmv_CUN.$(SUFFIX) xtpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-stpmv_thread_NUN.$(SUFFIX) stpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+stpmv_thread_NUN.$(SUFFIX) stpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-stpmv_thread_TLN.$(SUFFIX) stpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+stpmv_thread_TLN.$(SUFFIX) stpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-stpmv_thread_NLN.$(SUFFIX) stpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+stpmv_thread_NLN.$(SUFFIX) stpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-stpmv_thread_TUN.$(SUFFIX) stpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+stpmv_thread_TUN.$(SUFFIX) stpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-dtpmv_thread_NUN.$(SUFFIX) dtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+dtpmv_thread_NUN.$(SUFFIX) dtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-dtpmv_thread_TLN.$(SUFFIX) dtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+dtpmv_thread_TLN.$(SUFFIX) dtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-dtpmv_thread_NLN.$(SUFFIX) dtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+dtpmv_thread_NLN.$(SUFFIX) dtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-dtpmv_thread_TUN.$(SUFFIX) dtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+dtpmv_thread_TUN.$(SUFFIX) dtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-qtpmv_thread_NUN.$(SUFFIX) qtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+qtpmv_thread_NUN.$(SUFFIX) qtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-qtpmv_thread_TLN.$(SUFFIX) qtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+qtpmv_thread_TLN.$(SUFFIX) qtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-qtpmv_thread_NLN.$(SUFFIX) qtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+qtpmv_thread_NLN.$(SUFFIX) qtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-qtpmv_thread_TUN.$(SUFFIX) qtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+qtpmv_thread_TUN.$(SUFFIX) qtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ctpmv_thread_NUN.$(SUFFIX) ctpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_NUN.$(SUFFIX) ctpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ctpmv_thread_TLN.$(SUFFIX) ctpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_TLN.$(SUFFIX) ctpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ctpmv_thread_RLN.$(SUFFIX) ctpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_RLN.$(SUFFIX) ctpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ctpmv_thread_CLN.$(SUFFIX) ctpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_CLN.$(SUFFIX) ctpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ctpmv_thread_NLN.$(SUFFIX) ctpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_NLN.$(SUFFIX) ctpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ctpmv_thread_TUN.$(SUFFIX) ctpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_TUN.$(SUFFIX) ctpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ctpmv_thread_RUN.$(SUFFIX) ctpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_RUN.$(SUFFIX) ctpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ctpmv_thread_CUN.$(SUFFIX) ctpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ctpmv_thread_CUN.$(SUFFIX) ctpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ztpmv_thread_NUN.$(SUFFIX) ztpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_NUN.$(SUFFIX) ztpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ztpmv_thread_TLN.$(SUFFIX) ztpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_TLN.$(SUFFIX) ztpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ztpmv_thread_RLN.$(SUFFIX) ztpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_RLN.$(SUFFIX) ztpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ztpmv_thread_CLN.$(SUFFIX) ztpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_CLN.$(SUFFIX) ztpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ztpmv_thread_NLN.$(SUFFIX) ztpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_NLN.$(SUFFIX) ztpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ztpmv_thread_TUN.$(SUFFIX) ztpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_TUN.$(SUFFIX) ztpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ztpmv_thread_RUN.$(SUFFIX) ztpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_RUN.$(SUFFIX) ztpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ztpmv_thread_CUN.$(SUFFIX) ztpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+ztpmv_thread_CUN.$(SUFFIX) ztpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-xtpmv_thread_NUN.$(SUFFIX) xtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_NUN.$(SUFFIX) xtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-xtpmv_thread_TLN.$(SUFFIX) xtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_TLN.$(SUFFIX) xtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-xtpmv_thread_RLN.$(SUFFIX) xtpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_RLN.$(SUFFIX) xtpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-xtpmv_thread_CLN.$(SUFFIX) xtpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_CLN.$(SUFFIX) xtpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-xtpmv_thread_NLN.$(SUFFIX) xtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_NLN.$(SUFFIX) xtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-xtpmv_thread_TUN.$(SUFFIX) xtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_TUN.$(SUFFIX) xtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-xtpmv_thread_RUN.$(SUFFIX) xtpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_RUN.$(SUFFIX) xtpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-xtpmv_thread_CUN.$(SUFFIX) xtpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
+xtpmv_thread_CUN.$(SUFFIX) xtpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-stpsv_NUU.$(SUFFIX) stpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h
+stpsv_NUU.$(SUFFIX) stpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-stpsv_NUN.$(SUFFIX) stpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h
+stpsv_NUN.$(SUFFIX) stpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-stpsv_TLU.$(SUFFIX) stpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h
+stpsv_TLU.$(SUFFIX) stpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-stpsv_TLN.$(SUFFIX) stpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h
+stpsv_TLN.$(SUFFIX) stpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-stpsv_NLU.$(SUFFIX) stpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h
+stpsv_NLU.$(SUFFIX) stpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-stpsv_NLN.$(SUFFIX) stpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h
+stpsv_NLN.$(SUFFIX) stpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-stpsv_TUU.$(SUFFIX) stpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h
+stpsv_TUU.$(SUFFIX) stpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-stpsv_TUN.$(SUFFIX) stpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h
+stpsv_TUN.$(SUFFIX) stpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtpsv_NUU.$(SUFFIX) dtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h
+dtpsv_NUU.$(SUFFIX) dtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtpsv_NUN.$(SUFFIX) dtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h
+dtpsv_NUN.$(SUFFIX) dtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtpsv_TLU.$(SUFFIX) dtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h
+dtpsv_TLU.$(SUFFIX) dtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtpsv_TLN.$(SUFFIX) dtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h
+dtpsv_TLN.$(SUFFIX) dtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtpsv_NLU.$(SUFFIX) dtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h
+dtpsv_NLU.$(SUFFIX) dtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtpsv_NLN.$(SUFFIX) dtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h
+dtpsv_NLN.$(SUFFIX) dtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtpsv_TUU.$(SUFFIX) dtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h
+dtpsv_TUU.$(SUFFIX) dtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtpsv_TUN.$(SUFFIX) dtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h
+dtpsv_TUN.$(SUFFIX) dtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtpsv_NUU.$(SUFFIX) qtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h
+qtpsv_NUU.$(SUFFIX) qtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtpsv_NUN.$(SUFFIX) qtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h
+qtpsv_NUN.$(SUFFIX) qtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtpsv_TLU.$(SUFFIX) qtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h
+qtpsv_TLU.$(SUFFIX) qtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtpsv_TLN.$(SUFFIX) qtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h
+qtpsv_TLN.$(SUFFIX) qtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtpsv_NLU.$(SUFFIX) qtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h
+qtpsv_NLU.$(SUFFIX) qtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtpsv_NLN.$(SUFFIX) qtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h
+qtpsv_NLN.$(SUFFIX) qtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtpsv_TUU.$(SUFFIX) qtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h
+qtpsv_TUU.$(SUFFIX) qtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtpsv_TUN.$(SUFFIX) qtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h
+qtpsv_TUN.$(SUFFIX) qtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-ctpsv_NUU.$(SUFFIX) ctpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ctpsv_NUU.$(SUFFIX) ctpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-ctpsv_NUN.$(SUFFIX) ctpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ctpsv_NUN.$(SUFFIX) ctpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-ctpsv_TLU.$(SUFFIX) ctpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ctpsv_TLU.$(SUFFIX) ctpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-ctpsv_TLN.$(SUFFIX) ctpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ctpsv_TLN.$(SUFFIX) ctpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-ctpsv_RLU.$(SUFFIX) ctpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ctpsv_RLU.$(SUFFIX) ctpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-ctpsv_RLN.$(SUFFIX) ctpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ctpsv_RLN.$(SUFFIX) ctpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-ctpsv_CLU.$(SUFFIX) ctpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ctpsv_CLU.$(SUFFIX) ctpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-ctpsv_CLN.$(SUFFIX) ctpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ctpsv_CLN.$(SUFFIX) ctpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-ctpsv_NLU.$(SUFFIX) ctpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ctpsv_NLU.$(SUFFIX) ctpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-ctpsv_NLN.$(SUFFIX) ctpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ctpsv_NLN.$(SUFFIX) ctpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-ctpsv_TUU.$(SUFFIX) ctpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ctpsv_TUU.$(SUFFIX) ctpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-ctpsv_TUN.$(SUFFIX) ctpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ctpsv_TUN.$(SUFFIX) ctpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-ctpsv_RUU.$(SUFFIX) ctpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ctpsv_RUU.$(SUFFIX) ctpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-ctpsv_RUN.$(SUFFIX) ctpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ctpsv_RUN.$(SUFFIX) ctpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-ctpsv_CUU.$(SUFFIX) ctpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ctpsv_CUU.$(SUFFIX) ctpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-ctpsv_CUN.$(SUFFIX) ctpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ctpsv_CUN.$(SUFFIX) ctpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-ztpsv_NUU.$(SUFFIX) ztpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ztpsv_NUU.$(SUFFIX) ztpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-ztpsv_NUN.$(SUFFIX) ztpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ztpsv_NUN.$(SUFFIX) ztpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-ztpsv_TLU.$(SUFFIX) ztpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ztpsv_TLU.$(SUFFIX) ztpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-ztpsv_TLN.$(SUFFIX) ztpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ztpsv_TLN.$(SUFFIX) ztpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-ztpsv_RLU.$(SUFFIX) ztpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ztpsv_RLU.$(SUFFIX) ztpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-ztpsv_RLN.$(SUFFIX) ztpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ztpsv_RLN.$(SUFFIX) ztpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-ztpsv_CLU.$(SUFFIX) ztpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ztpsv_CLU.$(SUFFIX) ztpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-ztpsv_CLN.$(SUFFIX) ztpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ztpsv_CLN.$(SUFFIX) ztpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-ztpsv_NLU.$(SUFFIX) ztpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ztpsv_NLU.$(SUFFIX) ztpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-ztpsv_NLN.$(SUFFIX) ztpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ztpsv_NLN.$(SUFFIX) ztpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-ztpsv_TUU.$(SUFFIX) ztpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ztpsv_TUU.$(SUFFIX) ztpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-ztpsv_TUN.$(SUFFIX) ztpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ztpsv_TUN.$(SUFFIX) ztpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-ztpsv_RUU.$(SUFFIX) ztpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ztpsv_RUU.$(SUFFIX) ztpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-ztpsv_RUN.$(SUFFIX) ztpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+ztpsv_RUN.$(SUFFIX) ztpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-ztpsv_CUU.$(SUFFIX) ztpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ztpsv_CUU.$(SUFFIX) ztpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-ztpsv_CUN.$(SUFFIX) ztpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+ztpsv_CUN.$(SUFFIX) ztpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-xtpsv_NUU.$(SUFFIX) xtpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+xtpsv_NUU.$(SUFFIX) xtpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-xtpsv_NUN.$(SUFFIX) xtpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+xtpsv_NUN.$(SUFFIX) xtpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-xtpsv_TLU.$(SUFFIX) xtpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+xtpsv_TLU.$(SUFFIX) xtpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-xtpsv_TLN.$(SUFFIX) xtpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+xtpsv_TLN.$(SUFFIX) xtpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-xtpsv_RLU.$(SUFFIX) xtpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+xtpsv_RLU.$(SUFFIX) xtpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-xtpsv_RLN.$(SUFFIX) xtpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+xtpsv_RLN.$(SUFFIX) xtpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-xtpsv_CLU.$(SUFFIX) xtpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+xtpsv_CLU.$(SUFFIX) xtpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-xtpsv_CLN.$(SUFFIX) xtpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+xtpsv_CLN.$(SUFFIX) xtpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-xtpsv_NLU.$(SUFFIX) xtpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+xtpsv_NLU.$(SUFFIX) xtpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-xtpsv_NLN.$(SUFFIX) xtpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+xtpsv_NLN.$(SUFFIX) xtpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-xtpsv_TUU.$(SUFFIX) xtpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+xtpsv_TUU.$(SUFFIX) xtpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-xtpsv_TUN.$(SUFFIX) xtpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+xtpsv_TUN.$(SUFFIX) xtpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-xtpsv_RUU.$(SUFFIX) xtpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
+xtpsv_RUU.$(SUFFIX) xtpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-xtpsv_RUN.$(SUFFIX) xtpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
+xtpsv_RUN.$(SUFFIX) xtpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-xtpsv_CUU.$(SUFFIX) xtpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
+xtpsv_CUU.$(SUFFIX) xtpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-xtpsv_CUN.$(SUFFIX) xtpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
+xtpsv_CUN.$(SUFFIX) xtpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-strmv_NUU.$(SUFFIX) strmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h
+strmv_NUU.$(SUFFIX) strmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-strmv_NUN.$(SUFFIX) strmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h
+strmv_NUN.$(SUFFIX) strmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-strmv_TLU.$(SUFFIX) strmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h
+strmv_TLU.$(SUFFIX) strmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-strmv_TLN.$(SUFFIX) strmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h
+strmv_TLN.$(SUFFIX) strmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-strmv_NLU.$(SUFFIX) strmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h
+strmv_NLU.$(SUFFIX) strmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-strmv_NLN.$(SUFFIX) strmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h
+strmv_NLN.$(SUFFIX) strmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-strmv_TUU.$(SUFFIX) strmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h
+strmv_TUU.$(SUFFIX) strmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-strmv_TUN.$(SUFFIX) strmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h
+strmv_TUN.$(SUFFIX) strmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtrmv_NUU.$(SUFFIX) dtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h
+dtrmv_NUU.$(SUFFIX) dtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtrmv_NUN.$(SUFFIX) dtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h
+dtrmv_NUN.$(SUFFIX) dtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtrmv_TLU.$(SUFFIX) dtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h
+dtrmv_TLU.$(SUFFIX) dtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtrmv_TLN.$(SUFFIX) dtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h
+dtrmv_TLN.$(SUFFIX) dtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtrmv_NLU.$(SUFFIX) dtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h
+dtrmv_NLU.$(SUFFIX) dtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtrmv_NLN.$(SUFFIX) dtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h
+dtrmv_NLN.$(SUFFIX) dtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtrmv_TUU.$(SUFFIX) dtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h
+dtrmv_TUU.$(SUFFIX) dtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtrmv_TUN.$(SUFFIX) dtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h
+dtrmv_TUN.$(SUFFIX) dtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtrmv_NUU.$(SUFFIX) qtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h
+qtrmv_NUU.$(SUFFIX) qtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtrmv_NUN.$(SUFFIX) qtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h
+qtrmv_NUN.$(SUFFIX) qtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtrmv_TLU.$(SUFFIX) qtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h
+qtrmv_TLU.$(SUFFIX) qtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtrmv_TLN.$(SUFFIX) qtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h
+qtrmv_TLN.$(SUFFIX) qtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtrmv_NLU.$(SUFFIX) qtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h
+qtrmv_NLU.$(SUFFIX) qtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtrmv_NLN.$(SUFFIX) qtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h
+qtrmv_NLN.$(SUFFIX) qtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtrmv_TUU.$(SUFFIX) qtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h
+qtrmv_TUU.$(SUFFIX) qtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtrmv_TUN.$(SUFFIX) qtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h
+qtrmv_TUN.$(SUFFIX) qtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-ctrmv_NUU.$(SUFFIX) ctrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ctrmv_NUU.$(SUFFIX) ctrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ctrmv_NUN.$(SUFFIX) ctrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ctrmv_NUN.$(SUFFIX) ctrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ctrmv_TLU.$(SUFFIX) ctrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ctrmv_TLU.$(SUFFIX) ctrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ctrmv_TLN.$(SUFFIX) ctrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ctrmv_TLN.$(SUFFIX) ctrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ctrmv_RLU.$(SUFFIX) ctrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ctrmv_RLU.$(SUFFIX) ctrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ctrmv_RLN.$(SUFFIX) ctrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ctrmv_RLN.$(SUFFIX) ctrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ctrmv_CLU.$(SUFFIX) ctrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ctrmv_CLU.$(SUFFIX) ctrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ctrmv_CLN.$(SUFFIX) ctrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ctrmv_CLN.$(SUFFIX) ctrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ctrmv_NLU.$(SUFFIX) ctrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ctrmv_NLU.$(SUFFIX) ctrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ctrmv_NLN.$(SUFFIX) ctrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ctrmv_NLN.$(SUFFIX) ctrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ctrmv_TUU.$(SUFFIX) ctrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ctrmv_TUU.$(SUFFIX) ctrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ctrmv_TUN.$(SUFFIX) ctrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ctrmv_TUN.$(SUFFIX) ctrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ctrmv_RUU.$(SUFFIX) ctrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ctrmv_RUU.$(SUFFIX) ctrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ctrmv_RUN.$(SUFFIX) ctrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ctrmv_RUN.$(SUFFIX) ctrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ctrmv_CUU.$(SUFFIX) ctrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ctrmv_CUU.$(SUFFIX) ctrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ctrmv_CUN.$(SUFFIX) ctrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ctrmv_CUN.$(SUFFIX) ctrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ztrmv_NUU.$(SUFFIX) ztrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ztrmv_NUU.$(SUFFIX) ztrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ztrmv_NUN.$(SUFFIX) ztrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ztrmv_NUN.$(SUFFIX) ztrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ztrmv_TLU.$(SUFFIX) ztrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ztrmv_TLU.$(SUFFIX) ztrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ztrmv_TLN.$(SUFFIX) ztrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ztrmv_TLN.$(SUFFIX) ztrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ztrmv_RLU.$(SUFFIX) ztrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ztrmv_RLU.$(SUFFIX) ztrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ztrmv_RLN.$(SUFFIX) ztrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ztrmv_RLN.$(SUFFIX) ztrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ztrmv_CLU.$(SUFFIX) ztrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ztrmv_CLU.$(SUFFIX) ztrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ztrmv_CLN.$(SUFFIX) ztrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ztrmv_CLN.$(SUFFIX) ztrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-ztrmv_NLU.$(SUFFIX) ztrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ztrmv_NLU.$(SUFFIX) ztrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-ztrmv_NLN.$(SUFFIX) ztrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ztrmv_NLN.$(SUFFIX) ztrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-ztrmv_TUU.$(SUFFIX) ztrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ztrmv_TUU.$(SUFFIX) ztrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-ztrmv_TUN.$(SUFFIX) ztrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ztrmv_TUN.$(SUFFIX) ztrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-ztrmv_RUU.$(SUFFIX) ztrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ztrmv_RUU.$(SUFFIX) ztrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-ztrmv_RUN.$(SUFFIX) ztrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+ztrmv_RUN.$(SUFFIX) ztrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-ztrmv_CUU.$(SUFFIX) ztrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ztrmv_CUU.$(SUFFIX) ztrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-ztrmv_CUN.$(SUFFIX) ztrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+ztrmv_CUN.$(SUFFIX) ztrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-xtrmv_NUU.$(SUFFIX) xtrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+xtrmv_NUU.$(SUFFIX) xtrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-xtrmv_NUN.$(SUFFIX) xtrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+xtrmv_NUN.$(SUFFIX) xtrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-xtrmv_TLU.$(SUFFIX) xtrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+xtrmv_TLU.$(SUFFIX) xtrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-xtrmv_TLN.$(SUFFIX) xtrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+xtrmv_TLN.$(SUFFIX) xtrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-xtrmv_RLU.$(SUFFIX) xtrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+xtrmv_RLU.$(SUFFIX) xtrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-xtrmv_RLN.$(SUFFIX) xtrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+xtrmv_RLN.$(SUFFIX) xtrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-xtrmv_CLU.$(SUFFIX) xtrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+xtrmv_CLU.$(SUFFIX) xtrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-xtrmv_CLN.$(SUFFIX) xtrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+xtrmv_CLN.$(SUFFIX) xtrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-xtrmv_NLU.$(SUFFIX) xtrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+xtrmv_NLU.$(SUFFIX) xtrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F)
-xtrmv_NLN.$(SUFFIX) xtrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+xtrmv_NLN.$(SUFFIX) xtrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F)
-xtrmv_TUU.$(SUFFIX) xtrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+xtrmv_TUU.$(SUFFIX) xtrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F)
-xtrmv_TUN.$(SUFFIX) xtrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+xtrmv_TUN.$(SUFFIX) xtrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F)
-xtrmv_RUU.$(SUFFIX) xtrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
+xtrmv_RUU.$(SUFFIX) xtrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F)
-xtrmv_RUN.$(SUFFIX) xtrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
+xtrmv_RUN.$(SUFFIX) xtrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F)
-xtrmv_CUU.$(SUFFIX) xtrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
+xtrmv_CUU.$(SUFFIX) xtrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F)
-xtrmv_CUN.$(SUFFIX) xtrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
+xtrmv_CUN.$(SUFFIX) xtrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F)
-strmv_thread_NUU.$(SUFFIX) strmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+strmv_thread_NUU.$(SUFFIX) strmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-strmv_thread_NUN.$(SUFFIX) strmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+strmv_thread_NUN.$(SUFFIX) strmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-strmv_thread_TLU.$(SUFFIX) strmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+strmv_thread_TLU.$(SUFFIX) strmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-strmv_thread_TLN.$(SUFFIX) strmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+strmv_thread_TLN.$(SUFFIX) strmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-strmv_thread_NLU.$(SUFFIX) strmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+strmv_thread_NLU.$(SUFFIX) strmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-strmv_thread_NLN.$(SUFFIX) strmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+strmv_thread_NLN.$(SUFFIX) strmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-strmv_thread_TUU.$(SUFFIX) strmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+strmv_thread_TUU.$(SUFFIX) strmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-strmv_thread_TUN.$(SUFFIX) strmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+strmv_thread_TUN.$(SUFFIX) strmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-dtrmv_thread_NUN.$(SUFFIX) dtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+dtrmv_thread_NUN.$(SUFFIX) dtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-dtrmv_thread_TLN.$(SUFFIX) dtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+dtrmv_thread_TLN.$(SUFFIX) dtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-dtrmv_thread_NLN.$(SUFFIX) dtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+dtrmv_thread_NLN.$(SUFFIX) dtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-dtrmv_thread_TUN.$(SUFFIX) dtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+dtrmv_thread_TUN.$(SUFFIX) dtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F)
-qtrmv_thread_NUN.$(SUFFIX) qtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+qtrmv_thread_NUN.$(SUFFIX) qtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F)
-qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F)
-qtrmv_thread_TLN.$(SUFFIX) qtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+qtrmv_thread_TLN.$(SUFFIX) qtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F)
-qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F)
-qtrmv_thread_NLN.$(SUFFIX) qtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+qtrmv_thread_NLN.$(SUFFIX) qtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F)
-qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F)
-qtrmv_thread_TUN.$(SUFFIX) qtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+qtrmv_thread_TUN.$(SUFFIX) qtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F)
-ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ctrmv_thread_NUN.$(SUFFIX) ctrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_NUN.$(SUFFIX) ctrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ctrmv_thread_TLN.$(SUFFIX) ctrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_TLN.$(SUFFIX) ctrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ctrmv_thread_RLN.$(SUFFIX) ctrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_RLN.$(SUFFIX) ctrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ctrmv_thread_CLN.$(SUFFIX) ctrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_CLN.$(SUFFIX) ctrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ctrmv_thread_NLN.$(SUFFIX) ctrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_NLN.$(SUFFIX) ctrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ctrmv_thread_TUN.$(SUFFIX) ctrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_TUN.$(SUFFIX) ctrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ctrmv_thread_RUN.$(SUFFIX) ctrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_RUN.$(SUFFIX) ctrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ctrmv_thread_CUN.$(SUFFIX) ctrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ctrmv_thread_CUN.$(SUFFIX) ctrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ztrmv_thread_NUN.$(SUFFIX) ztrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_NUN.$(SUFFIX) ztrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ztrmv_thread_TLN.$(SUFFIX) ztrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_TLN.$(SUFFIX) ztrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ztrmv_thread_RLN.$(SUFFIX) ztrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_RLN.$(SUFFIX) ztrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ztrmv_thread_CLN.$(SUFFIX) ztrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_CLN.$(SUFFIX) ztrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-ztrmv_thread_NLN.$(SUFFIX) ztrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_NLN.$(SUFFIX) ztrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-ztrmv_thread_TUN.$(SUFFIX) ztrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_TUN.$(SUFFIX) ztrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-ztrmv_thread_RUN.$(SUFFIX) ztrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_RUN.$(SUFFIX) ztrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-ztrmv_thread_CUN.$(SUFFIX) ztrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+ztrmv_thread_CUN.$(SUFFIX) ztrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-xtrmv_thread_NUN.$(SUFFIX) xtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_NUN.$(SUFFIX) xtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-xtrmv_thread_TLN.$(SUFFIX) xtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_TLN.$(SUFFIX) xtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-xtrmv_thread_RLN.$(SUFFIX) xtrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_RLN.$(SUFFIX) xtrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-xtrmv_thread_CLN.$(SUFFIX) xtrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_CLN.$(SUFFIX) xtrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F)
-xtrmv_thread_NLN.$(SUFFIX) xtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_NLN.$(SUFFIX) xtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F)
-xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F)
-xtrmv_thread_TUN.$(SUFFIX) xtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_TUN.$(SUFFIX) xtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F)
-xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F)
-xtrmv_thread_RUN.$(SUFFIX) xtrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_RUN.$(SUFFIX) xtrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F)
-xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F)
-xtrmv_thread_CUN.$(SUFFIX) xtrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h
+xtrmv_thread_CUN.$(SUFFIX) xtrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F)
-strsv_NUU.$(SUFFIX) strsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h
+strsv_NUU.$(SUFFIX) strsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-strsv_NUN.$(SUFFIX) strsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h
+strsv_NUN.$(SUFFIX) strsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-strsv_TLU.$(SUFFIX) strsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h
+strsv_TLU.$(SUFFIX) strsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-strsv_TLN.$(SUFFIX) strsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h
+strsv_TLN.$(SUFFIX) strsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-strsv_NLU.$(SUFFIX) strsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h
+strsv_NLU.$(SUFFIX) strsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-strsv_NLN.$(SUFFIX) strsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h
+strsv_NLN.$(SUFFIX) strsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-strsv_TUU.$(SUFFIX) strsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h
+strsv_TUU.$(SUFFIX) strsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-strsv_TUN.$(SUFFIX) strsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h
+strsv_TUN.$(SUFFIX) strsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtrsv_NUU.$(SUFFIX) dtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h
+dtrsv_NUU.$(SUFFIX) dtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtrsv_NUN.$(SUFFIX) dtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h
+dtrsv_NUN.$(SUFFIX) dtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtrsv_TLU.$(SUFFIX) dtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h
+dtrsv_TLU.$(SUFFIX) dtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtrsv_TLN.$(SUFFIX) dtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h
+dtrsv_TLN.$(SUFFIX) dtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-dtrsv_NLU.$(SUFFIX) dtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h
+dtrsv_NLU.$(SUFFIX) dtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-dtrsv_NLN.$(SUFFIX) dtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h
+dtrsv_NLN.$(SUFFIX) dtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-dtrsv_TUU.$(SUFFIX) dtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h
+dtrsv_TUU.$(SUFFIX) dtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-dtrsv_TUN.$(SUFFIX) dtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h
+dtrsv_TUN.$(SUFFIX) dtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtrsv_NUU.$(SUFFIX) qtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h
+qtrsv_NUU.$(SUFFIX) qtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtrsv_NUN.$(SUFFIX) qtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h
+qtrsv_NUN.$(SUFFIX) qtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtrsv_TLU.$(SUFFIX) qtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h
+qtrsv_TLU.$(SUFFIX) qtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtrsv_TLN.$(SUFFIX) qtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h
+qtrsv_TLN.$(SUFFIX) qtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-qtrsv_NLU.$(SUFFIX) qtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h
+qtrsv_NLU.$(SUFFIX) qtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F)
-qtrsv_NLN.$(SUFFIX) qtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h
+qtrsv_NLN.$(SUFFIX) qtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F)
-qtrsv_TUU.$(SUFFIX) qtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h
+qtrsv_TUU.$(SUFFIX) qtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F)
-qtrsv_TUN.$(SUFFIX) qtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h
+qtrsv_TUN.$(SUFFIX) qtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F)
-ctrsv_NUU.$(SUFFIX) ctrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ctrsv_NUU.$(SUFFIX) ctrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-ctrsv_NUN.$(SUFFIX) ctrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ctrsv_NUN.$(SUFFIX) ctrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-ctrsv_TLU.$(SUFFIX) ctrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ctrsv_TLU.$(SUFFIX) ctrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-ctrsv_TLN.$(SUFFIX) ctrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ctrsv_TLN.$(SUFFIX) ctrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-ctrsv_RLU.$(SUFFIX) ctrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ctrsv_RLU.$(SUFFIX) ctrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-ctrsv_RLN.$(SUFFIX) ctrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ctrsv_RLN.$(SUFFIX) ctrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-ctrsv_CLU.$(SUFFIX) ctrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ctrsv_CLU.$(SUFFIX) ctrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-ctrsv_CLN.$(SUFFIX) ctrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ctrsv_CLN.$(SUFFIX) ctrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-ctrsv_NLU.$(SUFFIX) ctrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ctrsv_NLU.$(SUFFIX) ctrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-ctrsv_NLN.$(SUFFIX) ctrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ctrsv_NLN.$(SUFFIX) ctrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-ctrsv_TUU.$(SUFFIX) ctrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ctrsv_TUU.$(SUFFIX) ctrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-ctrsv_TUN.$(SUFFIX) ctrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ctrsv_TUN.$(SUFFIX) ctrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-ctrsv_RUU.$(SUFFIX) ctrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ctrsv_RUU.$(SUFFIX) ctrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-ctrsv_RUN.$(SUFFIX) ctrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ctrsv_RUN.$(SUFFIX) ctrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-ctrsv_CUU.$(SUFFIX) ctrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ctrsv_CUU.$(SUFFIX) ctrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-ctrsv_CUN.$(SUFFIX) ctrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ctrsv_CUN.$(SUFFIX) ctrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-ztrsv_NUU.$(SUFFIX) ztrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ztrsv_NUU.$(SUFFIX) ztrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-ztrsv_NUN.$(SUFFIX) ztrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ztrsv_NUN.$(SUFFIX) ztrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-ztrsv_TLU.$(SUFFIX) ztrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ztrsv_TLU.$(SUFFIX) ztrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-ztrsv_TLN.$(SUFFIX) ztrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ztrsv_TLN.$(SUFFIX) ztrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-ztrsv_RLU.$(SUFFIX) ztrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ztrsv_RLU.$(SUFFIX) ztrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-ztrsv_RLN.$(SUFFIX) ztrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ztrsv_RLN.$(SUFFIX) ztrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-ztrsv_CLU.$(SUFFIX) ztrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ztrsv_CLU.$(SUFFIX) ztrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-ztrsv_CLN.$(SUFFIX) ztrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ztrsv_CLN.$(SUFFIX) ztrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-ztrsv_NLU.$(SUFFIX) ztrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ztrsv_NLU.$(SUFFIX) ztrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-ztrsv_NLN.$(SUFFIX) ztrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ztrsv_NLN.$(SUFFIX) ztrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-ztrsv_TUU.$(SUFFIX) ztrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ztrsv_TUU.$(SUFFIX) ztrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-ztrsv_TUN.$(SUFFIX) ztrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ztrsv_TUN.$(SUFFIX) ztrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-ztrsv_RUU.$(SUFFIX) ztrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ztrsv_RUU.$(SUFFIX) ztrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-ztrsv_RUN.$(SUFFIX) ztrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+ztrsv_RUN.$(SUFFIX) ztrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-ztrsv_CUU.$(SUFFIX) ztrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ztrsv_CUU.$(SUFFIX) ztrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-ztrsv_CUN.$(SUFFIX) ztrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+ztrsv_CUN.$(SUFFIX) ztrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-xtrsv_NUU.$(SUFFIX) xtrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+xtrsv_NUU.$(SUFFIX) xtrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-xtrsv_NUN.$(SUFFIX) xtrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+xtrsv_NUN.$(SUFFIX) xtrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-xtrsv_TLU.$(SUFFIX) xtrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+xtrsv_TLU.$(SUFFIX) xtrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-xtrsv_TLN.$(SUFFIX) xtrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+xtrsv_TLN.$(SUFFIX) xtrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-xtrsv_RLU.$(SUFFIX) xtrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+xtrsv_RLU.$(SUFFIX) xtrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-xtrsv_RLN.$(SUFFIX) xtrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+xtrsv_RLN.$(SUFFIX) xtrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-xtrsv_CLU.$(SUFFIX) xtrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+xtrsv_CLU.$(SUFFIX) xtrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-xtrsv_CLN.$(SUFFIX) xtrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+xtrsv_CLN.$(SUFFIX) xtrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
-xtrsv_NLU.$(SUFFIX) xtrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+xtrsv_NLU.$(SUFFIX) xtrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F)
-xtrsv_NLN.$(SUFFIX) xtrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+xtrsv_NLN.$(SUFFIX) xtrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F)
-xtrsv_TUU.$(SUFFIX) xtrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+xtrsv_TUU.$(SUFFIX) xtrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F)
-xtrsv_TUN.$(SUFFIX) xtrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+xtrsv_TUN.$(SUFFIX) xtrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F)
-xtrsv_RUU.$(SUFFIX) xtrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
+xtrsv_RUU.$(SUFFIX) xtrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F)
-xtrsv_RUN.$(SUFFIX) xtrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
+xtrsv_RUN.$(SUFFIX) xtrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F)
-xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
+xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F)
-xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
+xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
include ../../Makefile.tail
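
A note on the Makefile rules that end here: every trmv/trsv object above is compiled from one generic source file, with the variant encoded entirely in preprocessor flags. The suffix letters visible in the target names map onto those flags -- transpose N/T/R/C becomes -DTRANSA=1..4 in the complex rules (or -UTRANSA/-DTRANSA plus the choice of trsv_U.c vs trsv_L.c in the real ones), the triangle U/L becomes -ULOWER/-DLOWER, and the diagonal U/N becomes -DUNIT/-UUNIT. The sketch below illustrates the build convention only; it is not the actual OpenBLAS kernel source, and trsv_sketch is a placeholder name.

  /* sketch.c -- illustrative only, not the ztrsv_U.c / trsv_L.c source.
   * One translation unit yields many objects, each variant selected by
   * -D/-U flags, e.g.:
   *   cc -c -DDOUBLE -DLOWER -UUNIT sketch.c -o trsv_lower_nonunit.o
   *   cc -c -UDOUBLE -ULOWER -DUNIT sketch.c -o trsv_upper_unit.o      */
  #include <stddef.h>

  #ifdef DOUBLE
  typedef double FLOAT;
  #else
  typedef float FLOAT;
  #endif

  /* Solve A x = b in place for a triangular, column-major A (no transpose).
   * LOWER : the lower triangle is stored (otherwise the upper one)
   * UNIT  : the diagonal is implicitly one and is never read              */
  int trsv_sketch(size_t n, const FLOAT *a, size_t lda, FLOAT *b)
  {
  #ifdef LOWER
    for (size_t i = 0; i < n; i++) {          /* forward substitution  */
  #ifndef UNIT
      b[i] /= a[i + i * lda];
  #endif
      for (size_t j = i + 1; j < n; j++)
        b[j] -= a[j + i * lda] * b[i];
    }
  #else
    for (size_t i = n; i-- > 0; ) {           /* backward substitution */
  #ifndef UNIT
      b[i] /= a[i + i * lda];
  #endif
      for (size_t j = 0; j < i; j++)
        b[j] -= a[j + i * lda] * b[i];
    }
  #endif
    return 0;
  }

The real kernels add the TRANSA switch (1 = no transpose, 2 = transpose, 3 = conjugate, 4 = conjugate transpose) on top of this, which is why each complex source file above expands to sixteen objects per precision.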
diff --git a/driver/level2/gbmv_k.c b/driver/level2/gbmv_k.c
index 317d420..4b29d70 100644
--- a/driver/level2/gbmv_k.c
+++ b/driver/level2/gbmv_k.c
@@ -84,12 +84,12 @@ void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha,
#ifndef TRANS
AXPYU_K(length, 0, 0,
- alpha * X[i],
+ alpha * X[i],
a + start, 1, Y + start - offset_u, 1, NULL, 0);
#else
Y[i] += alpha * DOTU_K(length, a + start, 1, X + start - offset_u, 1);
#endif
-
+
offset_u --;
offset_l --;
diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c
index 18aae26..9efe170 100644
--- a/driver/level2/gbmv_thread.c
+++ b/driver/level2/gbmv_thread.c
@@ -105,13 +105,13 @@ static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
args -> m,
#else
args -> n,
-#endif
- 0, 0, ZERO,
+#endif
+ 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y, 1, NULL, 0, NULL, 0);
-
+ y, 1, NULL, 0, NULL, 0);
+
offset_u = ku - n_from;
offset_l = ku - n_from + args -> m;
@@ -157,7 +157,7 @@ static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
x += COMPSIZE;
#endif
-
+
y += COMPSIZE;
offset_u --;
@@ -190,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -198,27 +198,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
args.n = n;
-
+
args.a = (void *)a;
args.b = (void *)x;
args.c = (void *)buffer;
-
+
args.lda = lda;
args.ldb = incx;
args.ldc = ku;
args.ldd = kl;
num_cpu = 0;
-
+
range_n[0] = 0;
i = n;
-
+
while (i > 0){
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
@@ -227,7 +227,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
if (i < width) width = i;
range_n[num_cpu + 1] = range_n[num_cpu] + width;
-
+
#ifndef TRANSA
range_m[num_cpu] = num_cpu * ((m + 15) & ~15);
#else
@@ -242,7 +242,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i -= width;
}
@@ -254,12 +254,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
#else
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
#endif
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
for (i = 1; i < num_cpu; i ++) {
AXPYU_K(
#ifndef TRANSA
diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c
index 5f8abf2..ddd4753 100644
--- a/driver/level2/gemv_thread.c
+++ b/driver/level2/gemv_thread.c
@@ -110,7 +110,7 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
*((FLOAT *)args -> alpha + 1),
#endif
a, lda, x, incx, y, incy, buffer);
-
+
return 0;
}
@@ -134,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -142,17 +142,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
args.n = n;
-
+
args.a = (void *)a;
args.b = (void *)x;
args.c = (void *)y;
-
+
args.lda = lda;
args.ldb = incx;
args.ldc = incy;
@@ -164,14 +164,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#endif
num_cpu = 0;
-
+
range[0] = 0;
#ifndef TRANSA
i = m;
#else
i = n;
#endif
-
+
while (i > 0){
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
@@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
if (i < width) width = i;
range[num_cpu + 1] = range[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = gemv_kernel;
queue[num_cpu].args = &args;
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i -= width;
}
@@ -202,9 +202,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
queue[0].sa = NULL;
queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
return 0;
}
diff --git a/driver/level2/ger_thread.c b/driver/level2/ger_thread.c
index 9e2f520..0a5e14c 100644
--- a/driver/level2/ger_thread.c
+++ b/driver/level2/ger_thread.c
@@ -102,7 +102,7 @@ static int ger_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#endif
#endif
x, 1, a, 1, NULL, 0);
-
+
y += incy * COMPSIZE;
a += lda * COMPSIZE;
}
@@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -138,17 +138,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
args.n = n;
-
+
args.a = (void *)x;
args.b = (void *)y;
args.c = (void *)a;
-
+
args.lda = incx;
args.ldb = incy;
args.ldc = lda;
@@ -160,18 +160,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *
#endif
num_cpu = 0;
-
+
range_n[0] = 0;
i = n;
-
+
while (i > 0){
-
+
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
if (width < 4) width = 4;
if (i < width) width = i;
range_n[num_cpu + 1] = range_n[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = ger_kernel;
queue[num_cpu].args = &args;
@@ -179,19 +179,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i -= width;
}
-
+
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
return 0;
}
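
The gemv/ger threading hunks above all build their task queue with the same splitting idiom: divide the remaining columns by the remaining threads (rounding up), clamp to a small minimum width, record the running offsets in range[], and stop early if the work runs out. A stand-alone sketch of that idiom follows; it uses a plain ceiling division where the drivers call blas_quickdivide, and the minimum width of 4 mirrors the clamp in the hunks.

  /* Sketch of the range-splitting loop used by the *_thread drivers above.
   * Plain ceiling division stands in for blas_quickdivide(); MIN_WIDTH
   * mirrors the "if (width < 4) width = 4" clamp in the hunks.            */
  #include <stdio.h>

  #define MIN_WIDTH 4

  static int split_ranges(long n, int nthreads, long *range /* nthreads+1 */)
  {
    int num_cpu = 0;
    long i = n;

    range[0] = 0;
    while (i > 0) {
      /* ceil(i / remaining threads) */
      long width = (i + nthreads - num_cpu - 1) / (nthreads - num_cpu);
      if (width < MIN_WIDTH) width = MIN_WIDTH;
      if (i < width) width = i;

      range[num_cpu + 1] = range[num_cpu] + width;
      num_cpu++;
      i -= width;
    }
    return num_cpu;              /* threads actually used (may be < nthreads) */
  }

  int main(void)
  {
    long range[9];
    int used = split_ranges(1000, 8, range);
    for (int t = 0; t < used; t++)
      printf("thread %d: columns [%ld, %ld)\n", t, range[t], range[t + 1]);
    return 0;
  }

Because the division is recomputed against the threads still unassigned, the slices stay balanced even when the clamp or the short tail shrinks the pool of remaining work.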
diff --git a/driver/level2/sbmv_k.c b/driver/level2/sbmv_k.c
index d0adc67..ef7fa37 100644
--- a/driver/level2/sbmv_k.c
+++ b/driver/level2/sbmv_k.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha,
if (length > k) length = k;
AXPYU_K(length + 1, 0, 0,
- alpha * X[i],
+ alpha * X[i],
a + k - length, 1, Y + i - length, 1, NULL, 0);
Y[i] += alpha * DOTU_K(length, a + k - length, 1, X + i - length, 1);
#else
@@ -80,11 +80,11 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha,
if (n - i - 1 < k) length = n - i - 1;
AXPYU_K(length + 1, 0, 0,
- alpha * X[i],
+ alpha * X[i],
a, 1, Y + i, 1, NULL, 0);
Y[i] += alpha * DOTU_K(length, a + 1, 1, X + i + 1, 1);
#endif
-
+
a += lda;
}
diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c
index 7dfabfa..5b7fc73 100644
--- a/driver/level2/sbmv_thread.c
+++ b/driver/level2/sbmv_thread.c
@@ -76,7 +76,7 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
n_to = n;
//Use y as each thread's n* COMPSIZE elements in sb buffer
- y = buffer;
+ y = buffer;
buffer += ((COMPSIZE * n + 1023) & ~1023);
if (range_m) {
@@ -94,12 +94,12 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
buffer += ((COMPSIZE * n + 1023) & ~1023);
}
- SCAL_K(n, 0, 0, ZERO,
+ SCAL_K(n, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y, 1, NULL, 0, NULL, 0);
-
+ y, 1, NULL, 0, NULL, 0);
+
for (i = n_from; i < n_to; i++) {
#ifndef LOWER
@@ -193,7 +193,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -201,52 +201,52 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.n = n;
args.k = k;
-
+
args.a = (void *)a;
args.b = (void *)x;
args.c = (void *)buffer;
-
+
args.lda = lda;
args.ldb = incx;
args.ldc = incy;
dnum = (double)n * (double)n / (double)nthreads;
num_cpu = 0;
-
+
if (n < 2 * k) {
#ifndef LOWER
range_m[MAX_CPU_NUMBER] = n;
i = 0;
-
+
while (i < n){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(n - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
} else {
width = n - i;
}
-
+
if (width < 16) width = 16;
if (width > n - i) width = n - i;
-
+
} else {
width = n - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = sbmv_kernel;
queue[num_cpu].args = &args;
@@ -255,37 +255,37 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
-
+
range_m[0] = 0;
i = 0;
-
+
while (i < n){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(n - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
} else {
width = n - i;
}
-
+
if (width < 16) width = 16;
if (width > n - i) width = n - i;
-
+
} else {
width = n - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = sbmv_kernel;
queue[num_cpu].args = &args;
@@ -294,29 +294,29 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
-
+
} else {
-
+
range_m[0] = 0;
i = n;
-
+
while (i > 0){
-
+
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
-
+
if (width < 4) width = 4;
if (i < width) width = i;
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
-
+
range_n[num_cpu] = num_cpu * ((n + 15) & ~15);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = sbmv_kernel;
queue[num_cpu].args = &args;
@@ -325,7 +325,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i -= width;
}
@@ -335,10 +335,10 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
queue[0].sa = NULL;
queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
for (i = 1; i < num_cpu; i ++) {
AXPYU_K(n, 0, 0,
#ifndef COMPLEX
@@ -356,6 +356,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
alpha[0], alpha[1],
#endif
buffer, 1, y, incy, NULL, 0);
-
+
return 0;
}
diff --git a/driver/level2/spmv_k.c b/driver/level2/spmv_k.c
index 07ec660..8ce0abd 100644
--- a/driver/level2/spmv_k.c
+++ b/driver/level2/spmv_k.c
@@ -68,7 +68,7 @@ int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a,
if (i > 0) Y[i] += alpha * DOTU_K(i, a, 1, X, 1);
AXPYU_K(i + 1, 0, 0, alpha * X[i], a, 1, Y, 1, NULL, 0);
a += i + 1;
-
+
#else
Y[i] += alpha * DOTU_K(m - i, a + i, 1, X + i, 1);
if (m - i > 1) AXPYU_K(m - i - 1, 0, 0, alpha * X[i],
diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c
index 7717bbf..93a2f44 100644
--- a/driver/level2/spmv_thread.c
+++ b/driver/level2/spmv_thread.c
@@ -91,17 +91,17 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
}
#ifndef LOWER
- SCAL_K(m_to, 0, 0, ZERO,
+ SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y, 1, NULL, 0, NULL, 0);
+ y, 1, NULL, 0, NULL, 0);
#else
- SCAL_K(args -> m - m_from, 0, 0, ZERO,
+ SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
+ y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif
#ifndef LOWER
@@ -139,7 +139,7 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a, 1, y, 1, NULL, 0);
a += (i + 1) * COMPSIZE;
-
+
#else
#if !defined(HEMV) && !defined(HEMVREV)
result = MYDOT(args -> m - i , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1);
@@ -198,7 +198,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -206,31 +206,31 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
-
+
args.a = (void *)a;
args.b = (void *)x;
args.c = (void *)buffer;
-
+
args.ldb = incx;
args.ldc = incy;
dnum = (double)m * (double)m / (double)nthreads;
num_cpu = 0;
-
+
#ifndef LOWER
range_m[MAX_CPU_NUMBER] = m;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -240,14 +240,14 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = spmv_kernel;
queue[num_cpu].args = &args;
@@ -256,20 +256,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -279,14 +279,14 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = spmv_kernel;
queue[num_cpu].args = &args;
@@ -295,44 +295,44 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
for (i = 1; i < num_cpu; i ++) {
-
+
#ifndef LOWER
-
+
AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
-
+
#else
-
+
AXPYU_K(m - range_m[i], 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0);
-
+
#endif
-
+
}
-
+
AXPYU_K(m, 0, 0,
#ifndef COMPLEX
alpha,
@@ -340,6 +340,6 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
alpha[0], alpha[1],
#endif
buffer, 1, y, incy, NULL, 0);
-
+
return 0;
}
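
The spmv/sbmv drivers (and, further below, symv and syr) split a triangular matrix rather than a rectangle, so equal row counts would give very unequal work. The hunks above instead solve for the strip width that removes a constant area from the remaining triangle: with di = m - i rows left and a per-thread target of dnum = m*m/nthreads, requiring di^2 - (di - w)^2 = dnum gives w = di - sqrt(di^2 - dnum), which is then rounded up to the alignment mask. A compact sketch, with an illustrative MASK value standing in for the architecture-dependent mask in the drivers (link with -lm):

  /* Sketch of the triangular work split used by the spmv/symv/syr
   * *_thread drivers above.  Each thread covers roughly the same area of
   * the triangle, i.e. m*m/(2*nthreads) elements.                        */
  #include <math.h>
  #include <stdio.h>

  #define MASK 7L                     /* round widths up to multiples of 8 */

  int main(void)
  {
    long m = 1000, nthreads = 4, i = 0, width;
    double dnum = (double)m * (double)m / (double)nthreads;
    int num_cpu = 0;

    while (i < m) {
      if (nthreads - num_cpu > 1) {
        double di = (double)(m - i);
        if (di * di - dnum > 0)
          width = ((long)(di - sqrt(di * di - dnum)) + MASK) & ~MASK;
        else
          width = m - i;
        if (width < 16) width = 16;
        if (width > m - i) width = m - i;
      } else {
        width = m - i;                /* last thread takes the rest */
      }
      printf("thread %d: rows [%ld, %ld)\n", num_cpu, i, i + width);
      num_cpu++;
      i += width;
    }
    return 0;
  }

Each printed range then covers roughly m*m/(2*nthreads) matrix elements, with the last thread simply taking whatever remains.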
diff --git a/driver/level2/spr2_k.c b/driver/level2/spr2_k.c
index 58e14eb..e742b24 100644
--- a/driver/level2/spr2_k.c
+++ b/driver/level2/spr2_k.c
@@ -40,7 +40,7 @@
#include <ctype.h>
#include "common.h"
-int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx,
+int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx,
FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){
BLASLONG i;
diff --git a/driver/level2/spr2_thread.c b/driver/level2/spr2_thread.c
index b20eb05..10edb1e 100644
--- a/driver/level2/spr2_thread.c
+++ b/driver/level2/spr2_thread.c
@@ -116,7 +116,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#else
if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
y, 1, a, 1, NULL, 0);
@@ -129,7 +129,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
}
if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1],
alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
@@ -145,7 +145,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef HEMVREV
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
- alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1],
y, 1, a, 1, NULL, 0);
@@ -157,7 +157,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#endif
#else
#ifndef LOWER
- AXPYC_K(i + 1, 0, 0,
+ AXPYC_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
y, 1, a, 1, NULL, 0);
@@ -172,7 +172,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) {
#ifndef HEMVREV
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
@@ -184,7 +184,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#endif
#else
#ifndef LOWER
- AXPYC_K(i + 1, 0, 0,
+ AXPYC_K(i + 1, 0, 0,
alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
- alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
@@ -202,14 +202,14 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
a[ 1] = ZERO;
#endif
#endif
-
+
#ifndef LOWER
a += (i + 1) * COMPSIZE;
#else
a += (args -> m - i) * COMPSIZE;
#endif
}
-
+
return 0;
}
@@ -236,7 +236,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -244,16 +244,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
-
+
args.a = (void *)x;
args.b = (void *)y;
args.c = (void *)a;
-
+
args.lda = incx;
args.ldb = incy;
#ifndef COMPLEX
@@ -264,16 +264,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
dnum = (double)m * (double)m / (double)nthreads;
num_cpu = 0;
-
+
#ifndef LOWER
range_m[MAX_CPU_NUMBER] = m;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -283,13 +283,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = syr_kernel;
queue[num_cpu].args = &args;
@@ -298,20 +298,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -321,13 +321,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = syr_kernel;
queue[num_cpu].args = &args;
@@ -336,21 +336,21 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
return 0;
}
diff --git a/driver/level2/spr_k.c b/driver/level2/spr_k.c
index 996d925..84fb4e8 100644
--- a/driver/level2/spr_k.c
+++ b/driver/level2/spr_k.c
@@ -38,7 +38,7 @@
#include "common.h"
-int CNAME(BLASLONG m, FLOAT alpha_r,
+int CNAME(BLASLONG m, FLOAT alpha_r,
FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){
BLASLONG i;
diff --git a/driver/level2/spr_thread.c b/driver/level2/spr_thread.c
index f889506..4a194cb 100644
--- a/driver/level2/spr_thread.c
+++ b/driver/level2/spr_thread.c
@@ -96,7 +96,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#else
if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
@@ -112,7 +112,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef HEMVREV
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0], - alpha_r * x[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
#else
@@ -122,7 +122,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#endif
#else
#ifndef LOWER
- AXPYC_K(i + 1, 0, 0,
+ AXPYC_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
#else
@@ -145,7 +145,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
a += (args -> m - i) * COMPSIZE;
#endif
}
-
+
return 0;
}
@@ -172,7 +172,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -180,15 +180,15 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
-
+
args.a = (void *)x;
args.b = (void *)a;
-
+
args.lda = incx;
#if !defined(COMPLEX) || defined(HEMV) || defined(HEMVREV)
@@ -199,16 +199,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu
dnum = (double)m * (double)m / (double)nthreads;
num_cpu = 0;
-
+
#ifndef LOWER
range_m[MAX_CPU_NUMBER] = m;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -218,13 +218,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = syr_kernel;
queue[num_cpu].args = &args;
@@ -233,20 +233,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -256,13 +256,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = syr_kernel;
queue[num_cpu].args = &args;
@@ -271,21 +271,21 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
return 0;
}
diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c
index cf0e2d0..95d6c9b 100644
--- a/driver/level2/symv_thread.c
+++ b/driver/level2/symv_thread.c
@@ -78,11 +78,11 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#ifndef LOWER
- SCAL_K(m_to, 0, 0, ZERO,
+ SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y, 1, NULL, 0, NULL, 0);
+ y, 1, NULL, 0, NULL, 0);
MYSYMV_U (m_to, m_to - m_from, ONE,
#ifdef COMPLEX
@@ -92,11 +92,11 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#else
- SCAL_K(args -> m - m_from, 0, 0, ZERO,
+ SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
+ y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
MYSYMV_L (args -> m - m_from, m_to - m_from, ONE,
#ifdef COMPLEX
@@ -132,7 +132,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -140,45 +140,45 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
-
+
args.a = (void *)a;
args.b = (void *)x;
args.c = (void *)buffer;
-
+
args.lda = lda;
args.ldb = incx;
args.ldc = incy;
dnum = (double)m * (double)m / (double)nthreads;
num_cpu = 0;
-
+
#ifndef LOWER
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)i;
width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask;
if (width < 4) width = 4;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
-
+
queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode;
queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel;
queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args;
@@ -187,29 +187,29 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
queue[MAX_CPU_NUMBER - num_cpu - 1].sa = NULL;
queue[MAX_CPU_NUMBER - num_cpu - 1].sb = NULL;
queue[MAX_CPU_NUMBER - num_cpu - 1].next = &queue[MAX_CPU_NUMBER - num_cpu];
-
+
num_cpu ++;
i += width;
}
-
+
if (num_cpu) {
queue[MAX_CPU_NUMBER - num_cpu].sa = NULL;
queue[MAX_CPU_NUMBER - num_cpu].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
-
+
queue[MAX_CPU_NUMBER - 1].next = NULL;
-
+
exec_blas(num_cpu, &queue[MAX_CPU_NUMBER - num_cpu]);
}
-
+
#else
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -219,14 +219,14 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
if (width < 4) width = 4;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = symv_kernel;
queue[num_cpu].args = &args;
@@ -235,32 +235,32 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
#endif
#ifndef LOWER
for (i = 0; i < num_cpu - 1; i ++) {
-
+
AXPYU_K(range_m[i + 1], 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + range_n[i] * COMPSIZE, 1, buffer + range_n[num_cpu - 1] * COMPSIZE, 1, NULL, 0);
- }
+ }
AXPYU_K(m, 0, 0,
#ifndef COMPLEX
@@ -271,12 +271,12 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
buffer + range_n[num_cpu - 1] * COMPSIZE, 1, y, incy, NULL, 0);
#else
-
+
for (i = 1; i < num_cpu; i ++) {
AXPYU_K(m - range_m[i], 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0);
}
@@ -288,8 +288,8 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
alpha[0], alpha[1],
#endif
buffer, 1, y, incy, NULL, 0);
-
+
#endif
-
+
return 0;
}
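
A second pattern shared by the symv/spmv/sbmv hunks: every worker accumulates into its own padded slice of the scratch buffer (offsets kept in range_n[]), and the caller then folds those slices together with plain axpy adds, applying alpha only once at the very end. The sketch below reproduces just that reduction step; the slice layout, the axpy helper and the worker fill-in are stand-ins for the driver's buffer offsets and AXPYU_K calls.

  /* Sketch of the per-thread partial-result reduction used above.
   * Each of NT workers has already filled its own slice part[t][0..M-1];
   * the caller sums the slices and applies alpha once, as the drivers do
   * with AXPYU_K(..., ONE, ...) followed by AXPYU_K(..., alpha, ...).    */
  #include <stdio.h>

  #define NT 4
  #define M  8

  static void axpy(long n, double alpha, const double *x, double *y)
  {
    for (long i = 0; i < n; i++) y[i] += alpha * x[i];
  }

  int main(void)
  {
    double part[NT][M] = {{0}}, y[M] = {0}, alpha = 2.0;

    for (int t = 0; t < NT; t++)            /* stand-in for the worker output */
      for (long i = 0; i < M; i++) part[t][i] = t + 1;

    for (int t = 1; t < NT; t++)            /* fold slices 1..NT-1 into slice 0 */
      axpy(M, 1.0, part[t], part[0]);

    axpy(M, alpha, part[0], y);             /* y += alpha * (sum of partials)   */

    printf("y[0] = %g (expected %g)\n", y[0], alpha * (1 + 2 + 3 + 4));
    return 0;
  }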
diff --git a/driver/level2/syr2_k.c b/driver/level2/syr2_k.c
index bca8b3b..5bbd47b 100644
--- a/driver/level2/syr2_k.c
+++ b/driver/level2/syr2_k.c
@@ -40,7 +40,7 @@
#include <ctype.h>
#include "common.h"
-int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx,
+int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx,
FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){
BLASLONG i;
diff --git a/driver/level2/syr2_thread.c b/driver/level2/syr2_thread.c
index 130a62d..4c32944 100644
--- a/driver/level2/syr2_thread.c
+++ b/driver/level2/syr2_thread.c
@@ -112,7 +112,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#else
if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
y, 1, a, 1, NULL, 0);
@@ -125,7 +125,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
}
if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1],
alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
@@ -141,7 +141,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef HERREV
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
- alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1],
y, 1, a, 1, NULL, 0);
@@ -153,7 +153,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#endif
#else
#ifndef LOWER
- AXPYC_K(i + 1, 0, 0,
+ AXPYC_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
y, 1, a, 1, NULL, 0);
@@ -168,7 +168,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) {
#ifndef HERREV
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
@@ -180,7 +180,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#endif
#else
#ifndef LOWER
- AXPYC_K(i + 1, 0, 0,
+ AXPYC_K(i + 1, 0, 0,
alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
- alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
@@ -197,7 +197,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
a += lda * COMPSIZE;
}
-
+
return 0;
}
@@ -224,7 +224,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -232,16 +232,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
-
+
args.a = (void *)x;
args.b = (void *)y;
args.c = (void *)a;
-
+
args.lda = incx;
args.ldb = incy;
args.ldc = lda;
@@ -253,16 +253,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
dnum = (double)m * (double)m / (double)nthreads;
num_cpu = 0;
-
+
#ifndef LOWER
range_m[MAX_CPU_NUMBER] = m;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -272,13 +272,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = syr_kernel;
queue[num_cpu].args = &args;
@@ -287,20 +287,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -310,13 +310,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = syr_kernel;
queue[num_cpu].args = &args;
@@ -325,21 +325,21 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
return 0;
}
diff --git a/driver/level2/syr_k.c b/driver/level2/syr_k.c
index a0d9a2f..4f18cc6 100644
--- a/driver/level2/syr_k.c
+++ b/driver/level2/syr_k.c
@@ -38,7 +38,7 @@
#include "common.h"
-int CNAME(BLASLONG m, FLOAT alpha_r,
+int CNAME(BLASLONG m, FLOAT alpha_r,
FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){
BLASLONG i;
diff --git a/driver/level2/syr_thread.c b/driver/level2/syr_thread.c
index 250e8c0..0eb5428 100644
--- a/driver/level2/syr_thread.c
+++ b/driver/level2/syr_thread.c
@@ -95,7 +95,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#else
if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
@@ -111,7 +111,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef HERREV
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
#else
@@ -121,7 +121,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
#endif
#else
#ifndef LOWER
- AXPYC_K(i + 1, 0, 0,
+ AXPYC_K(i + 1, 0, 0,
alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1],
x, 1, a, 1, NULL, 0);
#else
@@ -137,7 +137,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
a += lda * COMPSIZE;
}
-
+
return 0;
}
@@ -164,7 +164,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -172,15 +172,15 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
-
+
args.a = (void *)x;
args.b = (void *)a;
-
+
args.lda = incx;
args.ldb = lda;
#if !defined(COMPLEX) || defined(HER) || defined(HERREV)
@@ -191,16 +191,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG
dnum = (double)m * (double)m / (double)nthreads;
num_cpu = 0;
-
+
#ifndef LOWER
range_m[MAX_CPU_NUMBER] = m;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -210,13 +210,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = syr_kernel;
queue[num_cpu].args = &args;
@@ -225,20 +225,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -248,13 +248,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = syr_kernel;
queue[num_cpu].args = &args;
@@ -263,21 +263,21 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
return 0;
}
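
[Editor's note, not part of the patch: a minimal sketch of the triangular workload split that the syr_thread.c hunks above keep touching. Each worker is given a band of rows whose share of the m x m triangle is roughly m*m/nthreads, which yields the width formula width = di - sqrt(di*di - dnum) rounded to a mask boundary. The names split_triangle and MAX_THREADS are hypothetical.]

    #include <math.h>
    #include <stdio.h>

    #define MAX_THREADS 8

    /* Split rows 0..m-1 of a triangular matrix into bands of roughly equal
     * area; returns the number of bands actually used. */
    static int split_triangle(long m, int nthreads, long *start, long *width_out) {
      double dnum = (double)m * (double)m / (double)nthreads;
      long mask = 7;                 /* round widths up to a multiple of 8 */
      long i = 0;
      int num_cpu = 0;

      while (i < m && num_cpu < nthreads) {
        long width;
        if (nthreads - num_cpu > 1) {
          double di = (double)(m - i);
          /* pick width so this band covers ~dnum triangle elements */
          if (di * di - dnum > 0)
            width = ((long)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
          else
            width = m - i;
          if (width < 16) width = 16;
          if (width > m - i) width = m - i;
        } else {
          width = m - i;             /* last worker takes the remainder */
        }
        start[num_cpu] = i;
        width_out[num_cpu] = width;
        num_cpu++;
        i += width;
      }
      return num_cpu;
    }

    int main(void) {
      long start[MAX_THREADS], width[MAX_THREADS];
      int n = split_triangle(1000, 4, start, width);
      for (int t = 0; t < n; t++)
        printf("band %d: rows %ld..%ld\n", t, start[t], start[t] + width[t] - 1);
      return 0;
    }
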
diff --git a/driver/level2/tbmv_L.c b/driver/level2/tbmv_L.c
index 05e7cf8..b41b414 100644
--- a/driver/level2/tbmv_L.c
+++ b/driver/level2/tbmv_L.c
@@ -54,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}
-
+
a += (n - 1) * lda;
for (i = n - 1; i >= 0; i--) {
@@ -65,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
if (length > 0) {
AXPYU_K(length, 0, 0,
- B[i],
+ B[i],
a + 1, 1, B + i + 1, 1, NULL, 0);
}
#endif
@@ -77,7 +77,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
B[i] *= a[k];
#endif
#endif
-
+
#ifdef TRANSA
length = i;
if (length > k) length = k;
@@ -89,7 +89,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
a -= lda;
}
-
+
if (incb != 1) {
COPY_K(n, buffer, 1, b, incb);
}
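
[Editor's note, not part of the patch: a sketch of the scratch-buffer idiom seen in tbmv_L.c above, where a strided vector is first packed contiguously and the GEMV work space then starts at the next 4096-byte boundary via ((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095. The helper names align_after and pack_and_align are hypothetical.]

    #include <stdint.h>
    #include <stddef.h>

    /* Round (buffer + used_bytes) up to the next 4096-byte boundary. */
    static void *align_after(void *buffer, size_t used_bytes) {
      uintptr_t p = (uintptr_t)buffer + used_bytes;
      return (void *)((p + 4095) & ~(uintptr_t)4095);
    }

    /* Pack x (stride incx) into the head of the scratch area, as COPY_K does,
     * and return an aligned pointer for the blocked kernels to use. */
    static void pack_and_align(const double *x, long n, long incx,
                               double *buffer, const double **packed,
                               double **work) {
      if (incx != 1) {
        for (long i = 0; i < n; i++)
          buffer[i] = x[i * incx];
        *packed = buffer;
      } else {
        *packed = x;
      }
      *work = (double *)align_after(buffer, (size_t)n * sizeof(double));
    }
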
diff --git a/driver/level2/tbmv_U.c b/driver/level2/tbmv_U.c
index 49d28dc..50c1032 100644
--- a/driver/level2/tbmv_U.c
+++ b/driver/level2/tbmv_U.c
@@ -56,14 +56,14 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
}
for (i = 0; i < n; i++) {
-
+
#ifndef TRANSA
length = i;
if (length > k) length = k;
if (length > 0) {
AXPYU_K(length, 0, 0,
- B[i],
+ B[i],
a + k - length, 1, B + i - length, 1, NULL, 0);
}
#endif
@@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
B[i] *= a[0];
#endif
#endif
-
+
#ifdef TRANSA
length = n - i - 1;
if (length > k) length = k;
diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c
index e3d0588..3c12494 100644
--- a/driver/level2/tbmv_thread.c
+++ b/driver/level2/tbmv_thread.c
@@ -105,18 +105,18 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
if (incx != 1) {
COPY_K(args -> n, x, incx, buffer, 1);
-
+
x = buffer;
buffer += ((args -> n * COMPSIZE + 1023) & ~1023);
- }
+ }
if (range_n) y += *range_n * COMPSIZE;
- SCAL_K(args -> n, 0, 0, ZERO,
+ SCAL_K(args -> n, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y, 1, NULL, 0, NULL, 0);
+ y, 1, NULL, 0, NULL, 0);
for (i = n_from; i < n_to; i++) {
@@ -148,7 +148,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#endif
}
#endif
-
+
#ifndef COMPLEX
#ifdef UNIT
*(y + i * COMPSIZE) += *(x + i * COMPSIZE);
@@ -183,19 +183,19 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#endif
#endif
#endif
-
+
#ifdef LOWER
if (length > 0) {
#ifndef TRANS
MYAXPY(length, 0, 0,
- *(x + i * COMPSIZE + 0),
+ *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
*(x + i * COMPSIZE + 1),
#endif
a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else
result = MYDOT(length, a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);
-
+
#ifndef COMPLEX
*(y + i * COMPSIZE + 0) += result;
#else
@@ -205,10 +205,10 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#endif
}
#endif
-
+
a += lda * COMPSIZE;
}
-
+
return 0;
}
@@ -236,7 +236,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -244,51 +244,51 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.n = n;
args.k = k;
-
+
args.a = (void *)a;
args.b = (void *)x;
args.c = (void *)(buffer);
-
+
args.lda = lda;
args.ldb = incx;
-
+
dnum = (double)n * (double)n / (double)nthreads;
num_cpu = 0;
-
+
if (n < 2 * k) {
#ifndef LOWER
-
+
range_m[MAX_CPU_NUMBER] = n;
i = 0;
-
+
while (i < n){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(n - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
} else {
width = n - i;
}
-
+
if (width < 16) width = 16;
if (width > n - i) width = n - i;
-
+
} else {
width = n - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
queue[num_cpu].args = &args;
@@ -297,37 +297,37 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
-
+
range_m[0] = 0;
i = 0;
-
+
while (i < n){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(n - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
} else {
width = n - i;
}
-
+
if (width < 16) width = 16;
if (width > n - i) width = n - i;
-
+
} else {
width = n - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
queue[num_cpu].args = &args;
@@ -336,27 +336,27 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
} else {
-
+
range_m[0] = 0;
i = n;
-
+
while (i > 0){
-
+
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
-
+
if (width < 4) width = 4;
if (i < width) width = i;
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
queue[num_cpu].args = &args;
@@ -365,7 +365,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i -= width;
}
@@ -376,20 +376,20 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
for (i = 1; i < num_cpu; i ++) {
AXPYU_K(n, 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
}
-
+
COPY_K(n, buffer, 1, x, incx);
return 0;
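
[Editor's note, not part of the patch: a sketch of the reduction pattern used after exec_blas() in tbmv_thread.c above. Each worker accumulates its partial y into a private slice of one large scratch buffer (offset range_n[i]); the caller then folds slices 1..num_cpu-1 into slice 0 with an axpy and copies the result back to the user's strided vector. The name reduce_partials is hypothetical.]

    /* Fold per-thread partial results into slice 0, then write back to x. */
    static void reduce_partials(double *buffer, long n, long stride, int num_cpu,
                                double *x, long incx) {
      for (int t = 1; t < num_cpu; t++) {
        const double *part = buffer + (long)t * stride;
        for (long j = 0; j < n; j++)       /* AXPYU_K(n, 0, 0, ONE, part, 1, buffer, 1, ...) */
          buffer[j] += part[j];
      }
      for (long j = 0; j < n; j++)         /* COPY_K(n, buffer, 1, x, incx) */
        x[j * incx] = buffer[j];
    }
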
diff --git a/driver/level2/tbsv_L.c b/driver/level2/tbsv_L.c
index e9c9158..0d03644 100644
--- a/driver/level2/tbsv_L.c
+++ b/driver/level2/tbsv_L.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
}
for (i = 0; i < n; i++) {
-
+
#ifdef TRANSA
length = i;
if (length > k) length = k;
@@ -73,14 +73,14 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
B[i] /= a[0];
#endif
#endif
-
+
#ifndef TRANSA
length = n - i - 1;
if (length > k) length = k;
if (length > 0) {
AXPYU_K(length, 0, 0,
- -B[i],
+ -B[i],
a + 1, 1, B + i + 1, 1, NULL, 0);
}
#endif
diff --git a/driver/level2/tbsv_U.c b/driver/level2/tbsv_U.c
index 0b1fca8..1dc1a99 100644
--- a/driver/level2/tbsv_U.c
+++ b/driver/level2/tbsv_U.c
@@ -54,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}
-
+
a += (n - 1) * lda;
for (i = n - 1; i >= 0; i--) {
@@ -75,21 +75,21 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
B[i] /= a[k];
#endif
#endif
-
+
#ifndef TRANSA
length = i;
if (length > k) length = k;
if (length > 0) {
AXPYU_K(length, 0, 0,
- - B[i],
+ - B[i],
a + k - length, 1, B + i - length, 1, NULL, 0);
}
#endif
a -= lda;
}
-
+
if (incb != 1) {
COPY_K(n, buffer, 1, b, incb);
}
diff --git a/driver/level2/tpmv_L.c b/driver/level2/tpmv_L.c
index c139eb7..d01478c 100644
--- a/driver/level2/tpmv_L.c
+++ b/driver/level2/tpmv_L.c
@@ -51,14 +51,14 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
B = buffer;
COPY_K(m, b, incb, buffer, 1);
}
-
+
a += (m + 1) * m / 2 - 1;
for (i = 0; i < m; i++) {
#ifndef TRANSA
if (i > 0) AXPYU_K(i, 0, 0, B[m - i - 1], a + 1, 1, B + m - i, 1, NULL, 0);
#endif
-
+
#ifndef UNIT
B[m - i - 1] *= a[0];
#endif
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
a -= (m - i);
#endif
}
-
+
if (incb != 1) {
COPY_K(m, buffer, 1, b, incb);
}
diff --git a/driver/level2/tpmv_U.c b/driver/level2/tpmv_U.c
index 6d69df6..5d311f8 100644
--- a/driver/level2/tpmv_U.c
+++ b/driver/level2/tpmv_U.c
@@ -53,11 +53,11 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
}
for (i = 0; i < m; i++) {
-
+
#ifndef TRANSA
if (i > 0) AXPYU_K(i, 0, 0, B[i], a, 1, B, 1, NULL, 0);
#endif
-
+
#ifndef UNIT
#ifndef TRANSA
B[i] *= a[i];
@@ -65,7 +65,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
B[i] *= a[0];
#endif
#endif
-
+
#ifdef TRANSA
if (i < m - 1) B[i] += DOTU_K(m - i - 1, a + 1, 1, B + i + 1, 1);
#endif
diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c
index 64b725f..3b91cee 100644
--- a/driver/level2/tpmv_thread.c
+++ b/driver/level2/tpmv_thread.c
@@ -110,35 +110,35 @@ static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#else
COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif
-
+
x = buffer;
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
- }
+ }
#ifndef TRANS
if (range_n) y += *range_n * COMPSIZE;
#ifndef LOWER
- SCAL_K(m_to, 0, 0, ZERO,
+ SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y, 1, NULL, 0, NULL, 0);
+ y, 1, NULL, 0, NULL, 0);
#else
- SCAL_K(args -> m - m_from, 0, 0, ZERO,
+ SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
+ y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif
#else
- SCAL_K(m_to - m_from, 0, 0, ZERO,
+ SCAL_K(m_to - m_from, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
+ y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif
@@ -154,9 +154,9 @@ static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
if (i > 0) {
#ifndef TRANS
MYAXPY(i, 0, 0,
- *(x + i * COMPSIZE + 0),
+ *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
- *(x + i * COMPSIZE + 1),
+ *(x + i * COMPSIZE + 1),
#endif
a, 1, y, 1, NULL, 0);
#else
@@ -202,7 +202,7 @@ static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#ifdef LOWER
if (args -> m > i + 1) {
#ifndef TRANS
- MYAXPY(args -> m - i - 1, 0, 0,
+ MYAXPY(args -> m - i - 1, 0, 0,
*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
*(x + i * COMPSIZE + 1),
@@ -258,7 +258,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -266,31 +266,31 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
-
+
args.a = (void *)a;
args.b = (void *)x;
args.c = (void *)(buffer);
-
+
args.ldb = incx;
args.ldc = incx;
-
+
dnum = (double)m * (double)m / (double)nthreads;
num_cpu = 0;
-
+
#ifndef LOWER
range_m[MAX_CPU_NUMBER] = m;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -300,14 +300,14 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = tpmv_kernel;
queue[num_cpu].args = &args;
@@ -316,20 +316,20 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -339,14 +339,14 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = tpmv_kernel;
queue[num_cpu].args = &args;
@@ -355,46 +355,46 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
#ifndef TRANS
for (i = 1; i < num_cpu; i ++) {
-
+
#ifndef LOWER
-
+
AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
-
+
#else
-
+
AXPYU_K(m - range_m[i], 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0);
-
+
#endif
}
#endif
-
+
COPY_K(m, buffer, 1, x, incx);
return 0;
diff --git a/driver/level2/tpsv_L.c b/driver/level2/tpsv_L.c
index 9f76181..3fafa90 100644
--- a/driver/level2/tpsv_L.c
+++ b/driver/level2/tpsv_L.c
@@ -41,7 +41,7 @@
#include "common.h"
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
-
+
BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifdef TRANSA
if (i > 0) B[i] -= DOTU_K(i, a, 1, B, 1);
#endif
-
+
#ifndef UNIT
#ifndef TRANSA
B[i] /= a[0];
@@ -64,7 +64,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
B[i] /= a[i];
#endif
#endif
-
+
#ifndef TRANSA
if (i < m - 1) {
AXPYU_K(m - i - 1 , 0, 0, - B[i],
@@ -78,7 +78,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
a += (i + 1);
#endif
}
-
+
if (incb != 1) {
COPY_K(m, buffer, 1, b, incb);
}
diff --git a/driver/level2/tpsv_U.c b/driver/level2/tpsv_U.c
index 7a09580..fb5ef02 100644
--- a/driver/level2/tpsv_U.c
+++ b/driver/level2/tpsv_U.c
@@ -51,18 +51,18 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}
-
+
a += (m + 1) * m / 2 - 1;
for (i = 0; i < m; i++) {
#ifdef TRANSA
if (i > 0) B[m - i - 1] -= DOTU_K(i, a + 1, 1, B + m - i, 1);
#endif
-
+
#ifndef UNIT
B[m - i - 1] /= a[0];
#endif
-
+
#ifndef TRANSA
if (i < m - 1) AXPYU_K(m - i - 1, 0, 0, -B[m - i - 1], a - (m - i - 1), 1, B, 1, NULL, 0);
#endif
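
[Editor's note, not part of the patch: a sketch of the packed-triangular ("TP") addressing that the tpmv/tpsv loops above walk. An m x m upper triangle stored column by column occupies (m+1)*m/2 slots, so the pointer arithmetic a + (m + 1) * m / 2 - 1 lands on the last diagonal element A(m-1,m-1). The helper packed_upper_index is hypothetical.]

    #include <assert.h>

    /* Slot of element (i, j), i <= j, in column-major packed upper storage. */
    static long packed_upper_index(long i, long j) {
      return j * (j + 1) / 2 + i;
    }

    static void check_packed_layout(long m) {
      /* The last stored slot is the bottom-right diagonal entry. */
      assert(packed_upper_index(m - 1, m - 1) == (m + 1) * m / 2 - 1);
    }
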
diff --git a/driver/level2/trmv_L.c b/driver/level2/trmv_L.c
index e515ba6..0de48a6 100644
--- a/driver/level2/trmv_L.c
+++ b/driver/level2/trmv_L.c
@@ -53,14 +53,14 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}
-
+
for (is = m; is > 0; is -= DTB_ENTRIES){
min_i = MIN(is, DTB_ENTRIES);
-
+
#ifndef TRANSA
if (m - is > 0){
- GEMV_N(m - is, min_i, 0, dp1,
+ GEMV_N(m - is, min_i, 0, dp1,
a + is + (is - min_i) * lda, lda,
B + is - min_i, 1,
B + is, 1, gemvbuffer);
@@ -83,10 +83,10 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (i < min_i - 1) BB[0] += DOTU_K(min_i - i - 1, AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1);
#endif
}
-
+
#ifdef TRANSA
if (is - min_i > 0){
- GEMV_T(is - min_i, min_i, 0, dp1,
+ GEMV_T(is - min_i, min_i, 0, dp1,
a + (is - min_i) * lda, lda,
B, 1,
B + is - min_i, 1, gemvbuffer);
diff --git a/driver/level2/trmv_U.c b/driver/level2/trmv_U.c
index 3c36f77..a0aa7ef 100644
--- a/driver/level2/trmv_U.c
+++ b/driver/level2/trmv_U.c
@@ -55,12 +55,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
}
for (is = 0; is < m; is += DTB_ENTRIES){
-
+
min_i = MIN(m - is, DTB_ENTRIES);
#ifndef TRANSA
if (is > 0){
- GEMV_N(is, min_i, 0, dp1,
+ GEMV_N(is, min_i, 0, dp1,
a + is * lda, lda,
B + is, 1,
B, 1, gemvbuffer);
@@ -70,7 +70,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
for (i = 0; i < min_i; i++) {
FLOAT *AA = a + is + (i + is) * lda;
FLOAT *BB = B + is;
-
+
#ifndef TRANSA
if (i > 0) AXPYU_K(i, 0, 0, BB[i], AA, 1, BB, 1, NULL, 0);
#endif
@@ -86,7 +86,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
#ifdef TRANSA
if (m - is > min_i){
- GEMV_T(m - is - min_i, min_i, 0, dp1,
+ GEMV_T(m - is - min_i, min_i, 0, dp1,
a + is + min_i + is * lda, lda,
B + is + min_i, 1,
B + is, 1, gemvbuffer);
diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
index 4f5b27c..29e9799 100644
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@@ -117,40 +117,40 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#else
COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif
-
+
x = buffer;
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
- }
+ }
#ifndef TRANS
if (range_n) y += *range_n * COMPSIZE;
#ifndef LOWER
- SCAL_K(m_to, 0, 0, ZERO,
+ SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y, 1, NULL, 0, NULL, 0);
+ y, 1, NULL, 0, NULL, 0);
#else
- SCAL_K(args -> m - m_from, 0, 0, ZERO,
+ SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
+ y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif
#else
- SCAL_K(m_to - m_from, 0, 0, ZERO,
+ SCAL_K(m_to - m_from, 0, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
- y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
+ y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif
for (is = m_from; is < m_to; is += DTB_ENTRIES){
-
+
min_i = MIN(m_to - is, DTB_ENTRIES);
#ifndef LOWER
@@ -178,13 +178,13 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
if (i - is > 0) {
#ifndef TRANS
MYAXPY(i - is, 0, 0,
- *(x + i * COMPSIZE + 0),
+ *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
- *(x + i * COMPSIZE + 1),
+ *(x + i * COMPSIZE + 1),
#endif
a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0);
#else
-
+
result = MYDOT(i - is, a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1);
#ifndef COMPLEX
@@ -227,7 +227,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#ifdef LOWER
if (is + min_i > i + 1) {
#ifndef TRANS
- MYAXPY(is + min_i - i - 1, 0, 0,
+ MYAXPY(is + min_i - i - 1, 0, 0,
*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
*(x + i * COMPSIZE + 1),
@@ -248,7 +248,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
}
#endif
}
-
+
#ifdef LOWER
if (args -> m > is + min_i){
MYGEMV(args -> m - is - min_i, min_i, 0,
@@ -259,9 +259,9 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a + (is + min_i + is * lda) * COMPSIZE, lda,
#ifndef TRANS
x + is * COMPSIZE, 1,
- y + (is + min_i) * COMPSIZE, 1,
+ y + (is + min_i) * COMPSIZE, 1,
#else
- x + (is + min_i) * COMPSIZE, 1,
+ x + (is + min_i) * COMPSIZE, 1,
y + is * COMPSIZE, 1,
#endif
buffer);
@@ -296,7 +296,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -304,32 +304,32 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
args.m = m;
-
+
args.a = (void *)a;
args.b = (void *)x;
args.c = (void *)(buffer);
-
+
args.lda = lda;
args.ldb = incx;
args.ldc = incx;
-
+
dnum = (double)m * (double)m / (double)nthreads;
num_cpu = 0;
-
+
#ifndef LOWER
range_m[MAX_CPU_NUMBER] = m;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -339,14 +339,14 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
queue[num_cpu].args = &args;
@@ -355,20 +355,20 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#else
range_m[0] = 0;
i = 0;
-
+
while (i < m){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)(m - i);
if (di * di - dnum > 0) {
width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
@@ -378,14 +378,14 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
if (width < 16) width = 16;
if (width > m - i) width = m - i;
-
+
} else {
width = m - i;
}
-
+
range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
queue[num_cpu].args = &args;
@@ -394,46 +394,46 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
#endif
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
-
+
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
#ifndef TRANS
for (i = 1; i < num_cpu; i ++) {
-
+
#ifndef LOWER
-
+
AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
-
+
#else
-
+
AXPYU_K(m - range_m[i], 0, 0, ONE,
#ifdef COMPLEX
- ZERO,
+ ZERO,
#endif
buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0);
-
+
#endif
}
#endif
-
+
COPY_K(m, buffer, 1, x, incx);
return 0;
diff --git a/driver/level2/trsv_L.c b/driver/level2/trsv_L.c
index 44bcfe3..95ec572 100644
--- a/driver/level2/trsv_L.c
+++ b/driver/level2/trsv_L.c
@@ -46,7 +46,7 @@ const static FLOAT dm1 = -1.;
#define GEMV_UNROLL DTB_ENTRIES
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
-
+
BLASLONG i, is, min_i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b;
@@ -58,14 +58,14 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf
}
for (is = 0; is < m; is += GEMV_UNROLL){
-
+
min_i = MIN(m - is, GEMV_UNROLL);
#ifdef TRANSA
if (is > 0){
- GEMV_T(is, min_i, 0, dm1,
+ GEMV_T(is, min_i, 0, dm1,
a + is * lda , lda,
- B, 1,
+ B, 1,
B + is, 1, gemvbuffer);
}
#endif
@@ -89,12 +89,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf
}
#endif
}
-
+
#ifndef TRANSA
if (m - is > min_i){
- GEMV_N(m - is - min_i, min_i, 0, dm1,
+ GEMV_N(m - is - min_i, min_i, 0, dm1,
a + is + min_i + is * lda, lda,
- B + is, 1,
+ B + is, 1,
B + (is + min_i), 1, gemvbuffer);
}
#endif
diff --git a/driver/level2/trsv_U.c b/driver/level2/trsv_U.c
index f02512b..823ca2e 100644
--- a/driver/level2/trsv_U.c
+++ b/driver/level2/trsv_U.c
@@ -53,20 +53,20 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}
-
+
for (is = m; is > 0; is -= DTB_ENTRIES){
min_i = MIN(is, DTB_ENTRIES);
#ifdef TRANSA
if (m - is > 0){
- GEMV_T(m - is, min_i, 0, dm1,
+ GEMV_T(m - is, min_i, 0, dm1,
a + is + (is - min_i) * lda, lda,
B + is, 1,
B + is - min_i, 1, gemvbuffer);
}
#endif
-
+
for (i = 0; i < min_i; i++) {
FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda;
FLOAT *BB = B + (is - i - 1);
@@ -86,13 +86,13 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf
#ifndef TRANSA
if (is - min_i > 0){
- GEMV_N(is - min_i, min_i, 0, dm1,
+ GEMV_N(is - min_i, min_i, 0, dm1,
a + (is - min_i) * lda, lda,
B + is - min_i, 1,
B, 1, gemvbuffer);
}
#endif
-
+
}
if (incb != 1) {
diff --git a/driver/level2/zgbmv_k.c b/driver/level2/zgbmv_k.c
index 7832a7e..68d6045 100644
--- a/driver/level2/zgbmv_k.c
+++ b/driver/level2/zgbmv_k.c
@@ -129,7 +129,7 @@ void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOA
Y[i * 2 + 1] += alpha_i * CREAL(temp) - alpha_r * CIMAG(temp);
#endif
#endif
-
+
offset_u --;
offset_l --;
diff --git a/driver/level2/zhbmv_k.c b/driver/level2/zhbmv_k.c
index 8771942..70e92e0 100644
--- a/driver/level2/zhbmv_k.c
+++ b/driver/level2/zhbmv_k.c
@@ -81,8 +81,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
if (length > 0) {
AXPYU_K(length, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0);
}
@@ -106,8 +106,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
if (length > 0) {
AXPYU_K(length, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0);
}
@@ -131,8 +131,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
if (length > 0) {
AXPYC_K(length, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0);
}
@@ -156,8 +156,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
if (length > 0) {
AXPYC_K(length, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0);
}
@@ -176,7 +176,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
#endif
#endif
-
+
a += lda * 2;
}
diff --git a/driver/level2/zher2_k.c b/driver/level2/zher2_k.c
index 3e92458..94a8b7c 100644
--- a/driver/level2/zher2_k.c
+++ b/driver/level2/zher2_k.c
@@ -41,7 +41,7 @@
#include "common.h"
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
- FLOAT *x, BLASLONG incx,
+ FLOAT *x, BLASLONG incx,
FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){
BLASLONG i;
@@ -65,7 +65,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
for (i = 0; i < m; i++){
#ifndef HEMVREV
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1],
Y, 1, a, 1, NULL, 0);
@@ -89,7 +89,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
#endif
#else
#ifndef LOWER
- AXPYC_K(i + 1, 0, 0,
+ AXPYC_K(i + 1, 0, 0,
alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
Y, 1, a, 1, NULL, 0);
diff --git a/driver/level2/zhpmv_k.c b/driver/level2/zhpmv_k.c
index 5f95ce7..96bceaa 100644
--- a/driver/level2/zhpmv_k.c
+++ b/driver/level2/zhpmv_k.c
@@ -40,7 +40,7 @@
#include <ctype.h>
#include "common.h"
-int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
+int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){
BLASLONG i;
@@ -70,7 +70,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
#ifndef LOWER
if (i > 0) {
FLOAT _Complex result = DOTC_K(i, a, 1, X, 1);
-
+
Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
}
@@ -83,18 +83,18 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
if (i > 0) {
AXPYU_K(i, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a, 1, Y, 1, NULL, 0);
}
a += (i + 1) * 2;
-
+
#else
if (m - i > 1) {
FLOAT _Complex result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1);
-
+
Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
}
@@ -107,8 +107,8 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
if (m - i > 1) {
AXPYU_K(m - i - 1, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
}
@@ -119,7 +119,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
#ifndef LOWER
if (i > 0) {
FLOAT _Complex result = DOTU_K(i, a, 1, X, 1);
-
+
Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
}
@@ -132,18 +132,18 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
if (i > 0) {
AXPYC_K(i, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a, 1, Y, 1, NULL, 0);
}
a += (i + 1) * 2;
-
+
#else
if (m - i > 1) {
FLOAT _Complex result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1);
-
+
Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
}
@@ -156,8 +156,8 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
if (m - i > 1) {
AXPYC_K(m - i - 1, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
}
@@ -167,7 +167,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
#endif
}
-
+
if (incy != 1) {
COPY_K(m, Y, 1, y, incy);
}
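
[Editor's note, not part of the patch: the repeated scalar expressions in the zhpmv/zsbmv/zspmv hunks above, alpha_r * X[i*2+0] - alpha_i * X[i*2+1] and alpha_r * X[i*2+1] + alpha_i * X[i*2+0], are simply the real and imaginary parts of alpha * x_i for interleaved complex storage, passed as the axpy coefficient. A small sketch, with the hypothetical name complex_scale:]

    /* out = alpha * x, with alpha = ar + i*ai and x stored as {re, im}. */
    static void complex_scale(double ar, double ai,
                              const double x[2], double out[2]) {
      out[0] = ar * x[0] - ai * x[1];   /* real part of alpha * x */
      out[1] = ar * x[1] + ai * x[0];   /* imaginary part of alpha * x */
    }
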
diff --git a/driver/level2/zhpr2_k.c b/driver/level2/zhpr2_k.c
index f4608ff..cb7113f 100644
--- a/driver/level2/zhpr2_k.c
+++ b/driver/level2/zhpr2_k.c
@@ -41,7 +41,7 @@
#include "common.h"
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
- FLOAT *x, BLASLONG incx,
+ FLOAT *x, BLASLONG incx,
FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){
BLASLONG i;
@@ -63,7 +63,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
for (i = 0; i < m; i++){
#ifndef HEMVREV
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1],
Y, 1, a, 1, NULL, 0);
@@ -87,7 +87,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
#endif
#else
#ifndef LOWER
- AXPYC_K(i + 1, 0, 0,
+ AXPYC_K(i + 1, 0, 0,
alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
Y, 1, a, 1, NULL, 0);
diff --git a/driver/level2/zsbmv_k.c b/driver/level2/zsbmv_k.c
index de5dfdd..30e2f91 100644
--- a/driver/level2/zsbmv_k.c
+++ b/driver/level2/zsbmv_k.c
@@ -78,8 +78,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
length = k - offset;
AXPYU_K(length + 1, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0);
if (length > 0) {
@@ -95,18 +95,18 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
if (n - i - 1 < k) length = n - i - 1;
AXPYU_K(length + 1, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a, 1, Y + i * COMPSIZE, 1, NULL, 0);
if (length > 0) {
FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1);
-
+
Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
}
#endif
-
+
a += lda * 2;
}
diff --git a/driver/level2/zspmv_k.c b/driver/level2/zspmv_k.c
index c93b1e1..76657ea 100644
--- a/driver/level2/zspmv_k.c
+++ b/driver/level2/zspmv_k.c
@@ -69,29 +69,29 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
if (i > 0) {
result = DOTU_K(i, a, 1, X, 1);
-
+
Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
}
AXPYU_K(i + 1, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a, 1, Y, 1, NULL, 0);
a += (i + 1) * 2;
-
+
#else
result = DOTU_K(m - i, a + i * 2, 1, X + i * 2, 1);
-
+
Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
-
+
if (m - i > 1)
AXPYU_K(m - i - 1, 0, 0,
- alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
- alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
+ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
+ alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
a += (m - i - 1) * 2;
diff --git a/driver/level2/zspr2_k.c b/driver/level2/zspr2_k.c
index 48c81a3..e41a8de 100644
--- a/driver/level2/zspr2_k.c
+++ b/driver/level2/zspr2_k.c
@@ -40,7 +40,7 @@
#include <ctype.h>
#include "common.h"
-int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx,
+int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx,
FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){
BLASLONG i;
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx,
for (i = 0; i < m; i++){
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
Y, 1, a, 1, NULL, 0);
diff --git a/driver/level2/zspr_k.c b/driver/level2/zspr_k.c
index a187bdb..d888a81 100644
--- a/driver/level2/zspr_k.c
+++ b/driver/level2/zspr_k.c
@@ -54,7 +54,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
for (i = 0; i < m; i++){
#ifndef LOWER
if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) {
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
X, 1, a, 1, NULL, 0);
diff --git a/driver/level2/zsyr2_k.c b/driver/level2/zsyr2_k.c
index f7bbbb2..03daf92 100644
--- a/driver/level2/zsyr2_k.c
+++ b/driver/level2/zsyr2_k.c
@@ -40,7 +40,7 @@
#include <ctype.h>
#include "common.h"
-int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx,
+int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx,
FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){
BLASLONG i;
@@ -63,7 +63,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx,
for (i = 0; i < m; i++){
#ifndef LOWER
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
Y, 1, a, 1, NULL, 0);
diff --git a/driver/level2/zsyr_k.c b/driver/level2/zsyr_k.c
index 9d800d3..57d1769 100644
--- a/driver/level2/zsyr_k.c
+++ b/driver/level2/zsyr_k.c
@@ -55,7 +55,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
for (i = 0; i < m; i++){
#ifndef LOWER
if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) {
- AXPYU_K(i + 1, 0, 0,
+ AXPYU_K(i + 1, 0, 0,
alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
X, 1, a, 1, NULL, 0);
diff --git a/driver/level2/ztbmv_L.c b/driver/level2/ztbmv_L.c
index 9b604c0..74ff0bc 100644
--- a/driver/level2/ztbmv_L.c
+++ b/driver/level2/ztbmv_L.c
@@ -60,7 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}
-
+
a += (n - 1) * lda * COMPSIZE;
for (i = n - 1; i >= 0; i--) {
@@ -102,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif
-
+
#if (TRANSA == 2) || (TRANSA == 4)
length = i;
if (length > k) length = k;
@@ -121,7 +121,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
a -= lda * COMPSIZE;
}
-
+
if (incb != 1) {
COPY_K(n, buffer, 1, b, incb);
}
diff --git a/driver/level2/ztbmv_U.c b/driver/level2/ztbmv_U.c
index 4e86f4f..933275d 100644
--- a/driver/level2/ztbmv_U.c
+++ b/driver/level2/ztbmv_U.c
@@ -62,7 +62,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
}
for (i = 0; i < n; i++) {
-
+
#if (TRANSA == 1) || (TRANSA == 3)
length = i;
if (length > k) length = k;
diff --git a/driver/level2/ztbsv_L.c b/driver/level2/ztbsv_L.c
index f32ddff..0726bbd 100644
--- a/driver/level2/ztbsv_L.c
+++ b/driver/level2/ztbsv_L.c
@@ -62,7 +62,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
}
for (i = 0; i < n; i++) {
-
+
#if (TRANSA == 2) || (TRANSA == 4)
length = i;
if (length > k) length = k;
@@ -87,11 +87,11 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
ar = a[k * 2 + 0];
ai = a[k * 2 + 1];
#endif
-
+
if (fabs(ar) >= fabs(ai)){
ratio = ai / ar;
den = 1./(ar * ( 1 + ratio * ratio));
-
+
ar = den;
#if TRANSA < 3
ai = -ratio * den;
@@ -108,10 +108,10 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
ai = den;
#endif
}
-
+
br = B[i * 2 + 0];
bi = B[i * 2 + 1];
-
+
B[i * 2 + 0] = ar*br - ai*bi;
B[i * 2 + 1] = ar*bi + ai*br;
#endif
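
[Editor's note, not part of the patch: a sketch of the scaled complex reciprocal used in the ztbsv/ztpsv/ztrsv solve loops above. Dividing by a = ar + i*ai through the larger-magnitude component (the ratio/den pair) avoids overflow in ar*ar + ai*ai; the subsequent update B = (ar*br - ai*bi, ar*bi + ai*br) is then an ordinary complex multiply by that reciprocal. The name complex_reciprocal is hypothetical.]

    #include <math.h>

    /* Compute 1 / (ar + i*ai) as rr + i*ri without forming ar*ar + ai*ai. */
    static void complex_reciprocal(double ar, double ai, double *rr, double *ri) {
      double ratio, den;
      if (fabs(ar) >= fabs(ai)) {
        ratio = ai / ar;
        den = 1.0 / (ar * (1.0 + ratio * ratio));
        *rr = den;
        *ri = -ratio * den;
      } else {
        ratio = ar / ai;
        den = 1.0 / (ai * (1.0 + ratio * ratio));
        *rr = ratio * den;
        *ri = -den;
      }
    }
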
diff --git a/driver/level2/ztbsv_U.c b/driver/level2/ztbsv_U.c
index 252f3ba..d022650 100644
--- a/driver/level2/ztbsv_U.c
+++ b/driver/level2/ztbsv_U.c
@@ -60,7 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1);
}
-
+
a += (n - 1) * lda * COMPSIZE;
for (i = n - 1; i >= 0; i--) {
@@ -89,11 +89,11 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
ar = a[0];
ai = a[1];
#endif
-
+
if (fabs(ar) >= fabs(ai)){
ratio = ai / ar;
den = 1./(ar * ( 1 + ratio * ratio));
-
+
ar = den;
#if TRANSA < 3
ai = -ratio * den;
@@ -110,10 +110,10 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
ai = den;
#endif
}
-
+
br = B[i * 2 + 0];
bi = B[i * 2 + 1];
-
+
B[i * 2 + 0] = ar*br - ai*bi;
B[i * 2 + 1] = ar*bi + ai*br;
#endif
@@ -138,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
a -= lda * COMPSIZE;
}
-
+
if (incb != 1) {
COPY_K(n, buffer, 1, b, incb);
}
diff --git a/driver/level2/ztpmv_L.c b/driver/level2/ztpmv_L.c
index 62b9dc6..12c254c 100644
--- a/driver/level2/ztpmv_L.c
+++ b/driver/level2/ztpmv_L.c
@@ -61,14 +61,14 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
a += (m + 1) * m - 2;
for (i = 0; i < m; i++) {
-
+
#if (TRANSA == 1) || (TRANSA == 3)
#if TRANSA == 1
- if (i > 0) AXPYU_K (i, 0, 0,
+ if (i > 0) AXPYU_K (i, 0, 0,
B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1],
a + 2, 1, B + (m - i) * 2, 1, NULL, 0);
#else
- if (i > 0) AXPYC_K(i, 0, 0,
+ if (i > 0) AXPYC_K(i, 0, 0,
B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1],
a + 2, 1, B + (m - i) * 2, 1, NULL, 0);
#endif
@@ -110,7 +110,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#endif
}
-
+
if (incb != 1) {
COPY_K(m, buffer, 1, b, incb);
diff --git a/driver/level2/ztpmv_U.c b/driver/level2/ztpmv_U.c
index 2ff3bfb..59708b8 100644
--- a/driver/level2/ztpmv_U.c
+++ b/driver/level2/ztpmv_U.c
@@ -41,7 +41,7 @@
#include "common.h"
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
-
+
BLASLONG i;
#if (TRANSA == 2) || (TRANSA == 4)
FLOAT _Complex temp;
@@ -114,7 +114,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
a += (m - i) * 2;
#endif
}
-
+
if (incb != 1) {
COPY_K(m, buffer, 1, b, incb);
}
diff --git a/driver/level2/ztpsv_L.c b/driver/level2/ztpsv_L.c
index e9317fb..3b8e562 100644
--- a/driver/level2/ztpsv_L.c
+++ b/driver/level2/ztpsv_L.c
@@ -43,7 +43,7 @@
const static FLOAT dm1 = -1.;
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
-
+
BLASLONG i;
#if (TRANSA == 2) || (TRANSA == 4)
FLOAT _Complex result;
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
}
for (i = 0; i < m; i++) {
-
+
#if (TRANSA == 2) || (TRANSA == 4)
if (i > 0) {
#if TRANSA == 2
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#else
result = DOTC_K(i, a, 1, B, 1);
#endif
-
+
B[i * COMPSIZE + 0] -= CREAL(result);
B[i * COMPSIZE + 1] -= CIMAG(result);
}
@@ -83,11 +83,11 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
ar = a[i * COMPSIZE + 0];
ai = a[i * COMPSIZE + 1];
#endif
-
+
if (fabs(ar) >= fabs(ai)){
ratio = ai / ar;
den = 1./(ar * ( 1 + ratio * ratio));
-
+
ar = den;
#if TRANSA < 3
ai = -ratio * den;
@@ -107,7 +107,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
br = B[i * COMPSIZE + 0];
bi = B[i * COMPSIZE + 1];
-
+
B[i * COMPSIZE + 0] = ar*br - ai*bi;
B[i * COMPSIZE + 1] = ar*bi + ai*br;
#endif
diff --git a/driver/level2/ztpsv_U.c b/driver/level2/ztpsv_U.c
index 54903dc..601ac2f 100644
--- a/driver/level2/ztpsv_U.c
+++ b/driver/level2/ztpsv_U.c
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
a += (m + 1) * m - 2;
for (i = 0; i < m; i++) {
-
+
#if (TRANSA == 2) || (TRANSA == 4)
if (i > 0) {
#if TRANSA == 2
@@ -69,20 +69,20 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#else
result = DOTC_K(i, a + 2, 1, B + (m - i) * 2, 1);
#endif
-
+
B[(m - i - 1) * 2 + 0] -= CREAL(result);
B[(m - i - 1) * 2 + 1] -= CIMAG(result);
}
#endif
-
+
#ifndef UNIT
ar = a[0];
ai = a[1];
-
+
if (fabs(ar) >= fabs(ai)){
ratio = ai / ar;
den = 1./(ar * ( 1 + ratio * ratio));
-
+
ar = den;
#if (TRANSA == 1) || (TRANSA == 2)
ai = -ratio * den;
@@ -99,10 +99,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
ai = den;
#endif
}
-
+
br = B[(m - i - 1) * 2 + 0];
bi = B[(m - i - 1) * 2 + 1];
-
+
B[(m - i - 1) * 2 + 0] = ar*br - ai*bi;
B[(m - i - 1) * 2 + 1] = ar*bi + ai*br;
#endif
diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c
index 3688f58..63522cf 100644
--- a/driver/level2/ztrmv_L.c
+++ b/driver/level2/ztrmv_L.c
@@ -122,7 +122,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
#endif
}
-
+
#if (TRANSA == 2) || (TRANSA == 4)
if (is - min_i > 0){
#if TRANSA == 2
diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c
index a9fb6d1..8a4494f 100644
--- a/driver/level2/ztrmv_U.c
+++ b/driver/level2/ztrmv_U.c
@@ -43,7 +43,7 @@
static FLOAT dp1 = 1.;
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){
-
+
BLASLONG i, is, min_i;
#if (TRANSA == 2) || (TRANSA == 4)
FLOAT _Complex temp;
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
}
for (is =0; is < m; is += DTB_ENTRIES){
-
+
min_i = MIN(m - is, DTB_ENTRIES);
#if (TRANSA) == 1 || (TRANSA == 3)
@@ -128,7 +128,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
#endif
}
-
+
#if (TRANSA) == 2 || (TRANSA == 4)
if (m - is > min_i){
#if TRANSA == 2
diff --git a/driver/level2/ztrsv_L.c b/driver/level2/ztrsv_L.c
index f825c61..90f1c2c 100644
--- a/driver/level2/ztrsv_L.c
+++ b/driver/level2/ztrsv_L.c
@@ -43,7 +43,7 @@
const static FLOAT dm1 = -1.;
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
-
+
BLASLONG i, is, min_i;
#if (TRANSA == 2) || (TRANSA == 4)
FLOAT _Complex result;
@@ -100,11 +100,11 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf
#ifndef UNIT
ar = AA[i * COMPSIZE + 0];
ai = AA[i * COMPSIZE + 1];
-
+
if (fabs(ar) >= fabs(ai)){
ratio = ai / ar;
den = 1./(ar * ( 1 + ratio * ratio));
-
+
ar = den;
#if TRANSA < 3
ai = -ratio * den;
@@ -124,7 +124,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf
br = BB[i * COMPSIZE + 0];
bi = BB[i * COMPSIZE + 1];
-
+
BB[i * COMPSIZE + 0] = ar*br - ai*bi;
BB[i * COMPSIZE + 1] = ar*bi + ai*br;
#endif
diff --git a/driver/level2/ztrsv_U.c b/driver/level2/ztrsv_U.c
index 3b750a2..bec8114 100644
--- a/driver/level2/ztrsv_U.c
+++ b/driver/level2/ztrsv_U.c
@@ -100,11 +100,11 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf
#ifndef UNIT
ar = AA[0];
ai = AA[1];
-
+
if (fabs(ar) >= fabs(ai)){
ratio = ai / ar;
den = 1./(ar * ( 1 + ratio * ratio));
-
+
ar = den;
#if TRANSA < 3
ai = -ratio * den;
@@ -124,7 +124,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf
br = BB[0];
bi = BB[1];
-
+
BB[0] = ar*br - ai*bi;
BB[1] = ar*bi + ai*br;
#endif
diff --git a/driver/level3/Makefile b/driver/level3/Makefile
index 7d7d723..d62921e 100644
--- a/driver/level3/Makefile
+++ b/driver/level3/Makefile
@@ -1,12 +1,14 @@
TOPDIR = ../..
include ../../Makefile.system
+USE_GEMM3M = 0
+
ifeq ($(ARCH), x86)
-USE_GEMM3M = 1
+USE_GEMM3M = 0
endif
ifeq ($(ARCH), x86_64)
-USE_GEMM3M = 1
+USE_GEMM3M = 0
endif
ifeq ($(ARCH), ia64)
@@ -168,7 +170,7 @@ XBLASOBJS += \
xher2k_kernel_UN.$(SUFFIX) xher2k_kernel_UC.$(SUFFIX) \
xher2k_kernel_LN.$(SUFFIX) xher2k_kernel_LC.$(SUFFIX)
-ifdef USE_GEMM3M
+ifeq ($(USE_GEMM3M), 1)
CBLASOBJS += \
cgemm3m_nn.$(SUFFIX) cgemm3m_cn.$(SUFFIX) cgemm3m_tn.$(SUFFIX) cgemm3m_nc.$(SUFFIX) \
@@ -239,7 +241,7 @@ CBLASOBJS += cherk_thread_UN.$(SUFFIX) cherk_thread_UC.$(SUFFIX) cherk_thread
ZBLASOBJS += zherk_thread_UN.$(SUFFIX) zherk_thread_UC.$(SUFFIX) zherk_thread_LN.$(SUFFIX) zherk_thread_LC.$(SUFFIX)
XBLASOBJS += xherk_thread_UN.$(SUFFIX) xherk_thread_UC.$(SUFFIX) xherk_thread_LN.$(SUFFIX) xherk_thread_LC.$(SUFFIX)
-ifdef USE_GEMM3M
+ifeq ($(USE_GEMM3M), 1)
CBLASOBJS += cgemm3m_thread_nn.$(SUFFIX) cgemm3m_thread_nt.$(SUFFIX) cgemm3m_thread_nr.$(SUFFIX) cgemm3m_thread_nc.$(SUFFIX)
CBLASOBJS += cgemm3m_thread_tn.$(SUFFIX) cgemm3m_thread_tt.$(SUFFIX) cgemm3m_thread_tr.$(SUFFIX) cgemm3m_thread_tc.$(SUFFIX)
@@ -1094,7 +1096,7 @@ ssymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h
ssymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
ssymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1106,7 +1108,7 @@ dsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h
dsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
dsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1118,7 +1120,7 @@ qsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h
qsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
qsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1130,7 +1132,7 @@ csymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h
csymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
csymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1142,7 +1144,7 @@ zsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h
zsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1154,7 +1156,7 @@ xsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h
xsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1166,7 +1168,7 @@ ssymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
ssymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
ssymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1178,7 +1180,7 @@ dsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
dsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
dsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1190,7 +1192,7 @@ qsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
qsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
qsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1202,7 +1204,7 @@ csymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
csymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
csymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1214,7 +1216,7 @@ zsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
zsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1226,7 +1228,7 @@ xsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
xsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -1529,7 +1531,7 @@ chemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h
chemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
chemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -1541,7 +1543,7 @@ zhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h
zhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -1553,7 +1555,7 @@ xhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h
xhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -1565,7 +1567,7 @@ chemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
chemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
chemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -1577,7 +1579,7 @@ zhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
zhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -1589,7 +1591,7 @@ xhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
xhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -1776,76 +1778,76 @@ xher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c
xher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F)
-cgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F)
-cgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F)
-cgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F)
-cgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F)
-cgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F)
-cgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F)
-cgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F)
-cgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F)
-cgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F)
-cgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F)
-cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
-cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
-cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
-cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
-cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
-cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
-zgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F)
-zgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F)
-zgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F)
-zgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F)
-zgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F)
-zgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F)
-zgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F)
-zgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F)
zgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
@@ -2078,7 +2080,7 @@ csymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h
csymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
csymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2090,7 +2092,7 @@ zsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h
zsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2102,7 +2104,7 @@ xsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h
xsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2114,7 +2116,7 @@ csymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
csymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
csymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2126,7 +2128,7 @@ zsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
zsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2138,7 +2140,7 @@ xsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
xsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2150,7 +2152,7 @@ chemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
chemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
chemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2162,7 +2164,7 @@ zhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
zhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2174,7 +2176,7 @@ xhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
xhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2186,7 +2188,7 @@ chemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
chemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
chemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2198,7 +2200,7 @@ zhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
zhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -2210,7 +2212,7 @@ xhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
xhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3463,7 +3465,7 @@ ssymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
ssymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
ssymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3475,7 +3477,7 @@ dsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
dsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
dsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3487,7 +3489,7 @@ qsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
qsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
qsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3499,7 +3501,7 @@ csymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
csymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
csymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3511,7 +3513,7 @@ zsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
zsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3523,7 +3525,7 @@ xsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
xsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3535,7 +3537,7 @@ ssymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
ssymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
ssymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3547,7 +3549,7 @@ dsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
dsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
dsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3559,7 +3561,7 @@ qsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
qsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
qsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3571,7 +3573,7 @@ csymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
csymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
csymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3583,7 +3585,7 @@ zsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
zsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3595,7 +3597,7 @@ xsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
xsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -3898,7 +3900,7 @@ chemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h
chemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
chemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -3910,7 +3912,7 @@ zhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h
zhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -3922,7 +3924,7 @@ xhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h
xhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -3934,7 +3936,7 @@ chemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
chemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
chemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -3946,7 +3948,7 @@ zhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
zhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -3958,7 +3960,7 @@ xhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
xhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F)
@@ -4145,76 +4147,76 @@ xher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c
xher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F)
-cgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F)
-cgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F)
-cgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F)
-cgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F)
-cgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F)
-cgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F)
-cgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F)
-cgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F)
-cgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F)
-cgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F)
-cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
-cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
-cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
-cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
-cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
-cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
-zgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F)
-zgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F)
-zgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F)
-zgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F)
-zgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F)
-zgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F)
-zgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F)
-zgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
+zgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F)
zgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
@@ -4447,7 +4449,7 @@ csymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h
csymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
csymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4459,7 +4461,7 @@ zsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h
zsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4471,7 +4473,7 @@ xsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h
xsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4483,7 +4485,7 @@ csymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
csymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
csymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4495,7 +4497,7 @@ zsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
zsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4507,7 +4509,7 @@ xsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
xsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4519,7 +4521,7 @@ chemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
chemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
chemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4531,7 +4533,7 @@ zhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
zhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4543,7 +4545,7 @@ xhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
xhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h
$(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4555,7 +4557,7 @@ chemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
chemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
chemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4567,7 +4569,7 @@ zhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
zhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
zhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
@@ -4579,7 +4581,7 @@ xhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
xhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F)
-
+
xhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h
$(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F)
diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c
index df4d723..0649682 100644
--- a/driver/level3/gemm3m_level3.c
+++ b/driver/level3/gemm3m_level3.c
@@ -306,10 +306,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(js = n_from; js < n_to; js += GEMM3M_R){
min_j = n_to - js;
if (min_j > GEMM3M_R) min_j = GEMM3M_R;
-
+
for(ls = 0; ls < k; ls += min_l){
min_l = k - ls;
-
+
if (min_l >= GEMM3M_Q * 2) {
min_l = GEMM3M_Q;
} else {
@@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
}
}
-
+
min_i = m_to - m_from;
if (min_i >= GEMM3M_P * 2) {
min_i = GEMM3M_P;
@@ -331,53 +331,53 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
START_RPCC();
-
+
ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
-
+
STOP_RPCC(innercost);
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
-
+
START_RPCC();
-
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
#else
OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#endif
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
-
+
STOP_RPCC(kernelcost);
-
- }
-
+
+ }
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM3M_P * 2) {
min_i = GEMM3M_P;
- } else
+ } else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
}
-
+
START_RPCC();
-
+
ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js);
-
+
STOP_RPCC(kernelcost);
}
@@ -389,19 +389,19 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
}
}
-
+
START_RPCC();
-
+
ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
-
+
STOP_RPCC(innercost);
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
-
+
START_RPCC();
-
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
@@ -413,37 +413,37 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
-
+
STOP_RPCC(kernelcost);
-
- }
-
+
+ }
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM3M_P * 2) {
min_i = GEMM3M_P;
- } else
+ } else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
}
-
+
START_RPCC();
-
+
ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js);
-
+
STOP_RPCC(kernelcost);
-
+
}
min_i = m_to - m_from;
@@ -454,20 +454,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
}
}
-
+
START_RPCC();
-
+
ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
-
+
STOP_RPCC(innercost);
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
-
+
START_RPCC();
-
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
@@ -478,42 +478,42 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
-
+
STOP_RPCC(kernelcost);
-
- }
-
+
+ }
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM3M_P * 2) {
min_i = GEMM3M_P;
- } else
+ } else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
}
-
+
START_RPCC();
-
+
ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js);
-
+
STOP_RPCC(kernelcost);
-
+
}
} /* end of js */
} /* end of ls */
-
+
#ifdef TIMING
total = (double)outercost + (double)innercost + (double)kernelcost;
@@ -526,6 +526,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100,
2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost);
#endif
-
+
return 0;
}
diff --git a/driver/level3/gemm_thread_m.c b/driver/level3/gemm_thread_m.c
index 52c9b2d..8813e55 100644
--- a/driver/level3/gemm_thread_m.c
+++ b/driver/level3/gemm_thread_m.c
@@ -58,7 +58,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
num_cpu = 0;
while (i > 0){
-
+
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
i -= width;
@@ -76,15 +76,15 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++;
}
-
+
if (num_cpu) {
queue[0].sa = sa;
queue[0].sb = sb;
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
return 0;
}
diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c
index b81c6fa..2966eac 100644
--- a/driver/level3/gemm_thread_mn.c
+++ b/driver/level3/gemm_thread_mn.c
@@ -40,7 +40,7 @@
#include <stdlib.h>
#include "common.h"
-static const int divide_rule[][2] =
+static const int divide_rule[][2] =
{{ 0, 0},
{ 1, 1}, { 1, 2}, { 1, 3}, { 2, 2},
{ 1, 5}, { 2, 3}, { 1, 7}, { 2, 4},
@@ -84,7 +84,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
num_cpu_m = 0;
while (i > 0){
-
+
width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m);
i -= width;
@@ -106,7 +106,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
num_cpu_n = 0;
while (i > 0){
-
+
width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n);
i -= width;
@@ -134,15 +134,15 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
procs ++;
}
}
-
+
if (procs) {
queue[0].sa = sa;
queue[0].sb = sb;
queue[procs - 1].next = NULL;
-
+
exec_blas(procs, queue);
}
-
+
return 0;
}
diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c
index 3e11f9a..9668841 100644
--- a/driver/level3/gemm_thread_n.c
+++ b/driver/level3/gemm_thread_n.c
@@ -54,11 +54,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
range[0] = range_n[0];
i = range_n[1] - range_n[0];
}
-
+
num_cpu = 0;
while (i > 0){
-
+
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
i -= width;
@@ -81,7 +81,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++;
}
-
+
if (num_cpu) {
#if 0 //defined(LOONGSON3A)
queue[0].sa = sa;
@@ -91,10 +91,10 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[0].sb = sb;
#endif
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu,
queue);
}
-
+
return 0;
}
diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c
index 9ffe170..162a75f 100644
--- a/driver/level3/gemm_thread_variable.c
+++ b/driver/level3/gemm_thread_variable.c
@@ -62,7 +62,7 @@ int CNAME(int mode,
num_cpu_m = 0;
while (i > 0){
-
+
width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m);
i -= width;
@@ -84,7 +84,7 @@ int CNAME(int mode,
num_cpu_n = 0;
while (i > 0){
-
+
width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n);
i -= width;
@@ -112,7 +112,7 @@ int CNAME(int mode,
procs ++;
}
}
-
+
if (procs) {
queue[0].sa = sa;
queue[0].sb = sb;
@@ -121,7 +121,7 @@ int CNAME(int mode,
exec_blas(procs, queue);
}
-
+
return 0;
}
diff --git a/driver/level3/level3.c b/driver/level3/level3.c
index 5f74664..2612040 100644
--- a/driver/level3/level3.c
+++ b/driver/level3/level3.c
@@ -241,7 +241,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
) {
#if defined(XDOUBLE) && defined(QUAD_PRECISION)
xidouble xbeta;
-
+
qtox(&xbeta, beta);
#endif
BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc);
@@ -287,7 +287,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(js = n_from; js < n_to; js += GEMM_R){
min_j = n_to - js;
if (min_j > GEMM_R) min_j = GEMM_R;
-
+
for(ls = 0; ls < k; ls += min_l){
min_l = k - ls;
@@ -302,11 +302,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1));
while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M;
}
-
+
/* First, we have to move data A to L2 cache */
min_i = m_to - m_from;
l1stride = 1;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
} else {
@@ -316,13 +316,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
l1stride = 0;
}
}
-
+
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
-
+
STOP_RPCC(innercost);
-
+
#if defined(FUSED_GEMM) && !defined(TIMING)
FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha,
@@ -344,16 +344,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
-
+
START_RPCC();
-
- OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
+
+ OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
sb + min_l * (jjs - js) * COMPSIZE * l1stride);
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
@@ -363,39 +363,39 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
STOP_RPCC(kernelcost);
- }
+ }
#endif
-
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
}
-
+
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
-
+
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
#else
KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js);
#endif
-
+
STOP_RPCC(kernelcost);
} /* end of is */
} /* end of js */
} /* end of ls */
-
+
#ifdef TIMING
total = (double)outercost + (double)innercost + (double)kernelcost;
diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c
index bcb0f9d..02bf57e 100644
--- a/driver/level3/level3_gemm3m_thread.c
+++ b/driver/level3/level3_gemm3m_thread.c
@@ -49,7 +49,7 @@
#endif
//The array of job_t may overflow the stack.
-//Instead, use malloc to alloc job_t.
+//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
@@ -362,12 +362,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
}
-
+
for(ls = 0; ls < k; ls += min_l){
min_l = k - ls;
if (min_l >= GEMM3M_Q * 2) {
@@ -379,7 +379,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
min_i = m_to - m_from;
-
+
if (min_i >= GEMM3M_P * 2) {
min_i = GEMM3M_P;
} else {
@@ -390,73 +390,73 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
-
+
ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
-
+
STOP_RPCC(copy_A);
-
+
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
-
+
STOP_RPCC(waiting1);
-
+
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
-
+
START_RPCC();
-
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#else
OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#endif
-
+
STOP_RPCC(copy_B);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
sa, buffer[bufferside] + min_l * (jjs - xxx),
c, ldc, m_from, jjs);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * min_jj * min_l;
#endif
}
-
+
for (i = 0; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
}
-
+
current = mypos;
-
+
do {
current ++;
if (current >= args -> nthreads) current = 0;
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
if (current != mypos) {
-
+
START_RPCC();
-
+
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
-
+
STOP_RPCC(waiting2);
-
+
START_RPCC();
@@ -469,42 +469,42 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
}
-
+
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
}
} while (current != mypos);
-
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM3M_P * 2) {
min_i = GEMM3M_P;
- } else
+ } else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
}
-
+
START_RPCC();
-
+
ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(copy_A);
-
+
current = mypos;
do {
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, xxx);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
@@ -514,38 +514,38 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
}
-
+
current ++;
if (current >= args -> nthreads) current = 0;
-
+
} while (current != mypos);
-
+
} /* end of is */
-
+
START_RPCC();
-
+
ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
-
+
STOP_RPCC(copy_A);
-
+
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
-
+
STOP_RPCC(waiting1);
-
+
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
-
+
START_RPCC();
-
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
@@ -557,43 +557,43 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
STOP_RPCC(copy_B);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
sa, buffer[bufferside] + min_l * (jjs - xxx),
c, ldc, m_from, jjs);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * min_jj * min_l;
#endif
}
-
+
for (i = 0; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
}
-
+
current = mypos;
-
+
do {
current ++;
if (current >= args -> nthreads) current = 0;
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
if (current != mypos) {
-
+
START_RPCC();
-
+
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
-
+
STOP_RPCC(waiting2);
-
+
START_RPCC();
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
@@ -605,41 +605,41 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
}
-
+
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
}
} while (current != mypos);
-
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM3M_P * 2) {
min_i = GEMM3M_P;
- } else
+ } else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
}
-
+
START_RPCC();
-
+
ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(copy_A);
-
+
current = mypos;
do {
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, xxx);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
@@ -649,40 +649,40 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
}
-
+
current ++;
if (current >= args -> nthreads) current = 0;
-
+
} while (current != mypos);
-
+
} /* end of is */
-
+
START_RPCC();
-
+
ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
-
+
STOP_RPCC(copy_A);
-
+
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
-
+
STOP_RPCC(waiting1);
-
+
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
-
+
START_RPCC();
-
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
@@ -693,43 +693,43 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
STOP_RPCC(copy_B);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
sa, buffer[bufferside] + min_l * (jjs - xxx),
c, ldc, m_from, jjs);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * min_jj * min_l;
#endif
}
-
+
for (i = 0; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
}
-
+
current = mypos;
-
+
do {
current ++;
if (current >= args -> nthreads) current = 0;
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
if (current != mypos) {
-
+
START_RPCC();
-
+
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
-
+
STOP_RPCC(waiting2);
-
+
START_RPCC();
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
@@ -741,41 +741,41 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
}
-
+
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
}
} while (current != mypos);
-
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM3M_P * 2) {
min_i = GEMM3M_P;
- } else
+ } else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
}
-
+
START_RPCC();
-
+
ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(copy_A);
-
+
current = mypos;
do {
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, xxx);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l;
@@ -785,16 +785,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
}
-
+
current ++;
if (current >= args -> nthreads) current = 0;
-
+
} while (current != mypos);
-
+
} /* end of is */
}
-
+
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
@@ -862,7 +862,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
#else
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
-#endif
+#endif
newarg.m = args -> m;
newarg.n = args -> n;
@@ -886,7 +886,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
#endif
newarg.common = (void *)job;
-
+
if (!range_m) {
range_M[0] = 0;
m = args -> m;
@@ -898,7 +898,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_cpu_m = 0;
while (m > 0){
-
+
width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);
m -= width;
@@ -919,10 +919,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
queue[i].sb = NULL;
queue[i].next = &queue[i + 1];
}
-
+
queue[0].sa = sa;
queue[0].sb = sb;
-
+
if (!range_n) {
n_from = 0;
n_to = args -> n;
@@ -934,23 +934,23 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
for(js = n_from; js < n_to; js += GEMM_R * nthreads){
n = n_to - js;
if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
-
+
range_N[0] = js;
num_cpu_n = 0;
while (n > 0){
-
+
width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
-
+
n -= width;
if (n < 0) width = width + n;
-
+
range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
-
+
num_cpu_n ++;
}
-
+
for (j = 0; j < num_cpu_m; j++) {
for (i = 0; i < num_cpu_m; i++) {
for (k = 0; k < DIVIDE_RATE; k++) {
@@ -958,9 +958,9 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
}
}
}
-
+
queue[num_cpu_m - 1].next = NULL;
-
+
exec_blas(num_cpu_m, queue);
}
@@ -978,7 +978,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
BLASLONG nthreads = args -> nthreads;
BLASLONG divN, divT;
int mode;
-
+
if (range_m) {
BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
@@ -1020,8 +1020,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
-
+#endif
+
#if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
defined(CN) || defined(CT) || defined(CR) || defined(CC)
mode |= (BLAS_TRANSA_T);
@@ -1030,8 +1030,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
defined(NC) || defined(TC) || defined(RC) || defined(CC)
mode |= (BLAS_TRANSB_T);
#endif
-
- gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
+
+ gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
}
return 0;
diff --git a/driver/level3/level3_syr2k.c b/driver/level3/level3_syr2k.c
index 2db1857..a75d379 100644
--- a/driver/level3/level3_syr2k.c
+++ b/driver/level3/level3_syr2k.c
@@ -178,16 +178,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_l = k - ls;
if (min_l >= GEMM_Q * 2) {
min_l = GEMM_Q;
- } else
+ } else
if (min_l > GEMM_Q) {
min_l = (min_l + 1) / 2;
}
min_i = m_end - m_start;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
@@ -195,44 +195,44 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#ifndef LOWER
if (m_start >= js) {
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
aa = sb + min_l * (m_start - js) * COMPSIZE;
-
+
OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
-
+
KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1);
-
+
jjs = m_start + min_i;
} else {
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
jjs = js;
}
-
+
for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
-
+
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
sa, sb + min_l * (jjs - js) * COMPSIZE,
c, ldc, m_start, jjs, 1);
}
-
+
for(is = m_start + min_i; is < m_end; is += min_i){
min_i = m_end - is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1);
@@ -243,50 +243,50 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
if (m_start >= js) {
-
+
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
aa = sb + min_l * (m_start - js) * COMPSIZE;
-
+
OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
-
+
KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0);
-
+
jjs = m_start + min_i;
} else {
-
+
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
jjs = js;
}
-
+
for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
-
+
OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
-
+
KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
sa, sb + min_l * (jjs - js) * COMPSIZE,
c, ldc, m_start, jjs, 0);
}
-
+
for(is = m_start + min_i; is < m_end; is += min_i){
min_i = m_end - is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0);
@@ -300,49 +300,49 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa);
-
+
KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
sa, aa, c, ldc, m_start, m_start, 1);
for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
min_jj = m_start - jjs;
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
-
+
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1);
}
for(is = m_start + min_i; is < m_end; is += min_i){
-
+
min_i = m_end - is;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
aa = sb + min_l * (is - js) * COMPSIZE;
if (is < js + min_j) {
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa);
-
+
KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 1);
-
+
KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 1);
-
+
} else {
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1);
-
+
}
}
@@ -351,7 +351,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
@@ -361,49 +361,49 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa);
OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa);
-
+
KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha,
sa, aa, c, ldc, m_start, m_start, 0);
for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){
min_jj = m_start - jjs;
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
-
+
OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
-
+
KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha,
sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0);
}
for(is = m_start + min_i; is < m_end; is += min_i){
-
+
min_i = m_end - is;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
aa = sb + min_l * (is - js) * COMPSIZE;
if (is < js + min_j) {
-
+
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
-
+
OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa);
-
+
KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 0);
-
+
KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 0);
-
+
} else {
-
+
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
-
+
KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0);
-
+
}
}
diff --git a/driver/level3/level3_syrk.c b/driver/level3/level3_syrk.c
index 249c140..ba544a0 100644
--- a/driver/level3/level3_syrk.c
+++ b/driver/level3/level3_syrk.c
@@ -187,16 +187,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_l = k - ls;
if (min_l >= GEMM_Q * 2) {
min_l = GEMM_Q;
- } else
+ } else
if (min_l > GEMM_Q) {
min_l = (min_l + 1) / 2;
}
min_i = m_end - m_start;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
@@ -207,29 +207,29 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
aa = sb + min_l * MAX(m_start - js, 0) * COMPSIZE;
if (!shared) aa = sa;
-
+
for(jjs = MAX(m_start, js); jjs < js + min_j; jjs += min_jj){
min_jj = js + min_j - jjs;
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
-
+
if (!shared && (jjs - MAX(m_start, js) < min_i)) {
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sa + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(innercost);
}
-
+
START_RPCC();
-
+
OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, alpha, aa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, MAX(m_start, js), jjs);
-
+
STOP_RPCC(kernelcost);
}
@@ -237,30 +237,30 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = m_end - is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
aa = sb + min_l * (is - js) * COMPSIZE;
-
+
if (!shared) {
-
+
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
aa = sa;
}
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_j, min_l, alpha, aa, sb, c, ldc, is, js);
-
+
STOP_RPCC(kernelcost);
-
+
}
}
@@ -268,27 +268,27 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (m_start < js) {
if (m_end < js) {
-
+
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
-
+
STOP_RPCC(innercost);
-
+
for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_MN){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
-
+
START_RPCC();
-
+
OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs);
-
+
STOP_RPCC(kernelcost);
}
@@ -301,180 +301,180 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = MIN(m_end, js)- is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
-
+
STOP_RPCC(kernelcost);
-
+
}
}
#else
if (m_start < js + min_j) {
-
+
aa = sb + min_l * (m_start - js) * COMPSIZE;
-
+
if (!shared) {
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
-
+
STOP_RPCC(innercost);
-
+
}
START_RPCC();
-
+
OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j + js - m_start)), a, lda, ls, m_start, aa);
-
+
STOP_RPCC(outercost);
START_RPCC();
-
+
KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, m_start, m_start);
-
+
STOP_RPCC(kernelcost);
for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_N){
min_jj = m_start - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
START_RPCC();
-
+
OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
KERNEL_OPERATION(min_i, min_jj, min_l, alpha, (shared? (aa) : (sa)), sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs);
-
+
STOP_RPCC(kernelcost);
-
+
}
for(is = m_start + min_i; is < m_end; is += min_i){
-
+
min_i = m_end - is;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
if (is < js + min_j) {
-
+
if (!shared) {
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
}
aa = sb + min_l * (is - js) * COMPSIZE;
-
+
START_RPCC();
-
+
OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j - is + js)), a, lda, ls, is, aa);
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, is, is);
-
+
STOP_RPCC(kernelcost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, is - js, min_l, alpha, (shared? (aa) : (sa)), sb, c, ldc, is, js);
-
+
STOP_RPCC(kernelcost);
-
+
} else {
-
+
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
-
+
STOP_RPCC(kernelcost);
-
+
}
-
+
}
} else {
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa);
-
+
STOP_RPCC(innercost);
-
+
for(jjs = js; jjs < min_j; jjs += GEMM_UNROLL_N){
min_jj = min_j - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
START_RPCC();
-
+
OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs);
-
+
STOP_RPCC(kernelcost);
-
+
}
-
+
for(is = m_start + min_i; is < m_end; is += min_i){
-
+
min_i = m_end - is;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
-
+
STOP_RPCC(kernelcost);
-
+
}
}
#endif
diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c
index 4a3f7a8..01c7b23 100644
--- a/driver/level3/level3_syrk_threaded.c
+++ b/driver/level3/level3_syrk_threaded.c
@@ -49,7 +49,7 @@
#endif
//The array of job_t may overflow the stack.
-//Instead, use malloc to alloc job_t.
+//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
@@ -217,7 +217,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
}
-
+
for(ls = 0; ls < k; ls += min_l){
min_l = k - ls;
@@ -228,7 +228,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
min_i = m_to - m_from;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
} else {
@@ -244,22 +244,22 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
START_RPCC();
-
+
#ifndef LOWER
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
#else
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa);
#endif
-
+
STOP_RPCC(copy_A);
-
+
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
-
+
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
/* Make sure if no one is using buffer */
#ifndef LOWER
for (i = 0; i < mypos; i++)
@@ -267,9 +267,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = mypos + 1; i < args -> nthreads; i++)
#endif
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
-
+
STOP_RPCC(waiting1);
-
+
#ifndef LOWER
for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){
@@ -281,16 +281,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
} else {
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
}
-
+
START_RPCC();
-
- OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs,
+
+ OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs,
buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);
-
+
STOP_RPCC(copy_B);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
c, ldc, m_from, jjs);
@@ -310,20 +310,20 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_jj = MIN(m_to, xxx + div_n) - jjs;
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
-
+
START_RPCC();
-
- OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs,
+
+ OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs,
buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);
-
+
STOP_RPCC(copy_B);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
c, ldc, m_to - min_i, jjs);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
@@ -333,7 +333,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#endif
-
+
#ifndef LOWER
for (i = 0; i <= mypos; i++)
#else
@@ -344,7 +344,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
WMB;
}
-
+
#ifndef LOWER
current = mypos + 1;
while (current < args -> nthreads) {
@@ -355,42 +355,42 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
-
+
STOP_RPCC(waiting2);
-
+
START_RPCC();
-
+
#ifndef LOWER
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
- c, ldc,
+ c, ldc,
m_from,
xxx);
#else
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
- c, ldc,
+ c, ldc,
m_to - min_i,
xxx);
#endif
-
+
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
-
+
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
}
}
-
+
#ifndef LOWER
current ++;
#else
@@ -410,38 +410,38 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(copy_A);
-
+
current = mypos;
do {
-
+
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, xxx);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
-
+
#ifndef LOWER
if (is + min_i >= m_to) {
#else
@@ -452,7 +452,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
WMB;
}
}
-
+
#ifndef LOWER
current ++;
} while (current != args -> nthreads);
@@ -460,11 +460,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
current --;
} while (current >= 0);
#endif
-
-
+
+
}
}
-
+
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
@@ -528,7 +528,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
double dnum;
if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
- SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
+ SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}
@@ -542,7 +542,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
mode = BLAS_SINGLE | BLAS_REAL;
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -553,7 +553,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
-#endif
+#endif
#endif
newarg.m = args -> m;
@@ -577,7 +577,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#endif
newarg.common = (void *)job;
-
+
if (!range_n) {
n_from = 0;
n_to = args -> n;
@@ -597,17 +597,17 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
dnum = (double)n * (double)n /(double)nthreads;
while (i < n){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)i;
-
+
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
-
+
if (num_cpu == 0) width = n - ((n - width) & ~mask);
-
+
if ((width > n - i) || (width < mask)) width = n - i;
-
+
} else {
width = n - i;
}
@@ -622,7 +622,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
@@ -639,21 +639,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
dnum = (double)n * (double)n /(double)nthreads;
while (i < n){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)i;
-
+
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
-
+
if ((width > n - i) || (width < mask)) width = n - i;
-
+
} else {
width = n - i;
}
range[num_cpu + 1] = range[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = inner_thread;
queue[num_cpu].args = &newarg;
@@ -662,7 +662,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
@@ -680,14 +680,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
}
}
}
-
+
queue[0].sa = sa;
queue[0].sb = sb;
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
#ifdef USE_ALLOC_HEAP
free(job);
#endif
diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index ee1a8db..95860d0 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -49,7 +49,7 @@
#endif
//The array of job_t may overflow the stack.
-//Instead, use malloc to alloc job_t.
+//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
@@ -309,12 +309,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
}
-
+
for(ls = 0; ls < k; ls += min_l){
@@ -328,7 +328,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
l1stride = 1;
min_i = m_to - m_from;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
} else {
@@ -340,23 +340,23 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
-
+
STOP_RPCC(copy_A);
-
+
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
/* Make sure if no one is using buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
-
+
STOP_RPCC(waiting1);
-
+
#if defined(FUSED_GEMM) && !defined(TIMING)
FUSED_KERNEL_OPERATION(min_i, MIN(n_to, xxx + div_n) - xxx, min_l, alpha,
@@ -376,21 +376,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-#endif
+#endif
START_RPCC();
-
- OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
+
+ OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride);
-
+
STOP_RPCC(copy_B);
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride,
c, ldc, m_from, jjs);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
@@ -399,30 +399,30 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#endif
-
+
for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB;
}
current = mypos;
-
+
do {
current ++;
if (current >= args -> nthreads) current = 0;
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
if (current != mypos) {
-
+
START_RPCC();
-
+
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
-
+
STOP_RPCC(waiting2);
-
+
START_RPCC();
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
@@ -434,43 +434,43 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
}
-
+
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
}
}
} while (current != mypos);
-
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
}
-
+
START_RPCC();
-
+
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
-
+
STOP_RPCC(copy_A);
-
+
current = mypos;
do {
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
START_RPCC();
-
+
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, xxx);
-
+
STOP_RPCC(kernel);
#ifdef TIMING
@@ -483,16 +483,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
WMB;
}
}
-
+
current ++;
if (current >= args -> nthreads) current = 0;
-
+
} while (current != mypos);
-
+
}
-
+
}
-
+
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
@@ -561,7 +561,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE;
#else
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE;
@@ -569,7 +569,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
mode = BLAS_DOUBLE | BLAS_COMPLEX | BLAS_NODE;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX | BLAS_NODE;
-#endif
+#endif
#endif
newarg.m = args -> m;
@@ -594,7 +594,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
#endif
newarg.common = (void *)job;
-
+
#ifdef PARAMTEST
newarg.gemm_p = args -> gemm_p;
newarg.gemm_q = args -> gemm_q;
@@ -612,7 +612,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_cpu_m = 0;
while (m > 0){
-
+
width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);
m -= width;
@@ -633,10 +633,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
queue[i].sb = NULL;
queue[i].next = &queue[i + 1];
}
-
+
queue[0].sa = sa;
queue[0].sb = sb;
-
+
if (!range_n) {
n_from = 0;
n_to = args -> n;
@@ -648,23 +648,23 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
for(js = n_from; js < n_to; js += GEMM_R * nthreads){
n = n_to - js;
if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
-
+
range_N[0] = js;
num_cpu_n = 0;
while (n > 0){
-
+
width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
-
+
n -= width;
if (n < 0) width = width + n;
-
+
range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
-
+
num_cpu_n ++;
}
-
+
for (j = 0; j < num_cpu_m; j++) {
for (i = 0; i < num_cpu_m; i++) {
for (k = 0; k < DIVIDE_RATE; k++) {
@@ -672,7 +672,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
}
}
}
-
+
queue[num_cpu_m - 1].next = NULL;
exec_blas(num_cpu_m, queue);
@@ -692,9 +692,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
BLASLONG nthreads = args -> nthreads;
BLASLONG divN, divT;
int mode;
-
+
if (nthreads == 1) {
- GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
+ GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}
@@ -745,7 +745,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -753,9 +753,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
#endif
-
+#endif
+
#if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
defined(CN) || defined(CT) || defined(CR) || defined(CC)
mode |= (BLAS_TRANSA_T);
@@ -764,11 +764,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
defined(NC) || defined(TC) || defined(RC) || defined(CC)
mode |= (BLAS_TRANSB_T);
#endif
-
+
#ifdef OS_WINDOWS
- gemm_thread_n(mode, args, range_m, range_n, GEMM_LOCAL, sa, sb, divN);
+ gemm_thread_n(mode, args, range_m, range_n, GEMM_LOCAL, sa, sb, divN);
#else
- gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
+ gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
#endif
}
diff --git a/driver/level3/syr2k_k.c b/driver/level3/syr2k_k.c
index 01251d4..8df0f12 100644
--- a/driver/level3/syr2k_k.c
+++ b/driver/level3/syr2k_k.c
@@ -78,7 +78,7 @@ static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLA
#else
- SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0],
+ SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0],
#ifdef COMPLEX
alpha[1],
#endif
diff --git a/driver/level3/syr2k_kernel.c b/driver/level3/syr2k_kernel.c
index 8c476f5..f9e4a4c 100644
--- a/driver/level3/syr2k_kernel.c
+++ b/driver/level3/syr2k_kernel.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
return 0;
}
@@ -68,7 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
return 0;
}
@@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
b += offset * k * COMPSIZE;
c += offset * ldc * COMPSIZE;
@@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#endif
a,
b + (m + offset) * k * COMPSIZE,
- c + (m + offset) * ldc * COMPSIZE, ldc);
+ c + (m + offset) * ldc * COMPSIZE, ldc);
#endif
n = m + offset;
@@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
a -= offset * k * COMPSIZE;
c -= offset * COMPSIZE;
@@ -134,53 +134,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#endif
a + (n - offset) * k * COMPSIZE,
b,
- c + (n - offset) * COMPSIZE, ldc);
+ c + (n - offset) * COMPSIZE, ldc);
#endif
m = n + offset;
if (m <= 0) return 0;
}
for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) {
-
+
int mm, nn;
-
+
mm = (loop & ~(GEMM_UNROLL_MN - 1));
nn = MIN(GEMM_UNROLL_MN, n - loop);
-
+
#ifndef LOWER
GEMM_KERNEL_N(mm, nn, k,
alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
+ a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
#endif
-
+
if (flag) {
- GEMM_BETA(nn, nn, 0, ZERO,
+ GEMM_BETA(nn, nn, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
NULL, 0, NULL, 0, subbuffer, nn);
-
+
GEMM_KERNEL_N(nn, nn, k,
alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
+ a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
#ifndef LOWER
-
+
for (j = 0; j < nn; j ++) {
for (i = 0; i <= j; i ++) {
#ifndef COMPLEX
c[i + loop + (j + loop) * ldc] +=
subbuffer[i + j * nn] + subbuffer[j + i * nn];
#else
- c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
+ c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0];
- c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
+ c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
subbuffer[(i + j * nn) * 2 + 1] + subbuffer[(j + i * nn) * 2 + 1];
#endif
}
@@ -189,7 +189,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
for (j = 0; j < nn; j ++) {
for (i = j; i < nn; i ++) {
#ifndef COMPLEX
- c[i + loop + (j + loop) * ldc] +=
+ c[i + loop + (j + loop) * ldc] +=
subbuffer[i + j * nn] + subbuffer[j + i * nn];
#else
c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
@@ -201,15 +201,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
}
#endif
}
-
+
#ifdef LOWER
GEMM_KERNEL_N(m - mm - nn, nn, k,
alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
- c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
+ a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
+ c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
#endif
}
diff --git a/driver/level3/syrk_k.c b/driver/level3/syrk_k.c
index 9c9700e..08751dc 100644
--- a/driver/level3/syrk_k.c
+++ b/driver/level3/syrk_k.c
@@ -80,7 +80,7 @@ static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLA
#else
- SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0],
+ SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0],
#ifdef COMPLEX
alpha[1],
#endif
diff --git a/driver/level3/syrk_kernel.c b/driver/level3/syrk_kernel.c
index 65d108a..434d2f6 100644
--- a/driver/level3/syrk_kernel.c
+++ b/driver/level3/syrk_kernel.c
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
return 0;
}
@@ -83,7 +83,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
return 0;
}
@@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
b += offset * k * COMPSIZE;
c += offset * ldc * COMPSIZE;
@@ -114,7 +114,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#endif
a,
b + (m + offset) * k * COMPSIZE,
- c + (m + offset) * ldc * COMPSIZE, ldc);
+ c + (m + offset) * ldc * COMPSIZE, ldc);
#endif
n = m + offset;
@@ -128,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
a -= offset * k * COMPSIZE;
c -= offset * COMPSIZE;
@@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#endif
a + (n - offset) * k * COMPSIZE,
b,
- c + (n - offset) * COMPSIZE, ldc);
+ c + (n - offset) * COMPSIZE, ldc);
#endif
m = n + offset;
@@ -167,21 +167,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
+ a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
#endif
- GEMM_BETA(nn, nn, 0, ZERO,
+ GEMM_BETA(nn, nn, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
NULL, 0, NULL, 0, subbuffer, nn);
-
+
GEMM_KERNEL(nn, nn, k,
alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
+ a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
cc = c + (loop + loop * ldc) * COMPSIZE;
ss = subbuffer;
@@ -220,8 +220,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
- c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
+ a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
+ c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
#endif
}
diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c
index 837670b..0d9bdf2 100644
--- a/driver/level3/syrk_thread.c
+++ b/driver/level3/syrk_thread.c
@@ -52,7 +52,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
int num_cpu;
int mask = 0;
-
+
if (!(mode & BLAS_COMPLEX)) {
switch (mode & BLAS_PREC) {
@@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
#endif
}
}
-
+
n_from = 0;
n_to = arg -> n;
@@ -96,29 +96,29 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
nf = (double)(n_from);
nt = (double)(n_to);
-
+
dnum = (nt * nt - nf * nf) / (double)nthreads;
-
+
num_cpu = 0;
-
+
range[0] = n_from;
i = n_from;
-
+
while (i < n_to){
-
+
if (nthreads - num_cpu > 1) {
-
+
di = (double)i;
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
-
+
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
-
+
} else {
width = n_to - i;
}
-
+
range[num_cpu + 1] = range[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = function;
queue[num_cpu].args = arg;
@@ -127,38 +127,38 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
-
+
} else {
nf = (double)(arg -> n - n_from);
nt = (double)(arg -> n - n_to);
dnum = (nt * nt - nf * nf) / (double)nthreads;
-
+
num_cpu = 0;
-
+
range[0] = n_from;
i = n_from;
-
+
while (i < n_to){
-
+
if (nthreads - num_cpu > 1) {
-
+
di = (double)(arg -> n - i);
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
-
+
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
-
+
} else {
width = n_to - i;
}
-
+
range[num_cpu + 1] = range[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = function;
queue[num_cpu].args = arg;
@@ -167,7 +167,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
@@ -178,9 +178,9 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[0].sa = sa;
queue[0].sb = sb;
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
return 0;
}
diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c
index 9e46df0..c0a822b 100644
--- a/driver/level3/trmm_L.c
+++ b/driver/level3/trmm_L.c
@@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = min_l;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -130,7 +130,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
TRMM_ILNCOPY(min_l, min_i, a, lda, 0, 0, sa);
#endif
-
+
STOP_RPCC(innercost);
for(jjs = js; jjs < js + min_j; jjs += min_jj){
@@ -140,16 +140,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
-
+
TRMM_KERNEL_N(min_i, min_jj, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0);
+ sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0);
STOP_RPCC(trmmcost);
}
@@ -158,7 +158,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(is = min_i; is < min_l; is += GEMM_P){
min_i = min_l - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -166,16 +166,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
TRMM_ILNCOPY(min_l, min_i, a, lda, 0, is, sa);
#endif
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
-
+
TRMM_KERNEL_N(min_i, min_j, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is);
STOP_RPCC(trmmcost);
@@ -186,7 +186,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = ls;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -200,21 +200,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(gemmcost);
-
+
START_RPCC();
- GEMM_KERNEL(min_i, min_jj, min_l, dp1,
+ GEMM_KERNEL(min_i, min_jj, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb + min_l * (jjs - js) * COMPSIZE,
- b + (jjs * ldb) * COMPSIZE, ldb);
+ sa, sb + min_l * (jjs - js) * COMPSIZE,
+ b + (jjs * ldb) * COMPSIZE, ldb);
STOP_RPCC(gemmcost);
}
@@ -222,7 +222,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(is = min_i; is < ls; is += GEMM_P){
min_i = ls - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -235,19 +235,19 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
START_RPCC();
- GEMM_KERNEL(min_i, min_j, min_l, dp1,
+ GEMM_KERNEL(min_i, min_j, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
STOP_RPCC(gemmcost);
}
-
+
for(is = ls; is < ls + min_l; is += GEMM_P){
min_i = ls + min_l - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -255,7 +255,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
TRMM_ILNCOPY(min_l, min_i, a, lda, ls, is, sa);
#endif
-
+
STOP_RPCC(innercost);
START_RPCC();
@@ -264,7 +264,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls);
STOP_RPCC(trmmcost);
}
@@ -275,7 +275,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = min_l;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -283,20 +283,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa);
#endif
-
+
STOP_RPCC(innercost);
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
sb + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(outercost);
-
+
START_RPCC();
TRMM_KERNEL_T(min_i, min_jj, min_l, dp1,
@@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
ZERO,
#endif
sa, sb + min_l * (jjs - js) * COMPSIZE,
- b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0);
+ b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0);
STOP_RPCC(trmmcost);
}
@@ -312,7 +312,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(is = m - min_l + min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -320,16 +320,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, is, sa);
#endif
-
+
STOP_RPCC(innercost);
-
+
START_RPCC();
TRMM_KERNEL_T(min_i, min_j, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l);
STOP_RPCC(trmmcost);
}
@@ -339,7 +339,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = min_l;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -347,18 +347,18 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa);
#endif
-
+
STOP_RPCC(innercost);
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,
sb + min_l * (jjs - js) * COMPSIZE);
-
+
STOP_RPCC(outercost);
START_RPCC();
@@ -368,7 +368,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
ZERO,
#endif
sa, sb + min_l * (jjs - js) * COMPSIZE,
- b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0);
+ b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0);
STOP_RPCC(trmmcost);
}
@@ -376,7 +376,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(is = ls - min_l + min_i; is < ls; is += GEMM_P){
min_i = ls - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -384,7 +384,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, is, sa);
#endif
-
+
STOP_RPCC(innercost);
START_RPCC();
@@ -393,7 +393,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l);
STOP_RPCC(trmmcost);
}
@@ -402,7 +402,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(is = ls; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
START_RPCC();
#ifndef TRANSA
@@ -415,11 +415,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
START_RPCC();
- GEMM_KERNEL(min_i, min_j, min_l, dp1,
+ GEMM_KERNEL(min_i, min_j, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
STOP_RPCC(gemmcost);
}
diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c
index e46553c..6012386 100644
--- a/driver/level3/trmm_R.c
+++ b/driver/level3/trmm_R.c
@@ -114,9 +114,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = m;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);
-
+
for(jjs = 0; jjs < ls - js; jjs += min_jj){
min_jj = ls - js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@@ -126,54 +126,54 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
GEMM_OTCOPY(min_l, min_jj, a + ((js + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
#endif
-
- GEMM_KERNEL(min_i, min_jj, min_l, dp1,
+
+ GEMM_KERNEL(min_i, min_jj, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa, sb + min_l * jjs * COMPSIZE,
- b + ((js + jjs) * ldb) * COMPSIZE, ldb);
+ b + ((js + jjs) * ldb) * COMPSIZE, ldb);
}
for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
#ifndef TRANSA
TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
#else
TRMM_OUTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
#endif
-
+
TRMM_KERNEL_T(min_i, min_jj, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa,
sb + (ls - js + jjs) * min_l * COMPSIZE,
- b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs);
+ b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs);
}
for(is = min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);
-
- GEMM_KERNEL(min_i, ls - js, min_l, dp1,
+
+ GEMM_KERNEL(min_i, ls - js, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa, sb,
- b + (is + js * ldb) * COMPSIZE, ldb);
-
+ b + (is + js * ldb) * COMPSIZE, ldb);
+
TRMM_KERNEL_T(min_i, min_l, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa,
sb + (ls - js) * min_l * COMPSIZE,
- b + (is + ls * ldb) * COMPSIZE, ldb, 0);
+ b + (is + ls * ldb) * COMPSIZE, ldb, 0);
}
}
@@ -183,9 +183,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = m;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@@ -195,26 +195,26 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#endif
-
- GEMM_KERNEL(min_i, min_jj, min_l, dp1,
+
+ GEMM_KERNEL(min_i, min_jj, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb + min_l * (jjs - js) * COMPSIZE,
- b + (jjs * ldb) * COMPSIZE, ldb);
+ sa, sb + min_l * (jjs - js) * COMPSIZE,
+ b + (jjs * ldb) * COMPSIZE, ldb);
}
-
+
for(is = min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);
-
- GEMM_KERNEL(min_i, min_j, min_l, dp1,
+
+ GEMM_KERNEL(min_i, min_j, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
}
}
}
@@ -225,7 +225,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(js = n; js > 0; js -= GEMM_R){
min_j = js;
if (min_j > GEMM_R) min_j = GEMM_R;
-
+
start_ls = js - min_j;
while (start_ls + GEMM_Q < js) start_ls += GEMM_Q;
@@ -234,7 +234,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = m;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);
for(jjs = 0; jjs < min_l; jjs += min_jj){
@@ -246,20 +246,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#else
TRMM_OLTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
#endif
-
+
TRMM_KERNEL_N(min_i, min_jj, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa,
sb + min_l * jjs * COMPSIZE,
- b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs);
+ b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs);
}
-
+
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
min_jj = js - ls - min_l - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
#ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
sb + min_l * (min_l + jjs) * COMPSIZE);
@@ -267,20 +267,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
GEMM_OTCOPY(min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda,
sb + min_l * (min_l + jjs) * COMPSIZE);
#endif
-
- GEMM_KERNEL(min_i, min_jj, min_l, dp1,
+
+ GEMM_KERNEL(min_i, min_jj, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa,
sb + min_l * (min_l + jjs) * COMPSIZE,
- b + ((ls + min_l + jjs) * ldb) * COMPSIZE, ldb);
+ b + ((ls + min_l + jjs) * ldb) * COMPSIZE, ldb);
}
-
+
for(is = min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);
TRMM_KERNEL_N(min_i, min_l, min_l, dp1,
@@ -289,16 +289,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#endif
sa,
sb,
- b + (is + ls * ldb) * COMPSIZE, ldb, 0);
+ b + (is + ls * ldb) * COMPSIZE, ldb, 0);
if (js - ls - min_l > 0) {
- GEMM_KERNEL(min_i, js - ls - min_l, min_l, dp1,
+ GEMM_KERNEL(min_i, js - ls - min_l, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa,
sb + min_l * min_l * COMPSIZE,
- b + (is + (ls + min_l) * ldb) * COMPSIZE, ldb);
+ b + (is + (ls + min_l) * ldb) * COMPSIZE, ldb);
}
}
}
@@ -308,38 +308,38 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = m;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
#ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#else
GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#endif
-
- GEMM_KERNEL(min_i, min_jj, min_l, dp1,
+
+ GEMM_KERNEL(min_i, min_jj, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb + min_l * (jjs - js) * COMPSIZE,
- b + ((jjs - min_j) * ldb) * COMPSIZE, ldb);
+ sa, sb + min_l * (jjs - js) * COMPSIZE,
+ b + ((jjs - min_j) * ldb) * COMPSIZE, ldb);
}
-
+
for(is = min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);
-
- GEMM_KERNEL(min_i, min_j, min_l, dp1,
+
+ GEMM_KERNEL(min_i, min_j, min_l, dp1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
+ sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
}
}
}
diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c
index 2c3006f..fa3b0d5 100644
--- a/driver/level3/trsm_L.c
+++ b/driver/level3/trsm_L.c
@@ -112,20 +112,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(js = 0; js < n; js += GEMM_R){
min_j = n - js;
if (min_j > GEMM_R) min_j = GEMM_R;
-
+
#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))
for(ls = 0; ls < m; ls += GEMM_Q){
min_l = m - ls;
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = min_l;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
#ifndef TRANSA
TRSM_ILTCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa);
#else
TRSM_IUNCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa);
#endif
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@@ -136,43 +136,43 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#ifdef COMPLEX
ZERO,
#endif
- sa, sb + min_l * (jjs - js) * COMPSIZE,
+ sa, sb + min_l * (jjs - js) * COMPSIZE,
b + (ls + jjs * ldb) * COMPSIZE, ldb, 0);
}
for(is = ls + min_i; is < ls + min_l; is += GEMM_P){
min_i = ls + min_l - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
#ifndef TRANSA
TRSM_ILTCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, is - ls, sa);
#else
TRSM_IUNCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, is - ls, sa);
#endif
-
+
TRSM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls);
}
-
+
for(is = ls + min_l; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
#ifndef TRANSA
GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa);
#else
GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa);
#endif
-
- GEMM_KERNEL(min_i, min_j, min_l, dm1,
+
+ GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
}
}
#else
@@ -197,19 +197,19 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
-
+
TRSM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb + min_l * (jjs - js) * COMPSIZE,
+ sa, sb + min_l * (jjs - js) * COMPSIZE,
b + (start_is + jjs * ldb) * COMPSIZE, ldb, start_is - ls + min_l);
}
-
+
for(is = start_is - GEMM_P; is >= ls - min_l; is -= GEMM_P){
min_i = ls - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
#ifndef TRANSA
TRSM_IUTCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, is - (ls - min_l), sa);
#else
@@ -219,26 +219,26 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
#ifdef COMPLEX
ZERO,
#endif
- sa, sb,
+ sa, sb,
b + (is + js * ldb) * COMPSIZE, ldb, + is - (ls - min_l) );
}
-
+
for(is = 0; is < ls - min_l; is += GEMM_P){
min_i = ls - min_l - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
#ifndef TRANSA
GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa);
#else
GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa);
#endif
- GEMM_KERNEL(min_i, min_j, min_l, dm1,
+ GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
}
}
diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c
index 0964d78..b6ee956 100644
--- a/driver/level3/trsm_R.c
+++ b/driver/level3/trsm_R.c
@@ -112,15 +112,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(js = 0; js < n; js += GEMM_R){
min_j = n - js;
if (min_j > GEMM_R) min_j = GEMM_R;
-
+
for(ls = 0; ls < js; ls += GEMM_Q){
min_l = js - ls;
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = m;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@@ -131,25 +131,25 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#endif
- GEMM_KERNEL(min_i, min_jj, min_l, dm1,
+ GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa, sb + min_l * (jjs - js) * COMPSIZE,
- b + (jjs * ldb) * COMPSIZE, ldb);
+ b + (jjs * ldb) * COMPSIZE, ldb);
}
for(is = min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);
-
- GEMM_KERNEL(min_i, min_j, min_l, dm1,
+
+ GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
+ sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
}
}
@@ -160,25 +160,25 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_i > GEMM_P) min_i = GEMM_P;
GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);
-
+
#ifndef TRANSA
TRSM_OUNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb);
#else
TRSM_OLTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb);
#endif
-
+
TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa,
- sb,
+ sb,
b + (ls * ldb) * COMPSIZE, ldb, 0);
-
+
for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){
min_jj = min_j - min_l - ls + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
#ifndef TRANSA
GEMM_ONCOPY (min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
sb + min_l * (min_l + jjs) * COMPSIZE);
@@ -187,36 +187,36 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
sb + min_l * (min_l + jjs) * COMPSIZE);
#endif
- GEMM_KERNEL(min_i, min_jj, min_l, dm1,
+ GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa,
+ sa,
sb + min_l * (min_l + jjs) * COMPSIZE,
- b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb);
+ b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb);
}
for(is = min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);
-
+
TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa,
- sb,
+ sb,
b + (is + ls * ldb) * COMPSIZE, ldb, 0);
-
- GEMM_KERNEL(min_i, min_j - min_l + js - ls, min_l, dm1,
+
+ GEMM_KERNEL(min_i, min_j - min_l + js - ls, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa,
+ sa,
sb + min_l * min_l * COMPSIZE,
- b + (is + ( min_l + ls) * ldb) * COMPSIZE, ldb);
+ b + (is + ( min_l + ls) * ldb) * COMPSIZE, ldb);
}
}
}
@@ -235,48 +235,48 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
if (min_i > GEMM_P) min_i = GEMM_P;
GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);
-
+
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
#ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#else
GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#endif
- GEMM_KERNEL(min_i, min_jj, min_l, dm1,
+ GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa, sb + min_l * (jjs - js) * COMPSIZE,
- b + (jjs - min_j) * ldb * COMPSIZE, ldb);
+ b + (jjs - min_j) * ldb * COMPSIZE, ldb);
}
for(is = min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);
-
- GEMM_KERNEL(min_i, min_j, min_l, dm1,
+
+ GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
+ sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
}
}
start_ls = js - min_j;
while (start_ls + GEMM_Q < js) start_ls += GEMM_Q;
-
+
for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){
min_l = js - ls;
if (min_l > GEMM_Q) min_l = GEMM_Q;
min_i = m;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);
#ifndef TRANSA
@@ -286,63 +286,63 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
TRSM_OUTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda,
0, sb + min_l * (min_j - js + ls) * COMPSIZE);
#endif
-
+
TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa,
- sb + min_l * (min_j - js + ls) * COMPSIZE,
+ sb + min_l * (min_j - js + ls) * COMPSIZE,
b + (ls * ldb) * COMPSIZE, ldb, 0);
-
+
for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){
min_jj = min_j - js + ls - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
#ifndef TRANSA
GEMM_ONCOPY (min_l, min_jj, a + (ls + (js - min_j + jjs) * lda) * COMPSIZE, lda,
sb + min_l * jjs * COMPSIZE);
#else
- GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda,
+ GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda,
sb + min_l * jjs * COMPSIZE);
#endif
-
- GEMM_KERNEL(min_i, min_jj, min_l, dm1,
+
+ GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa,
+ sa,
sb + min_l * jjs * COMPSIZE,
- b + (js - min_j + jjs) * ldb * COMPSIZE, ldb);
+ b + (js - min_j + jjs) * ldb * COMPSIZE, ldb);
}
for(is = min_i; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);
-
+
TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa,
- sb + min_l * (min_j - js + ls) * COMPSIZE,
+ sb + min_l * (min_j - js + ls) * COMPSIZE,
b + (is + ls * ldb) * COMPSIZE, ldb, 0);
-
- GEMM_KERNEL(min_i, min_j - js + ls, min_l, dm1,
+
+ GEMM_KERNEL(min_i, min_j - js + ls, min_l, dm1,
#ifdef COMPLEX
ZERO,
#endif
- sa,
+ sa,
sb,
- b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
+ b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
}
}
}
-
+
#endif
-
+
return 0;
}
diff --git a/driver/level3/zher2k_k.c b/driver/level3/zher2k_k.c
index 93bb781..54c76d7 100644
--- a/driver/level3/zher2k_k.c
+++ b/driver/level3/zher2k_k.c
@@ -130,7 +130,7 @@ static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLA
SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0);
- if (i + n_from - m_from + 1 <= m_to)
+ if (i + n_from - m_from + 1 <= m_to)
*(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO;
c += ldc * COMPSIZE;
diff --git a/driver/level3/zher2k_kernel.c b/driver/level3/zher2k_kernel.c
index 9b4c450..92aef88 100644
--- a/driver/level3/zher2k_kernel.c
+++ b/driver/level3/zher2k_kernel.c
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
return 0;
}
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
return 0;
}
@@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
b += offset * k * COMPSIZE;
c += offset * ldc * COMPSIZE;
@@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
#endif
a,
b + (m + offset) * k * COMPSIZE,
- c + (m + offset) * ldc * COMPSIZE, ldc);
+ c + (m + offset) * ldc * COMPSIZE, ldc);
#endif
n = m + offset;
@@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
#ifdef COMPLEX
alpha_i,
#endif
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
a -= offset * k * COMPSIZE;
c -= offset * COMPSIZE;
@@ -139,30 +139,30 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
#endif
a + (n - offset) * k * COMPSIZE,
b,
- c + (n - offset) * COMPSIZE, ldc);
+ c + (n - offset) * COMPSIZE, ldc);
#endif
m = n + offset;
if (m <= 0) return 0;
}
for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) {
-
+
int mm, nn;
-
+
mm = (loop & ~(GEMM_UNROLL_MN - 1));
nn = MIN(GEMM_UNROLL_MN, n - loop);
-
+
#ifndef LOWER
GEMM_KERNEL(mm, nn, k,
alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
+ a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
#endif
-
+
if (flag) {
- GEMM_BETA(nn, nn, 0, ZERO,
+ GEMM_BETA(nn, nn, 0, ZERO,
#ifdef COMPLEX
ZERO,
#endif
@@ -173,17 +173,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
#ifdef COMPLEX
alpha_i,
#endif
- a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
+ a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
#ifndef LOWER
-
+
for (j = 0; j < nn; j ++) {
for (i = 0; i <= j; i ++) {
- c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
+ c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0];
if (i != j) {
- c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
+ c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1];
} else {
c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO;
@@ -205,15 +205,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
}
#endif
}
-
+
#ifdef LOWER
GEMM_KERNEL(m - mm - nn, nn, k,
alpha_r,
#ifdef COMPLEX
alpha_i,
#endif
- a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
- c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
+ a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
+ c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
#endif
}
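
The diagonal-block path of zher2k_kernel.c touched above first runs GEMM_KERNEL into a small scratch buffer and then folds that block into C so the result stays Hermitian: the real parts of sub(i,j) and sub(j,i) add, their imaginary parts subtract, and the imaginary part of every diagonal entry is forced to zero. A minimal standalone sketch of that fold step, assuming column-major storage with interleaved real/imaginary pairs (the helper name is illustrative, not the library's):

    #include <stddef.h>

    /* Fold an nn x nn scratch block (sub) into the Hermitian matrix C,
       mirroring the !LOWER branch above: C is column-major with leading
       dimension ldc, sub has leading dimension nn, both store (re, im)
       pairs, and 'loop' is the offset of the block along the diagonal. */
    static void her2k_fold_diag_block(double *c, size_t ldc,
                                      const double *sub, size_t nn,
                                      size_t loop)
    {
        for (size_t j = 0; j < nn; j++) {
            for (size_t i = 0; i <= j; i++) {
                double       *cij = c   + ((i + loop) + (j + loop) * ldc) * 2;
                const double *sij = sub + (i + j * nn) * 2;
                const double *sji = sub + (j + i * nn) * 2;

                cij[0] += sij[0] + sji[0];        /* real parts add           */
                if (i != j)
                    cij[1] += sij[1] - sji[1];    /* imaginary parts subtract */
                else
                    cij[1] = 0.0;                 /* diagonal stays real      */
            }
        }
    }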
diff --git a/driver/level3/zherk_k.c b/driver/level3/zherk_k.c
index d1ffbdb..2203fc5 100644
--- a/driver/level3/zherk_k.c
+++ b/driver/level3/zherk_k.c
@@ -128,7 +128,7 @@ static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLA
SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0);
- if (i + n_from - m_from + 1 <= m_to)
+ if (i + n_from - m_from + 1 <= m_to)
*(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO;
c += ldc * COMPSIZE;
diff --git a/driver/level3/zherk_kernel.c b/driver/level3/zherk_kernel.c
index fd8ff9c..e4c9e27 100644
--- a/driver/level3/zherk_kernel.c
+++ b/driver/level3/zherk_kernel.c
@@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifndef LOWER
GEMM_KERNEL(m, n, k,
alpha_r, ZERO,
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
return 0;
}
@@ -68,7 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef LOWER
GEMM_KERNEL(m, n, k,
alpha_r, ZERO,
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
return 0;
}
@@ -78,7 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef LOWER
GEMM_KERNEL(m, offset, k,
alpha_r, ZERO,
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
b += offset * k * COMPSIZE;
c += offset * ldc * COMPSIZE;
@@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
alpha_r, ZERO,
a,
b + (m + offset) * k * COMPSIZE,
- c + (m + offset) * ldc * COMPSIZE, ldc);
+ c + (m + offset) * ldc * COMPSIZE, ldc);
#endif
n = m + offset;
@@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifndef LOWER
GEMM_KERNEL(-offset, n, k,
alpha_r, ZERO,
- a, b, c, ldc);
+ a, b, c, ldc);
#endif
a -= offset * k * COMPSIZE;
c -= offset * COMPSIZE;
@@ -122,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
alpha_r, ZERO,
a + (n - offset) * k * COMPSIZE,
b,
- c + (n - offset) * COMPSIZE, ldc);
+ c + (n - offset) * COMPSIZE, ldc);
#endif
m = n + offset;
if (m <= 0) return 0;
@@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifndef LOWER
GEMM_KERNEL(mm, nn, k,
alpha_r, ZERO,
- a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
+ a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
#endif
GEMM_BETA(nn, nn, 0, ZERO, ZERO,
@@ -146,8 +146,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
GEMM_KERNEL(nn, nn, k,
alpha_r, ZERO,
- a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
-
+ a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
+
cc = c + (loop + loop * ldc) * COMPSIZE;
ss = subbuffer;
@@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
cc[i * 2 + 0] += ss[i * 2 + 0];
cc[i * 2 + 1] += ss[i * 2 + 1];
}
-
+
cc[j * 2 + 0] += ss[i * 2 + 0];
cc[j * 2 + 1] = ZERO;
@@ -184,8 +184,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef LOWER
GEMM_KERNEL(m - mm - nn, nn, k,
alpha_r, ZERO,
- a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
- c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
+ a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
+ c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
#endif
}
diff --git a/driver/level3/zsyrk_beta.c b/driver/level3/zsyrk_beta.c
index eb09729..3787e31 100644
--- a/driver/level3/zsyrk_beta.c
+++ b/driver/level3/zsyrk_beta.c
@@ -42,7 +42,7 @@
int CNAME(BLASLONG dummy1, BLASLONG n, BLASLONG dummy2, FLOAT alpha_r, FLOAT alpha_i,
FLOAT *dummy3, BLASLONG dummy4, FLOAT *dummy5, BLASLONG dummy6,
- FLOAT *c, BLASLONG ldc,
+ FLOAT *c, BLASLONG ldc,
FLOAT *dummy7, FLOAT *dummy8, BLASLONG from, BLASLONG to){
BLASLONG i;
diff --git a/driver/mapper/mapper.c b/driver/mapper/mapper.c
index 83805fb..bbf499f 100644
--- a/driver/mapper/mapper.c
+++ b/driver/mapper/mapper.c
@@ -92,7 +92,7 @@ static int mapper_release(struct inode *inode, struct file *fp){
#ifdef CONFIG_BIGPHYS_AREA
bigphysarea_free_pages(buffer[pos].address);
#else
-
+
for (addr = buffer[pos].address; addr < buffer[pos].address + buffer[pos].size; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
}
@@ -121,7 +121,7 @@ int mapper_mapper(struct file *fp, struct vm_area_struct *vma){
all_length = vma->vm_end - vma->vm_start;
current_addr = vma -> vm_start;
-
+
spin_lock(&lock);
while (all_length > 0) {
@@ -133,56 +133,56 @@ int mapper_mapper(struct file *fp, struct vm_area_struct *vma){
pos = 0;
while ((pos < MAX_BUFF_SIZE) && (buffer[pos].address != 0)) pos ++;
-
+
if (pos >= MAX_BUFF_SIZE) {
-
+
printk("Memory Allocator : too much memory allocation requested.\n");
spin_unlock(&lock);
-
+
return -EIO;
}
-
+
#ifdef CONFIG_BIGPHYS_AREA
alloc_addr = (caddr_t)bigphysarea_alloc_pages(length >> PAGE_SHIFT, 1, GFP_KERNEL);
#else
alloc_addr = (caddr_t)kmalloc(length, GFP_KERNEL);
#endif
-
+
if (alloc_addr == (caddr_t)NULL) {
-
+
spin_unlock(&lock);
-
+
return -EIO;
}
-
+
#ifndef CONFIG_BIGPHYS_AREA
for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) {
clear_page(addr);
SetPageReserved(virt_to_page(addr));
}
#endif
-
+
if ((ret = remap_pfn_range(vma,
current_addr,
virt_to_phys((void *)alloc_addr) >> PAGE_SHIFT,
length,
PAGE_SHARED)) < 0) {
-
+
#ifdef CONFIG_BIGPHYS_AREA
bigphysarea_free_pages((caddr_t)alloc_addr);
#else
-
+
for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) ClearPageReserved(virt_to_page(addr));
-
+
kfree((caddr_t)alloc_addr);
#endif
-
+
spin_unlock(&lock);
-
+
return ret;
}
-
+
buffer[pos].pid = current -> tgid;
buffer[pos].address = alloc_addr;
#ifndef CONFIG_BIGPHYS_AREA
@@ -209,11 +209,11 @@ static int __init mapper_init(void){
int ret, i;
ret = alloc_chrdev_region(&mapper_dev, 0, 1, "mapper");
-
+
cdev_init(&mapper_cdev, &mapper_fops);
ret = cdev_add(&mapper_cdev, mapper_dev, 1);
-
+
spin_lock_init(&lock);
for (i = 0; i < MAX_BUFF_SIZE; i++) {
@@ -240,7 +240,7 @@ static void __exit mapper_exit(void){
#endif
}
}
-
+
cdev_del(&mapper_cdev);
unregister_chrdev_region(mapper_dev, 1);
diff --git a/driver/others/Makefile b/driver/others/Makefile
index ca05c51..fc73871 100644
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -1,14 +1,14 @@
TOPDIR = ../..
include ../../Makefile.system
-COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
+COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
ifdef SMP
-COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
+COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
ifndef NO_AFFINITY
-COMMONOBJS += init.$(SUFFIX)
+COMMONOBJS += init.$(SUFFIX)
endif
endif
@@ -56,13 +56,13 @@ ifeq ($(USE_OPENMP), 1)
BLAS_SERVER = blas_server_omp.c
else
ifeq ($(OSNAME), WINNT)
-BLAS_SERVER = blas_server_win32.c
+BLAS_SERVER = blas_server_win32.c
endif
ifeq ($(OSNAME), CYGWIN_NT)
-BLAS_SERVER = blas_server_win32.c
+BLAS_SERVER = blas_server_win32.c
endif
ifeq ($(OSNAME), Interix)
-BLAS_SERVER = blas_server_win32.c
+BLAS_SERVER = blas_server_win32.c
endif
endif
diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c
index 851135b..83fc268 100644
--- a/driver/others/blas_l1_thread.c
+++ b/driver/others/blas_l1_thread.c
@@ -42,9 +42,9 @@
int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
void *a, BLASLONG lda,
- void *b, BLASLONG ldb,
+ void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads){
-
+
blas_queue_t queue[MAX_CPU_NUMBER];
blas_arg_t args [MAX_CPU_NUMBER];
@@ -52,23 +52,23 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
int num_cpu, calc_type;
calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2;
-
+
mode |= BLAS_LEGACY;
for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);
num_cpu = 0;
i = m;
-
+
while (i > 0){
-
+
/* Adjust Parameters */
width = blas_quickdivide(i + nthreads - num_cpu - 1,
nthreads - num_cpu);
i -= width;
if (i < 0) width = width + i;
-
+
astride = width * lda;
if (!(mode & BLAS_TRANSB_T)) {
@@ -95,10 +95,10 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
queue[num_cpu].routine = function;
queue[num_cpu].args = &args[num_cpu];
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
a = (void *)((BLASULONG)a + astride);
b = (void *)((BLASULONG)b + bstride);
-
+
num_cpu ++;
}
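
The splitting loop in blas_level1_thread() above hands each worker roughly ceil(remaining / workers_left) rows via blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu), then advances the a and b pointers by the matching strides. A small sketch of just that splitting rule in plain integer arithmetic (no OpenBLAS types; the function name is made up for illustration):

    #include <stdio.h>

    /* Distribute m rows over nthreads workers the way the level-1 thread
       driver does: each worker takes ceil(remaining / workers_left), so
       the last workers never end up with a disproportionate share. */
    static void split_rows(long m, int nthreads)
    {
        long remaining = m;
        for (int t = 0; t < nthreads && remaining > 0; t++) {
            long left  = nthreads - t;
            long width = (remaining + left - 1) / left;   /* ceiling division */
            printf("worker %d handles %ld rows\n", t, width);
            remaining -= width;
        }
    }

    int main(void) { split_rows(100, 8); return 0; }   /* 13,13,13,13,12,12,12,12 */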
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 1735ee9..e2632c2 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -178,8 +178,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
#ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){
/* REAL / Extended Double */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
- xdouble *, BLASLONG, xdouble *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
+ xdouble *, BLASLONG, xdouble *, BLASLONG,
xdouble *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -187,14 +187,14 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
- } else
+ } else
#endif
if (mode & BLAS_DOUBLE){
/* REAL / Double */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
- double *, BLASLONG, double *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
+ double *, BLASLONG, double *, BLASLONG,
double *, BLASLONG, void *) = func;
-
+
afunc(args -> m, args -> n, args -> k,
((double *)args -> alpha)[0],
args -> a, args -> lda,
@@ -202,10 +202,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb);
} else {
/* REAL / Single */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
- float *, BLASLONG, float *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
+ float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func;
-
+
afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0],
args -> a, args -> lda,
@@ -217,7 +217,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (mode & BLAS_XDOUBLE){
/* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
- xdouble *, BLASLONG, xdouble *, BLASLONG,
+ xdouble *, BLASLONG, xdouble *, BLASLONG,
xdouble *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -231,7 +231,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (mode & BLAS_DOUBLE){
/* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
- double *, BLASLONG, double *, BLASLONG,
+ double *, BLASLONG, double *, BLASLONG,
double *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -243,7 +243,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} else {
/* COMPLEX / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
- float *, BLASLONG, float *, BLASLONG,
+ float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -274,11 +274,11 @@ static int blas_thread_server(void *arg){
#ifdef TIMING_DEBUG
unsigned long start, stop;
#endif
-
+
#if defined(OS_LINUX) && !defined(NO_AFFINITY)
if (!increased_threads)
thread_status[cpu].node = gotoblas_set_affinity(cpu + 1);
- else
+ else
thread_status[cpu].node = gotoblas_set_affinity(-1);
#endif
@@ -291,7 +291,7 @@ static int blas_thread_server(void *arg){
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu);
#endif
-
+
while (1){
#ifdef MONITOR
@@ -303,34 +303,34 @@ static int blas_thread_server(void *arg){
#endif
last_tick = (unsigned int)rpcc();
-
+
while (!thread_status[cpu].queue) {
-
+
YIELDING;
if ((unsigned int)rpcc() - last_tick > thread_timeout) {
-
+
pthread_mutex_lock (&thread_status[cpu].lock);
-
+
if (!thread_status[cpu].queue) {
thread_status[cpu].status = THREAD_STATUS_SLEEP;
while (thread_status[cpu].status == THREAD_STATUS_SLEEP) {
-
+
#ifdef MONITOR
main_status[cpu] = MAIN_SLEEPING;
#endif
-
+
pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
}
}
-
+
pthread_mutex_unlock(&thread_status[cpu].lock);
-
+
last_tick = (unsigned int)rpcc();
}
-
+
}
-
+
queue = thread_status[cpu].queue;
if ((long)queue == -1) break;
@@ -345,19 +345,19 @@ static int blas_thread_server(void *arg){
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
-
+
thread_status[cpu].queue = (blas_queue_t *)1;
sa = queue -> sa;
sb = queue -> sb;
-
+
#ifdef SMP_DEBUG
if (queue -> args) {
fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
}
#endif
-
+
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
@@ -366,21 +366,21 @@ static int blas_thread_server(void *arg){
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING1;
#endif
-
+
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){
#ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){
- sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
+ sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
- } else
+ } else
#endif
if (queue -> mode & BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-
+
} else {
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
@@ -402,7 +402,7 @@ static int blas_thread_server(void *arg){
}
queue->sb=sb;
}
-
+
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING2;
#endif
@@ -423,24 +423,24 @@ static int blas_thread_server(void *arg){
#ifdef MONITOR
main_status[cpu] = MAIN_FINISH;
#endif
-
+
thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
WMB;
}
-
+
#ifdef MONITOR
main_status[cpu] = MAIN_DONE;
#endif
#ifdef TIMING_DEBUG
stop = rpcc();
-
+
fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1,
start, stop,
stop - start);
#endif
-
+
}
/* Shutdown procedure */
@@ -508,7 +508,7 @@ static int blas_monitor(void *arg){
}
sleep(1);
}
-
+
return 0;
}
#endif
@@ -522,50 +522,47 @@ int blas_thread_init(void){
#endif
if (blas_server_avail) return 0;
-
+
#ifdef NEED_STACKATTR
pthread_attr_init(&attr);
pthread_attr_setguardsize(&attr, 0x1000U);
pthread_attr_setstacksize( &attr, 0x1000U);
#endif
-
+
LOCK_COMMAND(&server_lock);
if (!blas_server_avail){
- char *p;
-
- p = getenv("THREAD_TIMEOUT");
+ env_var_t p;
- if (p) {
+ if (readenv(p,"THREAD_TIMEOUT")) {
thread_timeout = atoi(p);
if (thread_timeout < 4) thread_timeout = 4;
if (thread_timeout > 30) thread_timeout = 30;
thread_timeout = (1 << thread_timeout);
}else{
- p = getenv("GOTO_THREAD_TIMEOUT");
- if (p) {
+ if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
thread_timeout = atoi(p);
if (thread_timeout < 4) thread_timeout = 4;
if (thread_timeout > 30) thread_timeout = 30;
thread_timeout = (1 << thread_timeout);
}
}
-
+
for(i = 0; i < blas_num_threads - 1; i++){
thread_status[i].queue = (blas_queue_t *)NULL;
thread_status[i].status = THREAD_STATUS_WAKEUP;
-
+
pthread_mutex_init(&thread_status[i].lock, NULL);
pthread_cond_init (&thread_status[i].wakeup, NULL);
-
+
#ifdef NEED_STACKATTR
- ret=pthread_create(&blas_threads[i], &attr,
+ ret=pthread_create(&blas_threads[i], &attr,
(void *)&blas_thread_server, (void *)i);
#else
- ret=pthread_create(&blas_threads[i], NULL,
+ ret=pthread_create(&blas_threads[i], NULL,
(void *)&blas_thread_server, (void *)i);
#endif
if(ret!=0){
@@ -575,7 +572,7 @@ int blas_thread_init(void){
}
#ifdef MONITOR
- pthread_create(&monitor_thread, NULL,
+ pthread_create(&monitor_thread, NULL,
(void *)&blas_monitor, (void *)NULL);
#endif
@@ -587,7 +584,7 @@ int blas_thread_init(void){
return 0;
}
-/*
+/*
User can call one of two routines.
exec_blas_async ... immediately returns after jobs are queued.
@@ -613,13 +610,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
#ifdef SMP_DEBUG
int exec_count = 0;
fprintf(STDERR, "Exec_blas_async is called. Position = %d\n", pos);
-#endif
-
+#endif
+
blas_lock(&exec_queue_lock);
while (queue) {
queue -> position = pos;
-
+
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
__asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
@@ -633,7 +630,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
do {
while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++;
-
+
if (i < blas_num_threads - 1) break;
i ++;
@@ -657,40 +654,40 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
if (i >= blas_num_threads - 1) i = 0;
}
#endif
-
+
queue -> assigned = i;
WMB;
thread_status[i].queue = queue;
WMB;
-
+
queue = queue -> next;
pos ++;
#ifdef SMP_DEBUG
exec_count ++;
#endif
-
+
}
blas_unlock(&exec_queue_lock);
#ifdef SMP_DEBUG
fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count);
-#endif
-
+#endif
+
while (current) {
-
+
pos = current -> assigned;
-
+
if ((BLASULONG)thread_status[pos].queue > 1) {
-
+
if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
-
+
pthread_mutex_lock (&thread_status[pos].lock);
-
+
#ifdef MONITOR
num_suspend ++;
#endif
-
+
if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
thread_status[pos].status = THREAD_STATUS_WAKEUP;
pthread_cond_signal(&thread_status[pos].wakeup);
@@ -698,7 +695,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
pthread_mutex_unlock(&thread_status[pos].lock);
}
}
-
+
current = current -> next;
}
@@ -708,11 +705,11 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
while ((num > 0) && queue) {
-
+
while(thread_status[queue -> assigned].queue) {
YIELDING;
};
-
+
queue = queue -> next;
num --;
}
@@ -720,7 +717,7 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
#ifdef SMP_DEBUG
fprintf(STDERR, "Done.\n\n");
#endif
-
+
return 0;
}
@@ -738,31 +735,31 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
#endif
if ((num <= 0) || (queue == NULL)) return 0;
-
+
#ifdef SMP_DEBUG
fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num);
-#endif
+#endif
#ifdef __ELF__
if (omp_in_parallel && (num > 1)) {
if (omp_in_parallel() > 0) {
- fprintf(stderr,
+ fprintf(stderr,
"OpenBLAS Warning : Detect OpenMP Loop and this application may hang. "
"Please rebuild the library with USE_OPENMP=1 option.\n");
}
}
#endif
-
+
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
#ifdef TIMING_DEBUG
start = rpcc();
-
+
fprintf(STDERR, "\n");
#endif
-
+
routine = queue -> routine;
-
+
if (queue -> mode & BLAS_LEGACY) {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
} else
@@ -772,19 +769,19 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
} else
(routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0);
-
+
#ifdef TIMING_DEBUG
stop = rpcc();
#endif
-
+
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
-
+
#ifdef TIMING_DEBUG
- fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n",
+ fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n",
start, stop,
stop - start);
#endif
-
+
return 0;
}
@@ -798,7 +795,7 @@ void goto_set_num_threads(int num_threads) {
if (num_threads == 1) {
if (blas_cpu_number == 1){
//OpenBLAS is already single thread.
- return;
+ return;
}else{
//From multi-threads to single thread
//Restore the original affinity mask
@@ -812,26 +809,26 @@ void goto_set_num_threads(int num_threads) {
if (num_threads > blas_num_threads) {
LOCK_COMMAND(&server_lock);
-
+
increased_threads = 1;
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
-
+
thread_status[i].queue = (blas_queue_t *)NULL;
thread_status[i].status = THREAD_STATUS_WAKEUP;
-
+
pthread_mutex_init(&thread_status[i].lock, NULL);
pthread_cond_init (&thread_status[i].wakeup, NULL);
-
+
#ifdef NEED_STACKATTR
- pthread_create(&blas_threads[i], &attr,
+ pthread_create(&blas_threads[i], &attr,
(void *)&blas_thread_server, (void *)i);
#else
- pthread_create(&blas_threads[i], NULL,
+ pthread_create(&blas_threads[i], NULL,
(void *)&blas_thread_server, (void *)i);
#endif
}
-
+
blas_num_threads = num_threads;
UNLOCK_COMMAND(&server_lock);
@@ -846,7 +843,7 @@ void goto_set_num_threads(int num_threads) {
blas_cpu_number = num_threads;
-#if defined(ARCH_MIPS64)
+#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
#endif
@@ -855,7 +852,7 @@ void goto_set_num_threads(int num_threads) {
void openblas_set_num_threads(int num_threads) {
goto_set_num_threads(num_threads);
-
+
}
/* Compatible function with pthread_create / join */
@@ -887,11 +884,11 @@ int gotoblas_pthread(int numthreads, void *function, void *args, int stride) {
args += stride;
}
-
+
queue[numthreads - 1].next = NULL;
-
+
exec_blas(numthreads, queue);
-
+
return 0;
}
@@ -903,17 +900,17 @@ int BLASFUNC(blas_thread_shutdown)(void){
int i;
if (!blas_server_avail) return 0;
-
+
LOCK_COMMAND(&server_lock);
for (i = 0; i < blas_num_threads - 1; i++) {
blas_lock(&exec_queue_lock);
-
+
thread_status[i].queue = (blas_queue_t *)-1;
blas_unlock(&exec_queue_lock);
-
+
pthread_mutex_lock (&thread_status[i].lock);
thread_status[i].status = THREAD_STATUS_WAKEUP;
@@ -931,16 +928,16 @@ int BLASFUNC(blas_thread_shutdown)(void){
for(i = 0; i < blas_num_threads - 1; i++){
pthread_mutex_destroy(&thread_status[i].lock);
pthread_cond_destroy (&thread_status[i].wakeup);
- }
+ }
#ifdef NEED_STACKATTR
pthread_attr_destory(&attr);
#endif
blas_server_avail = 0;
-
+
UNLOCK_COMMAND(&server_lock);
-
+
return 0;
}
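
The blas_thread_init() hunk above swaps getenv() for the readenv() wrapper but keeps the existing policy: THREAD_TIMEOUT (or the legacy GOTO_THREAD_TIMEOUT) is read as an exponent, clamped to 4..30, and the spin budget becomes 1 << value ticks before an idle worker parks itself on its condition variable. A hedged sketch of that rule, using plain getenv() for brevity:

    #include <stdlib.h>

    /* Derive the worker spin budget from THREAD_TIMEOUT / GOTO_THREAD_TIMEOUT.
       The environment value is an exponent clamped to [4, 30]; when neither
       variable is set, the compiled-in default is kept unchanged. */
    static unsigned long thread_timeout_from_env(unsigned long compiled_default)
    {
        const char *p = getenv("THREAD_TIMEOUT");
        if (!p) p = getenv("GOTO_THREAD_TIMEOUT");
        if (!p) return compiled_default;

        int e = atoi(p);
        if (e < 4)  e = 4;
        if (e > 30) e = 30;
        return 1UL << e;
    }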
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 0a484f3..8d62a81 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -79,7 +79,7 @@ void goto_set_num_threads(int num_threads) {
blas_thread_buffer[i]=NULL;
}
}
-#if defined(ARCH_MIPS64)
+#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
#endif
@@ -128,8 +128,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
#ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){
/* REAL / Extended Double */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
- xdouble *, BLASLONG, xdouble *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
+ xdouble *, BLASLONG, xdouble *, BLASLONG,
xdouble *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -137,14 +137,14 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
- } else
+ } else
#endif
if (mode & BLAS_DOUBLE){
/* REAL / Double */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
- double *, BLASLONG, double *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
+ double *, BLASLONG, double *, BLASLONG,
double *, BLASLONG, void *) = func;
-
+
afunc(args -> m, args -> n, args -> k,
((double *)args -> alpha)[0],
args -> a, args -> lda,
@@ -152,10 +152,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb);
} else {
/* REAL / Single */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
- float *, BLASLONG, float *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
+ float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func;
-
+
afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0],
args -> a, args -> lda,
@@ -167,7 +167,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (mode & BLAS_XDOUBLE){
/* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
- xdouble *, BLASLONG, xdouble *, BLASLONG,
+ xdouble *, BLASLONG, xdouble *, BLASLONG,
xdouble *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -181,7 +181,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (mode & BLAS_DOUBLE){
/* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
- double *, BLASLONG, double *, BLASLONG,
+ double *, BLASLONG, double *, BLASLONG,
double *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -193,7 +193,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} else {
/* COMPLEX / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
- float *, BLASLONG, float *, BLASLONG,
+ float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -210,7 +210,7 @@ static void exec_threads(blas_queue_t *queue){
void *buffer, *sa, *sb;
int pos=0, release_flag=0;
-
+
buffer = NULL;
sa = queue -> sa;
sb = queue -> sb;
@@ -235,19 +235,19 @@ static void exec_threads(blas_queue_t *queue){
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
queue->sa=sa;
}
-
+
if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){
#ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){
- sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
+ sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
- } else
+ } else
#endif
if (queue -> mode & BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-
+
} else {
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
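
The sb placement repeated in these server variants carves each per-thread scratch buffer in two: sa starts at GEMM_OFFSET_A, and sb follows one P*Q panel of the active precision, with the panel size rounded up by the (size + GEMM_ALIGN) & ~GEMM_ALIGN mask before GEMM_OFFSET_B is added. A minimal sketch of that address computation with made-up constants standing in for the tuned, per-architecture parameters (GEMM_ALIGN is assumed here to be an alignment mask, i.e. alignment minus one):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative constants only -- the real values are architecture-specific. */
    #define ALIGN_MASK  0x0fffUL     /* round the panel up to a 4 KiB multiple */
    #define OFFSET_A    512UL
    #define OFFSET_B    512UL
    #define DGEMM_P     256UL
    #define DGEMM_Q     256UL

    int main(void)
    {
        uintptr_t buffer = 0x100000;                    /* stand-in base address */
        uintptr_t panel  = (DGEMM_P * DGEMM_Q * sizeof(double) + ALIGN_MASK)
                           & ~ALIGN_MASK;
        uintptr_t sa = buffer + OFFSET_A;
        uintptr_t sb = sa + panel + OFFSET_B;

        printf("sa = %#lx  sb = %#lx\n", (unsigned long)sa, (unsigned long)sb);
        return 0;
    }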
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 100ca34..081bdd7 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -71,8 +71,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
#ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){
/* REAL / Extended Double */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
- xdouble *, BLASLONG, xdouble *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
+ xdouble *, BLASLONG, xdouble *, BLASLONG,
xdouble *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -80,14 +80,14 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
- } else
+ } else
#endif
if (mode & BLAS_DOUBLE){
/* REAL / Double */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
- double *, BLASLONG, double *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
+ double *, BLASLONG, double *, BLASLONG,
double *, BLASLONG, void *) = func;
-
+
afunc(args -> m, args -> n, args -> k,
((double *)args -> alpha)[0],
args -> a, args -> lda,
@@ -95,10 +95,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb);
} else {
/* REAL / Single */
- void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
- float *, BLASLONG, float *, BLASLONG,
+ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
+ float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func;
-
+
afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0],
args -> a, args -> lda,
@@ -110,7 +110,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (mode & BLAS_XDOUBLE){
/* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
- xdouble *, BLASLONG, xdouble *, BLASLONG,
+ xdouble *, BLASLONG, xdouble *, BLASLONG,
xdouble *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -124,7 +124,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (mode & BLAS_DOUBLE){
/* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
- double *, BLASLONG, double *, BLASLONG,
+ double *, BLASLONG, double *, BLASLONG,
double *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -136,7 +136,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} else {
/* COMPLEX / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
- float *, BLASLONG, float *, BLASLONG,
+ float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
@@ -163,47 +163,47 @@ static DWORD WINAPI blas_thread_server(void *arg){
blas_queue_t *queue;
DWORD action;
HANDLE handles[] = {pool.filled, pool.killed};
-
+
/* Each server needs each buffer */
buffer = blas_memory_alloc(2);
-
+
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
#endif
-
+
while (1){
-
+
/* Waiting for Queue */
-
+
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
#endif
-
+
do {
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
} while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
-
+
if (action == WAIT_OBJECT_0 + 1) break;
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
#endif
-
+
EnterCriticalSection(&pool.lock);
-
+
queue = pool.queue;
if (queue) pool.queue = queue->next;
-
+
LeaveCriticalSection(&pool.lock);
-
+
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
-
+
if (pool.queue) SetEvent(pool.filled);
-
+
sa = queue -> sa;
sb = queue -> sb;
-
+
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
@@ -213,27 +213,27 @@ static DWORD WINAPI blas_thread_server(void *arg){
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
#endif
-
+
// fprintf(stderr, "queue start[%ld]!!!\n", cpu);
-
+
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING1;
#endif
-
+
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
-
+
if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){
#ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){
- sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
+ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
- } else
+ } else
#endif
if (queue -> mode & BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-
+
} else {
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
@@ -255,11 +255,11 @@ static DWORD WINAPI blas_thread_server(void *arg){
}
queue->sb=sb;
}
-
+
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING2;
#endif
-
+
if (!(queue -> mode & BLAS_LEGACY)) {
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
@@ -269,28 +269,28 @@ static DWORD WINAPI blas_thread_server(void *arg){
}else{
continue; //if queue == NULL
}
-
+
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
#endif
-
+
EnterCriticalSection(&queue->lock);
-
+
queue -> status = BLAS_STATUS_FINISHED;
-
+
LeaveCriticalSection(&queue->lock);
-
+
SetEvent(queue->finish);
}
-
+
/* Shutdown procedure */
-
+
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
#endif
-
+
blas_memory_free(buffer);
-
+
return 0;
}
@@ -299,11 +299,11 @@ int blas_thread_init(void){
BLASLONG i;
if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
-
+
LOCK_COMMAND(&server_lock);
#ifdef SMP_DEBUG
- fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
+ fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
blas_cpu_number);
#endif
@@ -317,11 +317,11 @@ int blas_thread_init(void){
pool.queue = NULL;
for(i = 0; i < blas_cpu_number - 1; i++){
- blas_threads[i] = CreateThread(NULL, 0,
+ blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
-
+
blas_server_avail = 1;
}
@@ -330,7 +330,7 @@ int blas_thread_init(void){
return 0;
}
-/*
+/*
User can call one of two routines.
exec_blas_async ... immediately returns after jobs are queued.
@@ -387,7 +387,7 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
#endif
WaitForSingleObject(queue->finish, INFINITE);
-
+
CloseHandle(queue->finish);
DeleteCriticalSection(&queue -> lock);
@@ -414,7 +414,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
routine = queue -> routine;
-
+
if (!(queue -> mode & BLAS_LEGACY)) {
(routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0);
@@ -435,7 +435,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
int i;
if (!blas_server_avail) return 0;
-
+
LOCK_COMMAND(&server_lock);
if (blas_server_avail){
@@ -446,12 +446,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
TerminateThread(blas_threads[i],0);
}
-
+
blas_server_avail = 0;
}
-
+
UNLOCK_COMMAND(&server_lock);
-
+
return 0;
}
@@ -466,7 +466,7 @@ void goto_set_num_threads(int num_threads)
if (num_threads > blas_num_threads) {
LOCK_COMMAND(&server_lock);
-
+
//increased_threads = 1;
if (!blas_server_avail){
@@ -478,14 +478,14 @@ void goto_set_num_threads(int num_threads)
pool.queue = NULL;
blas_server_avail = 1;
}
-
- for(i = blas_num_threads - 1; i < num_threads - 1; i++){
-
- blas_threads[i] = CreateThread(NULL, 0,
+
+ for(i = blas_num_threads - 1; i < num_threads - 1; i++){
+
+ blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
-
+
blas_num_threads = num_threads;
UNLOCK_COMMAND(&server_lock);
diff --git a/driver/others/divtable.c b/driver/others/divtable.c
index 7a191db..d801afb 100644
--- a/driver/others/divtable.c
+++ b/driver/others/divtable.c
@@ -39,25 +39,25 @@
#include "common.h"
#ifdef SMP
-#ifndef USE64BITINT
+#if !defined(USE64BITINT) || defined(ARCH_X86)
unsigned int blas_quick_divide_table[] = {
- 0x00000000, 0x00000001, 0x80000001, 0x55555556,
- 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,
- 0x20000001, 0x1c71c71d, 0x1999999a, 0x1745d175,
- 0x15555556, 0x13b13b14, 0x12492493, 0x11111112,
- 0x10000001, 0x0f0f0f10, 0x0e38e38f, 0x0d79435f,
- 0x0ccccccd, 0x0c30c30d, 0x0ba2e8bb, 0x0b21642d,
- 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f,
- 0x0924924a, 0x08d3dcb1, 0x08888889, 0x08421085,
- 0x08000001, 0x07c1f07d, 0x07878788, 0x07507508,
- 0x071c71c8, 0x06eb3e46, 0x06bca1b0, 0x06906907,
- 0x06666667, 0x063e7064, 0x06186187, 0x05f417d1,
- 0x05d1745e, 0x05b05b06, 0x0590b217, 0x0572620b,
- 0x05555556, 0x0539782a, 0x051eb852, 0x05050506,
- 0x04ec4ec5, 0x04d4873f, 0x04bda130, 0x04a7904b,
- 0x04924925, 0x047dc120, 0x0469ee59, 0x0456c798,
- 0x04444445, 0x04325c54, 0x04210843, 0x04104105,
- 0x04000001,
+ 0x00000000, 0x00000001, 0x80000001, 0x55555556,
+ 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,
+ 0x20000001, 0x1c71c71d, 0x1999999a, 0x1745d175,
+ 0x15555556, 0x13b13b14, 0x12492493, 0x11111112,
+ 0x10000001, 0x0f0f0f10, 0x0e38e38f, 0x0d79435f,
+ 0x0ccccccd, 0x0c30c30d, 0x0ba2e8bb, 0x0b21642d,
+ 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f,
+ 0x0924924a, 0x08d3dcb1, 0x08888889, 0x08421085,
+ 0x08000001, 0x07c1f07d, 0x07878788, 0x07507508,
+ 0x071c71c8, 0x06eb3e46, 0x06bca1b0, 0x06906907,
+ 0x06666667, 0x063e7064, 0x06186187, 0x05f417d1,
+ 0x05d1745e, 0x05b05b06, 0x0590b217, 0x0572620b,
+ 0x05555556, 0x0539782a, 0x051eb852, 0x05050506,
+ 0x04ec4ec5, 0x04d4873f, 0x04bda130, 0x04a7904b,
+ 0x04924925, 0x047dc120, 0x0469ee59, 0x0456c798,
+ 0x04444445, 0x04325c54, 0x04210843, 0x04104105,
+ 0x04000001,
};
#else
BLASULONG blas_quick_divide_table[] = {
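
Each 32-bit entry in the table above is floor(2^32 / n) + 1, so dividing a small work count by n can be done with one multiply and a 32-bit shift instead of a hardware divide; the #if change in this hunk just extends use of the 32-bit table to ARCH_X86 builds even when USE64BITINT is set. A self-check of that identity, assuming blas_quickdivide is the usual multiply-high trick with a special case for n <= 1 (verified here only for modest dividends, which is the regime the thread dispatcher uses):

    #include <assert.h>
    #include <stdint.h>

    /* entry(n) = floor(2^32 / n) + 1, matching the table values above. */
    static uint32_t entry(uint32_t n)
    {
        return (uint32_t)(0x100000000ULL / n) + 1;
    }

    /* quick_div(x, n) ~ x / n via multiply-high; n == 1 is special-cased
       because its table entry (0x00000001) cannot be used this way. */
    static uint32_t quick_div(uint32_t x, uint32_t n)
    {
        if (n <= 1) return x;
        return (uint32_t)(((uint64_t)x * entry(n)) >> 32);
    }

    int main(void)
    {
        for (uint32_t n = 1; n <= 64; n++)
            for (uint32_t x = 0; x < 4096; x++)
                assert(quick_div(x, n) == x / n);
        return 0;
    }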
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 905efb1..1235df2 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -66,7 +66,11 @@ extern gotoblas_t gotoblas_BOBCAT;
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
+#ifdef NO_AVX2
+#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
+#else
extern gotoblas_t gotoblas_HASWELL;
+#endif
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
@@ -95,7 +99,7 @@ int support_avx(){
#ifndef NO_AVX
int eax, ebx, ecx, edx;
int ret=0;
-
+
cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
@@ -116,18 +120,24 @@ extern void openblas_warning(int verbose, const char * msg);
static int get_vendor(void){
int eax, ebx, ecx, edx;
- char vendor[13];
+
+ union
+ {
+ char vchar[16];
+ int vint[4];
+ } vendor;
cpuid(0, &eax, &ebx, &ecx, &edx);
-
- *(int *)(&vendor[0]) = ebx;
- *(int *)(&vendor[4]) = edx;
- *(int *)(&vendor[8]) = ecx;
- vendor[12] = (char)0;
- if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL;
- if (!strcmp(vendor, "AuthenticAMD")) return VENDOR_AMD;
- if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR;
+ *(&vendor.vint[0]) = ebx;
+ *(&vendor.vint[1]) = edx;
+ *(&vendor.vint[2]) = ecx;
+
+ vendor.vchar[12] = '\0';
+
+ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
+ if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
+ if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
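
The get_vendor() rewrite above fills a union with the three CPUID registers instead of storing ints through a cast char pointer, so the byte-level reinterpretation of the vendor string is properly aligned and no longer relies on type punning through pointer casts. A hedged sketch of the same pattern with a stubbed cpuid (the real function uses inline assembly; the register order ebx, edx, ecx is what spells the vendor string):

    #include <stdio.h>
    #include <string.h>

    /* Stub standing in for cpuid(0, ...): a GenuineIntel part returns
       ebx = "Genu", edx = "ineI", ecx = "ntel". */
    static void cpuid0_stub(unsigned *ebx, unsigned *ecx, unsigned *edx)
    {
        memcpy(ebx, "Genu", 4);
        memcpy(edx, "ineI", 4);
        memcpy(ecx, "ntel", 4);
    }

    int main(void)
    {
        union {
            char     vchar[16];
            unsigned vint[4];
        } vendor;

        cpuid0_stub(&vendor.vint[0], &vendor.vint[2], &vendor.vint[1]);
        vendor.vchar[12] = '\0';

        printf("vendor: %s\n", vendor.vchar);   /* prints "GenuineIntel" */
        return 0;
    }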
@@ -173,7 +183,7 @@ static gotoblas_t *get_coretype(void){
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
if (model == 5) return &gotoblas_NEHALEM;
-
+
//Intel Xeon Processor 5600 (Westmere-EP)
//Xeon Processor E7 (Westmere-EX)
//Xeon E7540
@@ -232,7 +242,7 @@ static gotoblas_t *get_coretype(void){
if (family <= 0xe) {
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
- if (eax & 0xffff >= 0x01) {
+ if ( (eax & 0xffff) >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
return NULL;
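
The one-character change above, (eax & 0xffff) >= 0x01, is a genuine bug fix rather than whitespace: relational operators bind tighter than bitwise &, so the old expression eax & 0xffff >= 0x01 parsed as eax & (0xffff >= 0x01), i.e. eax & 1, which tests only the lowest bit. A two-assert demonstration:

    #include <assert.h>

    int main(void)
    {
        unsigned eax = 0x8000;   /* low 16 bits non-zero, bit 0 clear */

        assert((eax & 0xffff >= 0x01)   == 0);   /* parses as eax & 1       */
        assert(((eax & 0xffff) >= 0x01) == 1);   /* the intended comparison */
        return 0;
    }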
@@ -244,7 +254,7 @@ static gotoblas_t *get_coretype(void){
}
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) {
- if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
+ if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
else return &gotoblas_OPTERON;
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
@@ -279,7 +289,7 @@ static gotoblas_t *get_coretype(void){
break;
}
}
-
+
return NULL;
}
@@ -320,7 +330,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
if (gotoblas == &gotoblas_ATHLON) return corename[11];
- if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
+ if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
if (gotoblas == &gotoblas_OPTERON) return corename[13];
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
if (gotoblas == &gotoblas_NANO) return corename[15];
@@ -333,12 +343,80 @@ char *gotoblas_corename(void) {
return corename[0];
}
+
+static gotoblas_t *force_coretype(char *coretype){
+
+ int i ;
+ int found = -1;
+ char message[128];
+ char mname[20];
+
+ for ( i=1 ; i <= 20; i++)
+ {
+ if (!strncasecmp(coretype,corename[i],20))
+ {
+ found = i;
+ break;
+ }
+ }
+ if (found < 0)
+ {
+ strncpy(mname,coretype,20);
+ sprintf(message, "Core not found: %s\n",mname);
+ openblas_warning(1, message);
+ return(NULL);
+ }
+
+ switch (found)
+ {
+
+ case 20: return (&gotoblas_HASWELL);
+ case 19: return (&gotoblas_PILEDRIVER);
+ case 18: return (&gotoblas_BULLDOZER);
+ case 17: return (&gotoblas_BOBCAT);
+ case 16: return (&gotoblas_SANDYBRIDGE);
+ case 15: return (&gotoblas_NANO);
+ case 14: return (&gotoblas_BARCELONA);
+ case 13: return (&gotoblas_OPTERON);
+ case 12: return (&gotoblas_OPTERON_SSE3);
+ case 11: return (&gotoblas_ATHLON);
+ case 10: return (&gotoblas_NEHALEM);
+ case 9: return (&gotoblas_DUNNINGTON);
+ case 8: return (&gotoblas_PENRYN);
+ case 7: return (&gotoblas_CORE2);
+ case 6: return (&gotoblas_ATOM);
+ case 5: return (&gotoblas_BANIAS);
+ case 4: return (&gotoblas_PRESCOTT);
+ case 3: return (&gotoblas_NORTHWOOD);
+ case 2: return (&gotoblas_COPPERMINE);
+ case 1: return (&gotoblas_KATMAI);
+ }
+ return(NULL);
+
+}
+
+
+
+
void gotoblas_dynamic_init(void) {
-
+
+ char coremsg[128];
+ char coren[22];
+ char *p;
+
+
if (gotoblas) return;
- gotoblas = get_coretype();
-
+ p = getenv("OPENBLAS_CORETYPE");
+ if ( p )
+ {
+ gotoblas = force_coretype(p);
+ }
+ else
+ {
+ gotoblas = get_coretype();
+ }
+
#ifdef ARCH_X86
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else
@@ -353,18 +431,21 @@ void gotoblas_dynamic_init(void) {
gotoblas = &gotoblas_PRESCOTT;
}
#endif
-
+
if (gotoblas && gotoblas -> init) {
+ strncpy(coren,gotoblas_corename(),20);
+ sprintf(coremsg, "Core: %s\n",coren);
+ openblas_warning(2, coremsg);
gotoblas -> init();
} else {
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1);
}
-
+
}
void gotoblas_dynamic_quit(void) {
-
+
gotoblas = NULL;
}
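
The force_coretype() helper added above lets a DYNAMIC_ARCH build skip auto-detection: gotoblas_dynamic_init() now checks OPENBLAS_CORETYPE first and, on a case-insensitive match against corename[], dispatches to that kernel set; an unknown name produces a warning and the same generic fallback used when detection fails. In practice it is driven from the shell, e.g. OPENBLAS_CORETYPE=NEHALEM ./app. A reduced sketch of the lookup, with a shortened name table for illustration:

    #include <stdio.h>
    #include <string.h>
    #include <strings.h>

    static const char *corename[] = { "UNKNOWN", "KATMAI", "COPPERMINE",
                                      "NORTHWOOD", "PRESCOTT", "NEHALEM",
                                      "SANDYBRIDGE", "HASWELL" };

    /* Return the table index of a forced core name, or -1 when unknown
       (the real code then warns and falls back to a generic kernel set). */
    static int force_coretype(const char *coretype)
    {
        int count = (int)(sizeof(corename) / sizeof(corename[0]));
        for (int i = 1; i < count; i++)
            if (!strncasecmp(coretype, corename[i], 20))
                return i;
        fprintf(stderr, "Core not found: %s\n", coretype);
        return -1;
    }

    int main(void)
    {
        printf("%d\n", force_coretype("haswell"));   /* 7  */
        printf("%d\n", force_coretype("epyc"));      /* -1 */
        return 0;
    }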
diff --git a/driver/others/init.c b/driver/others/init.c
index cbcf229..30d35e0 100644
--- a/driver/others/init.c
+++ b/driver/others/init.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -181,8 +181,8 @@ static inline int rcount(unsigned long number) {
}
/***
- Known issue: The number of CPUs/cores should less
- than sizeof(unsigned long). On 64 bits, the limit
+ Known issue: The number of CPUs/cores should less
+ than sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32.
***/
static inline void get_cpumap(int node, unsigned long * node_info) {
@@ -197,14 +197,14 @@ static inline void get_cpumap(int node, unsigned long * node_info) {
int k=0;
sprintf(name, CPUMAP_NAME, node);
-
+
infile = open(name, O_RDONLY);
for(i=0; i<32; i++){
affinity[i] = 0;
}
if (infile != -1) {
-
+
read(infile, cpumap, sizeof(cpumap));
for(i=0; i<160; i++){
@@ -212,7 +212,7 @@ static inline void get_cpumap(int node, unsigned long * node_info) {
break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
-
+
//Enough data for Hex
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
@@ -249,7 +249,7 @@ static inline void get_share(int cpu, int level, unsigned long * share) {
int bitmask_idx = 0;
sprintf(name, SHARE_NAME, cpu, level);
-
+
infile = open(name, O_RDONLY);
// Init share
@@ -260,7 +260,7 @@ static inline void get_share(int cpu, int level, unsigned long * share) {
share[bitmask_idx] = CPUMASK(cpu);
if (infile != -1) {
-
+
read(infile, cpumap, sizeof(cpumap));
for(i=0; i<160; i++){
@@ -268,8 +268,8 @@ static inline void get_share(int cpu, int level, unsigned long * share) {
break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
-
- //Enough data
+
+ //Enough data
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
@@ -287,8 +287,8 @@ static inline void get_share(int cpu, int level, unsigned long * share) {
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
share[i]=affinity[count-i-1];
}
-
-
+
+
close(infile);
}
@@ -369,7 +369,7 @@ static void numa_mapping(void) {
#ifdef DEBUG
fprintf(stderr, "\nFrom /sys ...\n\n");
- for (cpu = 0; cpu < count; cpu++)
+ for (cpu = 0; cpu < count; cpu++)
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
#endif
@@ -406,7 +406,7 @@ static void numa_mapping(void) {
#ifdef DEBUG
fprintf(stderr, "\nSorting ...\n\n");
- for (cpu = 0; cpu < count; cpu++)
+ for (cpu = 0; cpu < count; cpu++)
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
#endif
@@ -453,12 +453,12 @@ static void disable_hyperthread(void) {
share[i] &= common->avail[i];
if (popcount(share[i]) > 1) {
-
+
#ifdef DEBUG
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n",
cpu, share[i] & ~(CPUMASK(cpu)));
#endif
-
+
common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu)));
}
}
@@ -514,7 +514,7 @@ static void setup_mempolicy(void) {
for (cpu = 0; cpu < numprocs; cpu ++) {
mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[cpu]]);
-
+
lnodemask |= (1UL << mynode);
node_cpu[mynode] ++;
@@ -527,11 +527,11 @@ static void setup_mempolicy(void) {
for (cpu = 0; cpu < MAX_NODES; cpu ++) if ((node_cpu[cpu] != 0) && (node_cpu[cpu] != maxcpu)) node_equal = 0;
if (lnodemask) {
-
+
#ifdef DEBUG
fprintf(stderr, "Node mask = %lx\n", lnodemask);
#endif
-
+
my_set_mempolicy(MPOL_INTERLEAVE, &lnodemask, sizeof(lnodemask) * 8);
numnodes = popcount(lnodemask);
@@ -551,11 +551,11 @@ static void open_shmem(void) {
do {
shmid = shmget(SH_MAGIC, 4096, 0666);
-
+
if (shmid == -1) {
shmid = shmget(SH_MAGIC, 4096, IPC_CREAT | 0666);
}
-
+
try ++;
} while ((try < 10) && (shmid == -1));
@@ -599,7 +599,7 @@ static void local_cpu_map(void) {
if (id > 0) {
if (is_dead(id)) common -> cpu_use[cpu] = 0;
}
-
+
bitmask_idx = CPUELT(cpu);
if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) {
@@ -611,9 +611,9 @@ static void local_cpu_map(void) {
}
cpu ++;
-
+
} while ((mapping < numprocs) && (cpu < common -> final_num_procs));
-
+
disable_mapping = 0;
if ((mapping < numprocs) || (numprocs == 1)) {
@@ -622,7 +622,7 @@ static void local_cpu_map(void) {
}
disable_mapping = 1;
}
-
+
#ifdef DEBUG
for (cpu = 0; cpu < numprocs; cpu ++) {
fprintf(stderr, "Local Mapping : %2d --> %2d (%2d)\n", cpu, cpu_mapping[cpu], cpu_sub_mapping[cpu]);
@@ -634,14 +634,14 @@ static void local_cpu_map(void) {
int get_num_procs(void) { return numprocs; }
int get_num_nodes(void) { return numnodes; }
-int get_node_equal(void) {
+int get_node_equal(void) {
return (((blas_cpu_number % numnodes) == 0) && node_equal);
-
+
}
int gotoblas_set_affinity(int pos) {
-
+
cpu_set_t cpu_mask;
int mynode = 1;
@@ -662,7 +662,7 @@ int gotoblas_set_affinity(int pos) {
CPU_ZERO(&cpu_mask);
CPU_SET (cpu_mapping[pos], &cpu_mask);
-
+
sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask);
node_mapping[WhereAmI()] = mynode;
@@ -672,7 +672,7 @@ int gotoblas_set_affinity(int pos) {
return mynode;
}
-int get_node(void) {
+int get_node(void) {
if (!disable_mapping) return node_mapping[WhereAmI()];
@@ -694,15 +694,15 @@ void gotoblas_affinity_init(void) {
initialized = 1;
sched_getaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
-
+
#ifdef USE_OPENMP
numprocs = 0;
#else
- numprocs = readenv("OPENBLAS_NUM_THREADS");
- if (numprocs == 0) numprocs = readenv("GOTO_NUM_THREADS");
+ numprocs = readenv_atoi("OPENBLAS_NUM_THREADS");
+ if (numprocs == 0) numprocs = readenv_atoi("GOTO_NUM_THREADS");
#endif
- if (numprocs == 0) numprocs = readenv("OMP_NUM_THREADS");
+ if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS");
numnodes = 1;
@@ -746,9 +746,9 @@ void gotoblas_affinity_init(void) {
}
for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu;
-
+
numa_check();
-
+
disable_hyperthread();
if (common -> num_nodes > 1) numa_mapping();
@@ -786,14 +786,14 @@ void gotoblas_affinity_init(void) {
CPU_ZERO(&cpu_mask);
CPU_SET (cpu_mapping[0], &cpu_mask);
-
+
sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask);
node_mapping[WhereAmI()] = READ_NODE(common -> cpu_info[cpu_sub_mapping[0]]);
setup_mempolicy();
- if (readenv("OPENBLAS_MAIN_FREE") || readenv("GOTOBLAS_MAIN_FREE")) {
+ if (readenv_atoi("OPENBLAS_MAIN_FREE") || readenv_atoi("GOTOBLAS_MAIN_FREE")) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
}
@@ -817,13 +817,13 @@ void gotoblas_affinity_quit(void) {
if ((numprocs == 1) || (initialized == 0)) return;
if (!disable_mapping) {
-
+
blas_lock(&common -> lock);
-
+
for (i = 0; i < numprocs; i ++) common -> cpu_use[cpu_mapping[i]] = -1;
-
+
blas_unlock(&common -> lock);
-
+
}
shmctl(shmid, IPC_STAT, &ds);
diff --git a/driver/others/lamc3.c b/driver/others/lamc3.c
index 439ef6e..acc4b50 100644
--- a/driver/others/lamc3.c
+++ b/driver/others/lamc3.c
@@ -44,7 +44,7 @@ double
FLOAT
#endif
NAME(FLOAT *a, FLOAT *b){
-
+
return *a + *b;
}
diff --git a/driver/others/lamch.c b/driver/others/lamch.c
index b044500..cdbc0ee 100644
--- a/driver/others/lamch.c
+++ b/driver/others/lamch.c
@@ -152,7 +152,7 @@ double
FLOAT
#endif
NAME(char *P){
-
+
char p = *P;
int pos;
FLOAT *hdata = (FLOAT *)idata;
diff --git a/driver/others/memory.c b/driver/others/memory.c
index 24a9203..f44b37b 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -136,8 +136,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
-#define CONSTRUCTOR __attribute__ ((constructor))
-#define DESTRUCTOR __attribute__ ((destructor))
+#define CONSTRUCTOR __attribute__ ((constructor))
+#define DESTRUCTOR __attribute__ ((destructor))
#ifdef DYNAMIC_ARCH
gotoblas_t *gotoblas = NULL;
@@ -171,32 +171,32 @@ int get_num_procs(void) {
#ifdef OS_WINDOWS
int get_num_procs(void) {
-
+
static int nums = 0;
if (nums == 0) {
SYSTEM_INFO sysinfo;
-
+
GetSystemInfo(&sysinfo);
nums = sysinfo.dwNumberOfProcessors;
}
-
+
return nums;
}
#endif
-#if defined(OS_FREEBSD)
+#if defined(OS_FREEBSD)
int get_num_procs(void) {
-
+
static int nums = 0;
int m[2];
size_t len;
-
+
if (nums == 0) {
m[0] = CTL_HW;
m[1] = HW_NCPU;
@@ -232,7 +232,7 @@ void set_stack_limit(int limitMB){
rl.rlim_cur=StackSize;
result=setrlimit(RLIMIT_STACK, &rl);
if(result !=0){
- fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
+ fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
}
}
}
@@ -241,12 +241,12 @@ void set_stack_limit(int limitMB){
#endif
/*
-OpenBLAS uses the numbers of CPU cores in multithreading.
+OpenBLAS uses the numbers of CPU cores in multithreading.
It can be set by openblas_set_num_threads(int num_threads);
*/
int blas_cpu_number = 0;
/*
-The numbers of threads in the thread pool.
+The numbers of threads in the thread pool.
This value is equal or large than blas_cpu_number. This means some threads are sleep.
*/
int blas_num_threads = 0;
@@ -273,7 +273,7 @@ void openblas_fork_handler()
}
int blas_get_cpu_number(void){
- char *p;
+ env_var_t p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
int max_num;
#endif
@@ -288,21 +288,18 @@ int blas_get_cpu_number(void){
blas_goto_num = 0;
#ifndef USE_OPENMP
- p = getenv("OPENBLAS_NUM_THREADS");
- if (p) blas_goto_num = atoi(p);
+ if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
if (blas_goto_num < 0) blas_goto_num = 0;
if (blas_goto_num == 0) {
- p = getenv("GOTO_NUM_THREADS");
- if (p) blas_goto_num = atoi(p);
+ if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
if (blas_goto_num < 0) blas_goto_num = 0;
}
-
+
#endif
blas_omp_num = 0;
- p = getenv("OMP_NUM_THREADS");
- if (p) blas_omp_num = atoi(p);
+ if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
if (blas_omp_num < 0) blas_omp_num = 0;
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
@@ -318,8 +315,8 @@ int blas_get_cpu_number(void){
#ifdef DEBUG
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
#endif
-
- blas_cpu_number = blas_num_threads;
+
+ blas_cpu_number = blas_num_threads;
return blas_num_threads;
}
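As a side note on the readenv() conversion above: the lookup order is OPENBLAS_NUM_THREADS, then GOTO_NUM_THREADS, then OMP_NUM_THREADS, with the detected core count as the fallback (and, on the listed platforms, as an upper bound). A minimal standalone sketch of that precedence, using plain getenv() rather than the readenv() macro introduced by this patch; sketch_thread_count is a hypothetical helper, not part of OpenBLAS:

    #include <stdlib.h>

    /* Sketch only: mirrors the env-var precedence in blas_get_cpu_number(). */
    static int sketch_thread_count(int detected_cores) {
        const char *names[] = { "OPENBLAS_NUM_THREADS", "GOTO_NUM_THREADS", "OMP_NUM_THREADS" };
        int i, n = 0;
        for (i = 0; i < 3 && n <= 0; i++) {      /* first positive setting wins */
            const char *p = getenv(names[i]);
            if (p) n = atoi(p);
        }
        if (n <= 0 || n > detected_cores)        /* fall back to / clamp at detected cores */
            n = detected_cores;
        return n;
    }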
@@ -355,12 +352,12 @@ static void *alloc_mmap(void *address){
void *map_address;
if (address){
- map_address = mmap(address,
- BUFFER_SIZE,
+ map_address = mmap(address,
+ BUFFER_SIZE,
MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
} else {
- map_address = mmap(address,
- BUFFER_SIZE,
+ map_address = mmap(address,
+ BUFFER_SIZE,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
}
@@ -387,7 +384,7 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
BLASULONG original, *p;
BLASULONG start, stop, min;
int iter, i, count;
-
+
min = (BLASULONG)-1;
original = *(BLASULONG *)(address + size - PAGESIZE);
@@ -397,20 +394,20 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
p = (BLASULONG *)address;
-
+
count = size / PAGESIZE;
-
+
start = rpcc();
-
+
for (i = 0; i < count; i ++) {
p = (BLASULONG *)(*p);
}
-
+
stop = rpcc();
-
+
if (min > stop - start) min = stop - start;
}
-
+
*(BLASULONG *)(address + size - PAGESIZE + 0) = original;
*(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
@@ -442,11 +439,11 @@ static void *alloc_mmap(void *address){
} else {
#endif
- map_address = mmap(NULL, BUFFER_SIZE * SCALING,
+ map_address = mmap(NULL, BUFFER_SIZE * SCALING,
MMAP_ACCESS, MMAP_POLICY, -1, 0);
-
+
if (map_address != (void *)-1) {
-
+
#ifdef OS_LINUX
#ifdef DEBUG
int ret=0;
@@ -462,45 +459,45 @@ static void *alloc_mmap(void *address){
#endif
#endif
-
+
allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
-
+
start = (BLASULONG)map_address;
current = (SCALING - 1) * BUFFER_SIZE;
-
+
while(current > 0) {
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
start += PAGESIZE;
current -= PAGESIZE;
}
-
+
*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
-
+
start = (BLASULONG)map_address;
-
+
best = (BLASULONG)-1;
best_address = map_address;
-
+
while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
-
+
current = run_bench(start, allocsize);
-
+
if (best > current) {
best = current;
best_address = (void *)start;
}
-
+
start += PAGESIZE;
-
+
}
-
+
if ((BLASULONG)best_address > (BLASULONG)map_address)
munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
-
+
munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
-
+
map_address = best_address;
-
+
#if defined(OS_LINUX) && !defined(NO_WARMUP)
hot_alloc = 2;
#endif
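The hunks above are mostly whitespace normalization, but the logic they touch is worth restating: alloc_mmap() over-maps BUFFER_SIZE * SCALING bytes, threads a linked list through the pages, and uses run_bench() to time a pointer chase from each candidate start, keeping the fastest window and unmapping the rest. A compressed sketch of that measurement idea (not the OpenBLAS code itself; read_cycle_counter() is a hypothetical stand-in for the rpcc() used in the patch):

    extern unsigned long read_cycle_counter(void);   /* stand-in for rpcc() */

    /* Time a chase over `pages` pages; each page's first word holds the
       address of the next page, exactly like the list built in alloc_mmap(). */
    static unsigned long chase_pages(void *base, long pages) {
        volatile unsigned long *p = (volatile unsigned long *)base;
        unsigned long start = read_cycle_counter();
        long i;
        for (i = 0; i < pages; i++)
            p = (volatile unsigned long *)(*p);      /* follow the planted link */
        unsigned long stop = read_cycle_counter();
        return stop - start;                         /* smaller = better-placed window */
    }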
@@ -632,7 +629,7 @@ static void alloc_devicedirver_free(struct release_t *release){
}
static void *alloc_devicedirver(void *address){
-
+
int fd;
void *map_address;
@@ -646,7 +643,7 @@ static void *alloc_devicedirver(void *address){
PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED,
fd, 0);
-
+
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = fd;
@@ -671,9 +668,9 @@ static void alloc_shm_free(struct release_t *release){
static void *alloc_shm(void *address){
void *map_address;
int shmid;
-
+
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
-
+
map_address = (void *)shmat(shmid, address, 0);
if (map_address != (void *)-1){
@@ -725,7 +722,7 @@ static void *alloc_hugetlb(void *address){
#if defined(OS_LINUX) || defined(OS_AIX)
int shmid;
-
+
shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
#ifdef OS_LINUX
SHM_HUGETLB |
@@ -734,10 +731,10 @@ static void *alloc_hugetlb(void *address){
SHM_LGPAGE | SHM_PIN |
#endif
IPC_CREAT | SHM_R | SHM_W);
-
+
if (shmid != -1) {
map_address = (void *)shmat(shmid, address, SHM_RND);
-
+
#ifdef OS_LINUX
my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif
@@ -750,7 +747,7 @@ static void *alloc_hugetlb(void *address){
#ifdef __sun__
struct memcntl_mha mha;
-
+
mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
mha.mha_flags = 0;
mha.mha_pagesize = HUGE_PAGESIZE;
@@ -769,19 +766,26 @@ static void *alloc_hugetlb(void *address){
tp.PrivilegeCount = 1;
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
- if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1;
+ if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
+ CloseHandle(hToken);
+ return -1;
+ }
- if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) return (void *) -1;
+ if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
+ CloseHandle(hToken);
+ return -1;
+ }
map_address = (void *)VirtualAlloc(address,
BUFFER_SIZE,
MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE);
- AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL);
+ tp.Privileges[0].Attributes = 0;
+ AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
if (map_address == (void *)NULL) map_address = (void *)-1;
-
+
#endif
if (map_address != (void *)-1){
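The Windows branch above is a behavioral fix, not reformatting: the privilege is now dropped by clearing the attribute and calling AdjustTokenPrivileges with DisableAllPrivileges = FALSE (the old TRUE argument disabled every privilege in the token), and the token handle is closed on the early-return paths. A hedged, self-contained sketch of the enable / allocate / revert pattern (error handling trimmed; the function name and size handling are illustrative only):

    #include <windows.h>

    /* bytes should be a multiple of GetLargePageMinimum(), and the caller
       needs the "Lock pages in memory" user right for this to succeed. */
    void *alloc_large_pages_sketch(SIZE_T bytes) {
        HANDLE hToken;
        TOKEN_PRIVILEGES tp;
        void *p = NULL;

        if (!OpenProcessToken(GetCurrentProcess(),
                              TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))
            return NULL;

        tp.PrivilegeCount = 1;
        tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
        if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) &&
            AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL)) {
            /* NB: a TRUE return may still mean ERROR_NOT_ALL_ASSIGNED. */
            p = VirtualAlloc(NULL, bytes,
                             MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
                             PAGE_READWRITE);

            tp.Privileges[0].Attributes = 0;            /* revert only this privilege */
            AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
        }
        CloseHandle(hToken);
        return p;
    }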
@@ -829,7 +833,7 @@ static void *alloc_hugetlbfile(void *address){
PROT_READ | PROT_WRITE,
MAP_SHARED,
fd, 0);
-
+
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = fd;
@@ -882,7 +886,7 @@ static void gotoblas_memory_init(void);
/* 2 : Thread */
void *blas_memory_alloc(int procpos){
-
+
int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos;
@@ -917,11 +921,11 @@ void *blas_memory_alloc(int procpos){
void *(**func)(void *address);
if (!memory_initialized) {
-
+
LOCK_COMMAND(&alloc_lock);
-
+
if (!memory_initialized) {
-
+
#if defined(WHEREAMI) && !defined(USE_OPENMP)
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
@@ -930,7 +934,7 @@ void *blas_memory_alloc(int procpos){
memory[position].lock = 0;
}
#endif
-
+
#ifdef DYNAMIC_ARCH
gotoblas_dynamic_init();
#endif
@@ -938,11 +942,11 @@ void *blas_memory_alloc(int procpos){
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
gotoblas_affinity_init();
#endif
-
+
#ifdef SMP
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif
-
+
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
#ifndef DYNAMIC_ARCH
blas_set_parameter();
@@ -968,16 +972,16 @@ void *blas_memory_alloc(int procpos){
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
-
+
blas_lock(&memory[position].lock);
-
+
if (!memory[position].used) goto allocation;
-
+
blas_unlock(&memory[position].lock);
}
-
+
position ++;
-
+
} while (position < NUM_BUFFERS);
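These hunks only strip trailing whitespace, but the pattern they pass through is the interesting part: blas_memory_alloc() scans the static buffer table with a cheap unlocked check, takes the per-slot spin lock, then re-checks `used` under the lock before jumping to the allocation label. A minimal sketch of that check / lock / re-check idiom (blas_lock/blas_unlock stand for the spin-lock primitives in common.h; the slot struct is trimmed to the relevant fields):

    struct slot { void *addr; int used; unsigned long lock; };

    /* Claim a free slot, or return -1 if the table is exhausted. */
    static int claim_slot(struct slot *table, int nslots) {
        int pos;
        for (pos = 0; pos < nslots; pos++) {
            if (!table[pos].used) {                /* cheap check without the lock */
                blas_lock(&table[pos].lock);
                if (!table[pos].used) {            /* re-check now that we hold it */
                    table[pos].used = 1;
                    blas_unlock(&table[pos].lock);
                    return pos;
                }
                blas_unlock(&table[pos].lock);     /* lost the race, keep scanning */
            }
        }
        return -1;
    }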
@@ -987,18 +991,18 @@ void *blas_memory_alloc(int procpos){
do {
if (!memory[position].used) {
-
+
blas_lock(&memory[position].lock);
if (!memory[position].used) goto allocation;
-
+
blas_unlock(&memory[position].lock);
}
-
+
position ++;
-
+
} while (position < NUM_BUFFERS);
-
+
goto error;
allocation :
@@ -1055,13 +1059,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
- memory[position].addr = map_address;
+ memory[position].addr = map_address;
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
#endif
}
-
+
#if defined(WHEREAMI) && !defined(USE_OPENMP)
if (memory[position].pos == -1) memory[position].pos = mypos;
@@ -1071,18 +1075,18 @@ void *blas_memory_alloc(int procpos){
#ifdef DYNAMIC_ARCH
if (memory_initialized == 1) {
-
+
LOCK_COMMAND(&alloc_lock);
-
+
if (memory_initialized == 1) {
-
+
if (!gotoblas) gotoblas_dynamic_init();
-
+
memory_initialized = 2;
}
-
+
UNLOCK_COMMAND(&alloc_lock);
-
+
}
#endif
@@ -1090,8 +1094,8 @@ void *blas_memory_alloc(int procpos){
#ifdef DEBUG
printf("Mapped : %p %3d\n\n",
(void *)memory[position].addr, position);
-#endif
-
+#endif
+
return (void *)memory[position].addr;
error:
@@ -1106,8 +1110,8 @@ void blas_memory_free(void *free_area){
#ifdef DEBUG
printf("Unmapped Start : %p ...\n", free_area);
-#endif
-
+#endif
+
position = 0;
while ((memory[position].addr != free_area)
@@ -1117,21 +1121,21 @@ void blas_memory_free(void *free_area){
#ifdef DEBUG
printf(" Position : %d\n", position);
-#endif
+#endif
memory[position].used = 0;
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
-#endif
+#endif
return;
-
+
error:
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
#ifdef DEBUG
- for (position = 0; position < NUM_BUFFERS; position++)
+ for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
@@ -1151,7 +1155,7 @@ void blas_shutdown(void){
for (pos = 0; pos < release_pos; pos ++) {
release_info[pos].func(&release_info[pos]);
}
-
+
#ifdef SEEK_ADDRESS
base_address = 0UL;
#else
@@ -1173,7 +1177,7 @@ void blas_shutdown(void){
}
#if defined(OS_LINUX) && !defined(NO_WARMUP)
-
+
#ifdef SMP
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -1184,7 +1188,7 @@ static BLASULONG init_lock = 0UL;
#endif
#endif
-static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
+static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
void *sa, void *sb, BLASLONG pos) {
#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
@@ -1247,7 +1251,7 @@ static void _init_thread_memory(void *buffer) {
queue[num_cpu - 1].next = NULL;
queue[0].sa = buffer;
-
+
exec_blas(num_cpu, queue);
}
@@ -1266,15 +1270,15 @@ static void gotoblas_memory_init(void) {
#ifdef SMP_SERVER
if (blas_server_avail == 0) blas_thread_init();
#endif
-
+
_init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
-
+
#else
-
+
_touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
-
+
#endif
-
+
blas_memory_free(buffer);
}
#endif
diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c
index 10b35aa..17b7f5d 100644
--- a/driver/others/memory_qalloc.c
+++ b/driver/others/memory_qalloc.c
@@ -58,12 +58,12 @@ void *sb = NULL;
static double static_buffer[BUFFER_SIZE/sizeof(double)];
void *blas_memory_alloc(int numproc){
-
+
if (sa == NULL){
#if 1
- sa = (void *)qalloc(QFAST, BUFFER_SIZE);
+ sa = (void *)qalloc(QFAST, BUFFER_SIZE);
#else
- sa = (void *)malloc(BUFFER_SIZE);
+ sa = (void *)malloc(BUFFER_SIZE);
#endif
sb = (void *)&static_buffer[0];
}
diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c
index 2d8b9bd..f32a544 100644
--- a/driver/others/openblas_error_handle.c
+++ b/driver/others/openblas_error_handle.c
@@ -35,9 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
int openblas_verbose() {
int ret=0;
- char *p;
- p = getenv("OPENBLAS_VERBOSE");
- if (p) ret = atoi(p);
+ env_var_t p;
+ if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
if(ret<0) ret=0;
return ret;
}
diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c
index 581ab1a..0fecbf9 100644
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -13,25 +13,27 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
+#include <string.h>
+
static char* openblas_config_str=""
#ifdef USE64BITINT
"USE64BITINT "
@@ -51,9 +53,32 @@ static char* openblas_config_str=""
#ifdef NO_AFFINITY
"NO_AFFINITY "
#endif
+#ifndef DYNAMIC_ARCH
+ CHAR_CORENAME
+#endif
;
+#ifdef DYNAMIC_ARCH
+char *gotoblas_corename();
+static char tmp_config_str[256];
+#endif
+
+
char* CNAME() {
+#ifndef DYNAMIC_ARCH
return openblas_config_str;
+#else
+ strcpy(tmp_config_str, openblas_config_str);
+ strcat(tmp_config_str, gotoblas_corename());
+ return tmp_config_str;
+#endif
}
+
+char* openblas_get_corename() {
+#ifndef DYNAMIC_ARCH
+ return CHAR_CORENAME;
+#else
+ return gotoblas_corename();
+#endif
+}
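With this change the config string gains the selected core name, and a new openblas_get_corename() entry point is exported (it also shows up in the gensymbol list further down). A small caller, with the prototypes repeated locally in case the installed header predates this release:

    #include <stdio.h>

    extern char *openblas_get_config(void);     /* build options (+ core name after this patch) */
    extern char *openblas_get_corename(void);   /* kernel core, static or DYNAMIC_ARCH-detected */

    int main(void) {
        printf("config : %s\n", openblas_get_config());
        printf("core   : %s\n", openblas_get_corename());
        return 0;
    }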
diff --git a/driver/others/openblas_get_parallel.c b/driver/others/openblas_get_parallel.c
index 68fe574..ea2e4d9 100644
--- a/driver/others/openblas_get_parallel.c
+++ b/driver/others/openblas_get_parallel.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -33,12 +33,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(USE_OPENMP)
-static int parallel = 2 ;
-#elif defined(SMP_SERVER)
-static int parallel = 1;
-#else
-static int parallel = 0;
-#endif
+static int parallel = 2 ;
+#elif defined(SMP_SERVER)
+static int parallel = 1;
+#else
+static int parallel = 0;
+#endif
int CNAME() {
return parallel;
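CNAME here is the openblas_get_parallel() export, so callers can query which threading model the library was built with: 0 for a sequential build, 1 for the pthreads SMP server, 2 for OpenMP, per the #if chain above. A short example (prototype declared locally for self-containment):

    #include <stdio.h>

    extern int openblas_get_parallel(void);

    int main(void) {
        switch (openblas_get_parallel()) {
        case 0:  puts("sequential build");        break;
        case 1:  puts("pthreads (SMP server)");   break;
        case 2:  puts("OpenMP build");            break;
        default: puts("unknown threading model"); break;
        }
        return 0;
    }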
diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c
index 5e24cfc..ea0c70a 100644
--- a/driver/others/openblas_set_num_threads.c
+++ b/driver/others/openblas_set_num_threads.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index 58e5fb1..a0a8b51 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -165,7 +165,8 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
- defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
+ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
+ defined(PILEDRIVER) || defined(HASWELL)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@@ -177,26 +178,26 @@ int get_L2_size(void){
int i;
cpuid(2, &eax, &ebx, &ecx, &edx);
-
+
info[ 0] = BITMASK(eax, 8, 0xff);
info[ 1] = BITMASK(eax, 16, 0xff);
info[ 2] = BITMASK(eax, 24, 0xff);
-
+
info[ 3] = BITMASK(ebx, 0, 0xff);
info[ 4] = BITMASK(ebx, 8, 0xff);
info[ 5] = BITMASK(ebx, 16, 0xff);
info[ 6] = BITMASK(ebx, 24, 0xff);
-
+
info[ 7] = BITMASK(ecx, 0, 0xff);
info[ 8] = BITMASK(ecx, 8, 0xff);
info[ 9] = BITMASK(ecx, 16, 0xff);
info[10] = BITMASK(ecx, 24, 0xff);
-
+
info[11] = BITMASK(edx, 0, 0xff);
info[12] = BITMASK(edx, 8, 0xff);
info[13] = BITMASK(edx, 16, 0xff);
info[14] = BITMASK(edx, 24, 0xff);
-
+
for (i = 0; i < 15; i++){
switch (info[i]){
@@ -248,7 +249,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
- char *p;
+ env_var_t p;
int factor;
int size = get_L2_size();
@@ -284,7 +285,7 @@ void blas_set_parameter(void){
#endif
#endif
-#if defined(CORE_NORTHWOOD)
+#if defined(CORE_NORTHWOOD)
size >>= 7;
#ifdef ALLOC_HUGETLB
@@ -414,7 +415,7 @@ void blas_set_parameter(void){
#endif
#endif
-#if defined(CORE_OPTERON)
+#if defined(CORE_OPTERON)
sgemm_p = 224 + 14 * (size >> 5);
dgemm_p = 112 + 14 * (size >> 6);
cgemm_p = 116 + 14 * (size >> 6);
@@ -463,13 +464,12 @@ void blas_set_parameter(void){
#endif
#endif
- p = getenv("GOTO_BLOCK_FACTOR");
- if (p) {
+ if (readenv(p,"GOTO_BLOCK_FACTOR")) {
factor = atoi(p);
if (factor < 10) factor = 10;
if (factor > 200) factor = 200;
-
+
sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
@@ -479,7 +479,7 @@ void blas_set_parameter(void){
xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
#endif
}
-
+
if (sgemm_p == 0) sgemm_p = 64;
if (dgemm_p == 0) dgemm_p = 64;
if (cgemm_p == 0) cgemm_p = 64;
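For reference, GOTO_BLOCK_FACTOR is read as a percentage, clamped to [10, 200], and each GEMM blocking parameter is scaled by it and rounded down to a multiple of 8 by the & ~7L mask; the == 0 fallbacks afterwards keep a minimum block of 64. A worked example of that arithmetic (the starting value is illustrative, not any core's default):

    #include <stdio.h>

    int main(void) {
        long dgemm_p = 112;             /* hypothetical starting block size */
        int  factor  = 150;             /* e.g. GOTO_BLOCK_FACTOR=150        */

        if (factor < 10)  factor = 10;
        if (factor > 200) factor = 200;

        dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
        if (dgemm_p == 0) dgemm_p = 64;

        printf("dgemm_p = %ld\n", dgemm_p);   /* 112 * 1.5 = 168, already a multiple of 8 */
        return 0;
    }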
@@ -572,7 +572,7 @@ int get_current_cpu_info(void){
#if defined(ARCH_IA64)
-static inline BLASULONG cpuid(BLASULONG regnum){
+static inline BLASULONG cpuid(BLASULONG regnum){
BLASULONG value;
#ifndef __ECC
@@ -587,11 +587,11 @@ static inline BLASULONG cpuid(BLASULONG regnum){
#if 1
void blas_set_parameter(void){
-
+
BLASULONG cpuid3, size;
cpuid3 = cpuid(3);
-
+
size = BITMASK(cpuid3, 16, 0xff);
sgemm_p = 192 * (size + 1);
@@ -625,7 +625,7 @@ void blas_set_parameter(void){
#define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
void blas_set_parameter(void){
-
+
BLASULONG cpuid3;
int size = 0;
@@ -643,17 +643,17 @@ void blas_set_parameter(void){
if (size <= 0) {
if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
-
+
while(fgets(buffer, sizeof(buffer), infile) != NULL) {
if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
}
-
+
fgets(buffer, sizeof(buffer), infile);
-
+
fclose(infile);
-
+
*strstr(buffer, "bytes") = (char)NULL;
-
+
size = atoi(strchr(buffer, ':') + 1) / 1572864;
}
}
@@ -663,7 +663,7 @@ void blas_set_parameter(void){
if (size <= 0) {
cpuid3 = cpuid(3);
-
+
size = BITMASK(cpuid3, 16, 0xff) + 1;
}
@@ -692,7 +692,7 @@ void blas_set_parameter(void){
#endif
-#if defined(ARCH_MIPS64)
+#if defined(ARCH_MIPS64)
void blas_set_parameter(void){
#if defined(LOONGSON3A)
#ifdef SMP
@@ -720,7 +720,7 @@ void blas_set_parameter(void){
dgemm_r = 160;
}
#endif
-#endif
+#endif
}
#endif
diff --git a/driver/others/profile.c b/driver/others/profile.c
index f464c0b..9fca09f 100644
--- a/driver/others/profile.c
+++ b/driver/others/profile.c
@@ -75,13 +75,13 @@ void gotoblas_profile_quit(void) {
fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n");
fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n");
-
+
for (i = 0; i < MAX_PROF_TABLE; i ++) {
if (function_profile_table[i].calls) {
#ifndef OS_WINDOWS
- fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n",
+ fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n",
#else
- fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n",
+ fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n",
#endif
func_table[i],
function_profile_table[i].calls,
@@ -94,11 +94,11 @@ void gotoblas_profile_quit(void) {
}
fprintf(stderr, " --------------------------------------------------------------------\n");
-
+
#ifndef OS_WINDOWS
- fprintf(stderr, "%-12s : %10Ld %10.3f%% %8.2f\n",
+ fprintf(stderr, "%-12s : %10Ld %10.3f%% %8.2f\n",
#else
- fprintf(stderr, "%-12s : %10lld %10.3f%% %8.2f\n",
+ fprintf(stderr, "%-12s : %10lld %10.3f%% %8.2f\n",
#endif
"Total",
calls,
diff --git a/driver/others/xerbla.c b/driver/others/xerbla.c
index 6f5170e..7427b51 100644
--- a/driver/others/xerbla.c
+++ b/driver/others/xerbla.c
@@ -48,7 +48,7 @@
#ifdef __ELF__
int __xerbla(char *message, blasint *info, blasint length){
-
+
printf(" ** On entry to %6s parameter number %2d had an illegal value\n",
message, *info);
@@ -60,7 +60,7 @@ int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("_
#else
int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){
-
+
printf(" ** On entry to %6s parameter number %2d had an illegal value\n",
message, *info);
diff --git a/exports/Makefile b/exports/Makefile
index 3ef20b3..c798bc7 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -84,8 +84,8 @@ dll : ../$(LIBDLLNAME)
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME)
$(CC) $(CFLAGS) $(LDFLAGS) libopenblas.def dllinit.$(SUFFIX) \
- -shared -o ../$(LIBDLLNAME) -Wl,--out-implib,../$(LIBPREFIX).lib \
- -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB)
+ -shared -o ../$(LIBDLLNAME) -Wl,--out-implib,../$(LIBDLLNAME).a \
+ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
libopenblas.def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F)
@@ -163,10 +163,10 @@ goto64.$(SUFFIX) : ../$(LIBNAME) aix.def
ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lxlf90 -lc -lm -lpthread
else
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def
- ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lg2c -lc -lm
+ ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lg2c -lc -lm
goto64.$(SUFFIX) : ../$(LIBNAME) aix.def
- ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lg2c -lc -lm
+ ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lg2c -lc -lm
endif
endif
diff --git a/exports/dllinit.c b/exports/dllinit.c
index 0f25824..02ff092 100644
--- a/exports/dllinit.c
+++ b/exports/dllinit.c
@@ -50,6 +50,6 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
if (reason == DLL_PROCESS_DETACH) {
gotoblas_quit();
}
-
+
return TRUE;
}
diff --git a/exports/gensymbol b/exports/gensymbol
index 58a309f..0769ae0 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -22,7 +22,9 @@
zhbmv,zhemm,zhemv,zher2,zher2k,zher,zherk,zhpmv,zhpr2,
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv,
- xerbla);
+ xerbla,
+ saxpby,daxpby,caxpby,zaxpby
+ );
@cblasobjs = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
@@ -49,7 +51,9 @@
cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2,
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
- cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub );
+ cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
+ cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby
+ );
@exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@@ -69,7 +73,7 @@
);
@gemm3mobjs = (
- zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m,
+
);
@@ -81,6 +85,7 @@
@misc_no_underscore_objs = (
goto_set_num_threads,
openblas_get_config,
+ openblas_get_corename,
);
@misc_underscore_objs = (
@@ -88,17 +93,17 @@
@lapackobjs = (
# These routines are provided by OpenBLAS.
- sgesv, dgesv, cgesv, zgesv,
- sgetf2, dgetf2, cgetf2, zgetf2,
- sgetrf, dgetrf, cgetrf, zgetrf,
- slaswp, dlaswp, claswp, zlaswp,
- sgetrs, dgetrs, cgetrs, zgetrs,
- slauu2, dlauu2, clauu2, zlauu2,
- slauum, dlauum, clauum, zlauum,
- spotf2, dpotf2, cpotf2, zpotf2,
- spotrf, dpotrf, cpotrf, zpotrf,
- strti2, dtrti2, ctrti2, ztrti2,
- strtri, dtrtri, ctrtri, ztrtri,
+ sgesv, dgesv, cgesv, zgesv,
+ sgetf2, dgetf2, cgetf2, zgetf2,
+ sgetrf, dgetrf, cgetrf, zgetrf,
+ slaswp, dlaswp, claswp, zlaswp,
+ sgetrs, dgetrs, cgetrs, zgetrs,
+ slauu2, dlauu2, clauu2, zlauu2,
+ slauum, dlauum, clauum, zlauum,
+ spotf2, dpotf2, cpotf2, zpotf2,
+ spotrf, dpotrf, cpotrf, zpotrf,
+ strti2, dtrti2, ctrti2, ztrti2,
+ strtri, dtrtri, ctrtri, ztrtri,
spotri, dpotri, cpotri, zpotri,
);
@@ -115,7 +120,7 @@
# ALLAUX -- Auxiliary routines called from all precisions
# already provided by @blasobjs: xerbla, lsame
ilaenv, ieeeck, lsamen, iparmq,
- ilaprec, ilatrans, ilauplo, iladiag,
+ ilaprec, ilatrans, ilauplo, iladiag,
ilaver, slamch, slamc3,
# SCLAUX -- Auxiliary routines called from both REAL and COMPLEX.
@@ -181,7 +186,7 @@
slaqtr, slar1v, slar2v, ilaslr, ilaslc,
slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv,
slarrv, slartv,
- slarz, slarzb, slarzt, slasy2, slasyf,
+ slarz, slarzb, slarzt, slasy2, slasyf,
slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm,
sopgtr, sopmtr, sorg2l, sorg2r,
sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2,
@@ -362,8 +367,8 @@
dtfttr, dtpttf, dtpttr, dtrttf, dtrttp,
dgejsv, dgesvj, dgsvj0, dgsvj1,
dgeequb, dsyequb, dpoequb, dgbequb,
- dbbcsd, dlapmr, dorbdb, dorbdb1, dorbdb2, dorbdb3, dorbdb4,
- dorbdb5, dorbdb6, dorcsd, dorcsd2by1,
+ dbbcsd, dlapmr, dorbdb, dorbdb1, dorbdb2, dorbdb3, dorbdb4,
+ dorbdb5, dorbdb6, dorcsd, dorcsd2by1,
dgeqrt, dgeqrt2, dgeqrt3, dgemqrt,
dtpqrt, dtpqrt2, dtpmqrt, dtprfb,
@@ -436,8 +441,8 @@
zhfrk, ztfttp, zlanhf, zpftrf, zpftri, zpftrs, ztfsm, ztftri,
ztfttr, ztpttf, ztpttr, ztrttf, ztrttp,
zgeequb, zgbequb, zsyequb, zpoequb, zheequb,
- zbbcsd, zlapmr, zunbdb, zunbdb1, zunbdb2, zunbdb3, zunbdb4,
- zunbdb5, zunbdb6, zuncsd, zuncsd2by1,
+ zbbcsd, zlapmr, zunbdb, zunbdb1, zunbdb2, zunbdb3, zunbdb4,
+ zunbdb5, zunbdb6, zuncsd, zuncsd2by1,
zgeqrt, zgeqrt2, zgeqrt3, zgemqrt,
ztpqrt, ztpqrt2, ztpmqrt, ztprfb,
);
@@ -2701,20 +2706,20 @@
@lapack_embeded_underscore_objs=(xerbla_array, chla_transtype, slasyf_rook,
ssytf2_rook, ssytrf_rook, ssytrs_rook,
ssytri_rook, ssycon_rook, ssysv_rook,
- chetf2_rook, chetrf_rook, chetri_rook,
+ chetf2_rook, chetrf_rook, chetri_rook,
chetrs_rook, checon_rook, chesv_rook,
- clahef_rook, clasyf_rook,
- csytf2_rook, csytrf_rook, csytrs_rook,
- csytri_rook, csycon_rook, csysv_rook,
- dlasyf_rook,
- dsytf2_rook, dsytrf_rook, dsytrs_rook,
- dsytri_rook, dsycon_rook, dsysv_rook,
- zhetf2_rook, zhetrf_rook, zhetri_rook,
+ clahef_rook, clasyf_rook,
+ csytf2_rook, csytrf_rook, csytrs_rook,
+ csytri_rook, csycon_rook, csysv_rook,
+ dlasyf_rook,
+ dsytf2_rook, dsytrf_rook, dsytrs_rook,
+ dsytri_rook, dsycon_rook, dsysv_rook,
+ zhetf2_rook, zhetrf_rook, zhetri_rook,
zhetrs_rook, zhecon_rook, zhesv_rook,
zlahef_rook, zlasyf_rook,
zsytf2_rook, zsytrf_rook, zsytrs_rook,
zsytri_rook, zsycon_rook, zsysv_rook,
-
+
);
@@ -2771,7 +2776,7 @@ if ($ARGV[6] == 1) {
}
@hplobjs = (daxpy, dcopy, dscal, idamax, dgemv, dtrsv, dger, dgemm, dtrsm);
-@hplobjs2 = (HPL_dlaswp00N, HPL_dlaswp01N, HPL_dlaswp01T);
+@hplobjs2 = (HPL_dlaswp00N, HPL_dlaswp01N, HPL_dlaswp01T);
$bu = $ARGV[2];
@@ -2796,7 +2801,7 @@ if ($ARGV[0] eq "osx"){
}
# }
exit(0);
-}
+}
if ($ARGV[0] eq "aix"){
@@ -2817,11 +2822,11 @@ if ($ARGV[0] eq "aix"){
}
# }
exit(0);
-}
+}
if ($ARGV[0] eq "win2k"){
print "EXPORTS\n";
- $count = 1;
+ $count = 1;
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
@@ -2846,7 +2851,7 @@ if ($ARGV[0] eq "win2k"){
print "\t$uppercase=$objs", "__ \@", $count, "\n";
$count ++;
}
-
+
#for misc_common_objs
foreach $objs (@misc_common_objs) {
@@ -2857,19 +2862,19 @@ if ($ARGV[0] eq "win2k"){
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
-
-
+
+
foreach $objs (@no_underscore_objs) {
print "\t",$objs,"=$objs"," \@", $count, "\n";
$count ++;
}
-
+
exit(0);
}
if ($ARGV[0] eq "win2khpl"){
print "EXPORTS\n";
- $count = 1;
+ $count = 1;
foreach $objs (@hplobjs) {
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
@@ -2894,7 +2899,7 @@ if ($ARGV[0] eq "microsoft"){
@underscore_objs = (@underscore_objs, @misc_common_objs);
print "EXPORTS\n";
- $count = 1;
+ $count = 1;
foreach $objs (@underscore_objs) {
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
diff --git a/f_check b/f_check
index 86f1fa6..90ae2fe 100644
--- a/f_check
+++ b/f_check
@@ -26,12 +26,12 @@ if ($compiler eq "") {
@lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
"sunf77", "sunf90", "sunf95",
- "xlf95", "xlf90", "xlf",
- "ppuf77", "ppuf95", "ppuf90", "ppuxlf",
- "pathf90", "pathf95",
+ "xlf95", "xlf90", "xlf",
+ "ppuf77", "ppuf95", "ppuf90", "ppuxlf",
+ "pathf90", "pathf95",
"pgf95", "pgf90", "pgf77",
"ifort");
-
+
OUTER:
foreach $lists (@lists) {
foreach $path (@path) {
@@ -129,43 +129,43 @@ if ($compiler eq "") {
$bu = "_";
$openmp = "";
}
-
+
if ($compiler =~ /g95/) {
$vendor = G95;
$bu = "_";
$openmp = "";
}
-
+
if ($compiler =~ /gfortran/) {
$vendor = GFORTRAN;
$bu = "_";
$openmp = "-fopenmp";
}
-
+
if ($compiler =~ /ifort/) {
$vendor = INTEL;
$bu = "_";
$openmp = "-openmp";
}
-
+
if ($compiler =~ /pathf/) {
$vendor = PATHSCALE;
$bu = "_";
$openmp = "-mp";
}
-
+
if ($compiler =~ /pgf/) {
$vendor = PGI;
$bu = "_";
$openmp = "-mp";
}
-
+
if ($compiler =~ /ftn/) {
$vendor = PGI;
$bu = "_";
$openmp = "-openmp";
}
-
+
if ($compiler =~ /frt/) {
$vendor = FUJITSU;
$bu = "_";
@@ -177,12 +177,12 @@ if ($compiler eq "") {
$bu = "_";
$openmp = "-xopenmp=parallel";
}
-
+
if ($compiler =~ /ppuf/) {
$vendor = IBM;
$openmp = "-openmp";
}
-
+
if ($compiler =~ /xlf/) {
$vendor = IBM;
$openmp = "-openmp";
@@ -209,9 +209,9 @@ $data = `which $compiler > /dev/null 2> /dev/null`;
if (!$?) {
$binary = $ENV{"BINARY"};
-
+
$openmp = "" if $ENV{USE_OPENMP} != 1;
-
+
if ($binary == 32) {
$link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
if ($?) {
@@ -223,7 +223,7 @@ if (!$?) {
}
$binary = "" if ($?);
}
-
+
if ($binary == 64) {
$link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
if ($?) {
@@ -235,12 +235,12 @@ if (!$?) {
}
$binary = "" if ($?);
}
-
+
if ($binary eq "") {
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
}
-
+
$linker_L = "";
$linker_l = "";
$linker_a = "";
@@ -268,11 +268,11 @@ if ($link ne "") {
}
$linker_L .= $flags . " ";
}
-
+
if ($flags =~ /^\-Y/) {
$linker_L .= "-Wl,". $flags . " ";
}
-
+
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
@@ -288,9 +288,9 @@ if ($link ne "") {
}
$linker_L .= "-Wl,". $flags . " " ;
}
-
+
if (
- ($flags =~ /^\-l/)
+ ($flags =~ /^\-l/)
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
diff --git a/ftest.f b/ftest.f
index 94ba566..73909c4 100644
--- a/ftest.f
+++ b/ftest.f
@@ -2,5 +2,5 @@
zhoge = (0.0d0,0.0d0)
- return
+ return
end
diff --git a/ftest3.f b/ftest3.f
index 8f2cd33..82cba50 100644
--- a/ftest3.f
+++ b/ftest3.f
@@ -2,5 +2,5 @@
zho_ge = (0.0d0,0.0d0)
- return
+ return
end
diff --git a/getarch.c b/getarch.c
index b100eb5..3e99142 100644
--- a/getarch.c
+++ b/getarch.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -384,7 +384,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BARCELONA"
#endif
-#if defined(FORCE_BOBCAT)
+#if defined(FORCE_BOBCAT)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
@@ -818,11 +818,11 @@ static int get_num_cores(void) {
int m[2], count;
size_t len;
#endif
-
+
#ifdef linux
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
-
+
#elif defined(OS_WINDOWS)
GetSystemInfo(&sysinfo);
@@ -855,7 +855,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
-#else
+#else
#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
printf("CORE=%s\n", get_corename());
#endif
@@ -952,6 +952,15 @@ int main(int argc, char *argv[]){
#else
get_cpuconfig();
#endif
+
+#ifdef FORCE
+ printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
+#else
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+ printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
+#endif
+#endif
+
break;
case '2' : /* SMP */
diff --git a/getarch_2nd.c b/getarch_2nd.c
index 0b140bb..fad647f 100644
--- a/getarch_2nd.c
+++ b/getarch_2nd.c
@@ -15,7 +15,7 @@ int main(int argc, char **argv) {
printf("DGEMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N);
printf("QGEMM_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M);
printf("QGEMM_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N);
-
+
printf("CGEMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M);
printf("CGEMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N);
printf("ZGEMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M);
@@ -60,8 +60,8 @@ int main(int argc, char **argv) {
#endif
- }
-
+ }
+
if ((argc >= 2) && (*argv[1] == '1')) {
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
diff --git a/interface/Makefile b/interface/Makefile
index 9774f37..6992248 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -1,6 +1,8 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
+SUPPORT_GEMM3M = 0
+
ifeq ($(ARCH), x86)
SUPPORT_GEMM3M = 0
endif
@@ -27,6 +29,7 @@ SBLAS1OBJS = \
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
+ saxpby.$(SUFFIX)
SBLAS2OBJS = \
sgemv.$(SUFFIX) sger.$(SUFFIX) \
@@ -39,16 +42,19 @@ SBLAS2OBJS = \
SBLAS3OBJS = \
sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \
- strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX)
+ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \
+ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)
+
DBLAS1OBJS = \
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
- ddot.$(SUFFIX) \
+ ddot.$(SUFFIX) \
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
+ daxpby.$(SUFFIX)
DBLAS2OBJS = \
dgemv.$(SUFFIX) dger.$(SUFFIX) \
@@ -61,7 +67,8 @@ DBLAS2OBJS = \
DBLAS3OBJS = \
dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \
- dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX)
+ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \
+ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)
CBLAS1OBJS = \
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
@@ -71,6 +78,7 @@ CBLAS1OBJS = \
scamax.$(SUFFIX) icamax.$(SUFFIX) \
scamin.$(SUFFIX) icamin.$(SUFFIX) \
csrot.$(SUFFIX) crotg.$(SUFFIX) \
+ caxpby.$(SUFFIX)
CBLAS2OBJS = \
cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \
@@ -87,7 +95,8 @@ CBLAS2OBJS = \
CBLAS3OBJS = \
cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \
ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \
- chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX)
+ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \
+ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)
ZBLAS1OBJS = \
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
@@ -97,6 +106,7 @@ ZBLAS1OBJS = \
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
+ zaxpby.$(SUFFIX)
ZBLAS2OBJS = \
zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \
@@ -113,9 +123,10 @@ ZBLAS2OBJS = \
ZBLAS3OBJS = \
zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \
ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \
- zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX)
+ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \
+ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)
-ifdef SUPPORT_GEMM3M
+ifeq ($(SUPPORT_GEMM3M), 1)
CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX)
@@ -145,7 +156,7 @@ QBLAS2OBJS = \
QBLAS3OBJS = \
qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \
- qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX)
+ qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX)
XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
@@ -173,7 +184,7 @@ XBLAS3OBJS = \
xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \
xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX)
-ifdef SUPPORT_GEMM3M
+ifeq ($(SUPPORT_GEMM3M), 1)
XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX)
@@ -202,7 +213,7 @@ QBLAS2OBJS = \
QBLAS3OBJS = \
qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \
- qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX)
+ qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX)
XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
@@ -229,7 +240,7 @@ XBLAS3OBJS = \
xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \
xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX)
-ifdef SUPPORT_GEMM3M
+ifeq ($(SUPPORT_GEMM3M), 1)
XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX)
@@ -246,7 +257,7 @@ CSBLAS1OBJS = \
cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
- cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX)
+ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX)
CSBLAS2OBJS = \
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@@ -262,7 +273,7 @@ CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
- cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX)
+ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX)
CDBLAS2OBJS = \
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@@ -280,7 +291,8 @@ CCBLAS1OBJS = \
cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
- cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX)
+ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
+ cblas_caxpby.$(SUFFIX)
CCBLAS2OBJS = \
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@@ -288,7 +300,7 @@ CCBLAS2OBJS = \
cblas_cher.$(SUFFIX) cblas_cher2.$(SUFFIX) cblas_chpmv.$(SUFFIX) \
cblas_chpr.$(SUFFIX) cblas_chpr2.$(SUFFIX) cblas_ctbmv.$(SUFFIX) \
cblas_ctbsv.$(SUFFIX) cblas_ctpmv.$(SUFFIX) cblas_ctpsv.$(SUFFIX) \
- cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX)
+ cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX)
CCBLAS3OBJS = \
cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \
@@ -301,7 +313,8 @@ CZBLAS1OBJS = \
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
- cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX)
+ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
+ cblas_zaxpby.$(SUFFIX)
CZBLAS2OBJS = \
cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \
@@ -309,7 +322,7 @@ CZBLAS2OBJS = \
cblas_zher.$(SUFFIX) cblas_zher2.$(SUFFIX) cblas_zhpmv.$(SUFFIX) \
cblas_zhpr.$(SUFFIX) cblas_zhpr2.$(SUFFIX) cblas_ztbmv.$(SUFFIX) \
cblas_ztbsv.$(SUFFIX) cblas_ztpmv.$(SUFFIX) cblas_ztpsv.$(SUFFIX) \
- cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX)
+ cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX)
CZBLAS3OBJS = \
cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \
@@ -343,25 +356,25 @@ ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS)
XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
#SLAPACKOBJS = \
-# sgetf2.$(SUFFIX) sgetrf.$(SUFFIX) slauu2.$(SUFFIX) slauum.$(SUFFIX) \
-# spotf2.$(SUFFIX) spotrf.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) \
-# slaswp.$(SUFFIX) sgetrs.$(SUFFIX) sgesv.$(SUFFIX) spotri.$(SUFFIX) \
+# sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
+# spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
+# slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) spotri.$(SUFFIX)
SLAPACKOBJS = \
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
- slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) spotri.$(SUFFIX)
+ slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
#DLAPACKOBJS = \
-# dgetf2.$(SUFFIX) dgetrf.$(SUFFIX) dlauu2.$(SUFFIX) dlauum.$(SUFFIX) \
-# dpotf2.$(SUFFIX) dpotrf.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) \
-# dlaswp.$(SUFFIX) dgetrs.$(SUFFIX) dgesv.$(SUFFIX) dpotri.$(SUFFIX) \
+# dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
+# dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
+# dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dpotri.$(SUFFIX)
DLAPACKOBJS = \
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
- dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dpotri.$(SUFFIX)
+ dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
QLAPACKOBJS = \
@@ -369,28 +382,29 @@ QLAPACKOBJS = \
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
+
#CLAPACKOBJS = \
-# cgetf2.$(SUFFIX) cgetrf.$(SUFFIX) clauu2.$(SUFFIX) clauum.$(SUFFIX) \
-# cpotf2.$(SUFFIX) cpotrf.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) \
-# claswp.$(SUFFIX) cgetrs.$(SUFFIX) cgesv.$(SUFFIX) cpotri.$(SUFFIX) \
+# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
+# cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
+# clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) cpotri.$(SUFFIX)
+
CLAPACKOBJS = \
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
- clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) cpotri.$(SUFFIX)
+ clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX)
#ZLAPACKOBJS = \
-# zgetf2.$(SUFFIX) zgetrf.$(SUFFIX) zlauu2.$(SUFFIX) zlauum.$(SUFFIX) \
-# zpotf2.$(SUFFIX) zpotrf.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) \
-# zlaswp.$(SUFFIX) zgetrs.$(SUFFIX) zgesv.$(SUFFIX) zpotri.$(SUFFIX) \
+# zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
+# zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
+# zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) zpotri.$(SUFFIX)
+
ZLAPACKOBJS = \
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
- zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) zpotri.$(SUFFIX)
-
-
+ zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX)
XLAPACKOBJS = \
@@ -1991,3 +2005,55 @@ zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c
xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+
+############# BLAS EXTENSIONS #####################################
+
+daxpby.$(SUFFIX) daxpby.$(PSUFFIX) : axpby.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
+cblas_daxpby.$(SUFFIX) cblas_daxpby.$(PSUFFIX) : axpby.c
+ $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+saxpby.$(SUFFIX) saxpby.$(PSUFFIX) : axpby.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
+cblas_saxpby.$(SUFFIX) cblas_saxpby.$(PSUFFIX) : axpby.c
+ $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+zaxpby.$(SUFFIX) zaxpby.$(PSUFFIX) : zaxpby.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
+cblas_zaxpby.$(SUFFIX) cblas_zaxpby.$(PSUFFIX) : zaxpby.c
+ $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+caxpby.$(SUFFIX) caxpby.$(PSUFFIX) : zaxpby.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
+cblas_caxpby.$(SUFFIX) cblas_caxpby.$(PSUFFIX) : zaxpby.c
+ $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c
+ $(CC) -c $(CFLAGS) $< -o $(@F)
+
+somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c
+ $(CC) -c $(CFLAGS) $< -o $(@F)
+
+comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c
+ $(CC) -c $(CFLAGS) $< -o $(@F)
+
+zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c
+ $(CC) -c $(CFLAGS) $< -o $(@F)
+
+dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c
+ $(CC) -c $(CFLAGS) $< -o $(@F)
+
+simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c
+ $(CC) -c $(CFLAGS) $< -o $(@F)
+
+cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c
+ $(CC) -c $(CFLAGS) $< -o $(@F)
+
+zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c
+ $(CC) -c $(CFLAGS) $< -o $(@F)
+
+
diff --git a/interface/asum.c b/interface/asum.c
index 634836e..1393989 100644
--- a/interface/asum.c
+++ b/interface/asum.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
FLOATRET ret;
@@ -70,7 +70,7 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
-
+
FLOAT ret;
PRINT_DEBUG_CNAME;
diff --git a/kernel/arm/swap.c b/interface/axpby.c
similarity index 73%
copy from kernel/arm/swap.c
copy to interface/axpby.c
index 1ca9e76..63dba81 100644
--- a/kernel/arm/swap.c
+++ b/interface/axpby.c
@@ -25,38 +25,48 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/08/20 Saar
-* BLASTEST float OK
-* BLASTEST double OK
-*
-**************************************************************************************/
+/******************************************************************
+ 2014/06/07 Saar
+******************************************************************/
+
-#include "common.h"
#include <stdio.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+#ifndef CBLAS
+
+void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
{
- BLASLONG i=0;
- BLASLONG ix=0,iy=0;
- FLOAT temp;
- if ( n < 0 ) return(0);
+ BLASLONG n = *N;
+ BLASLONG incx = *INCX;
+ BLASLONG incy = *INCY;
+ FLOAT alpha = *ALPHA;
+ FLOAT beta = *BETA;
+
+#else
- while(i < n)
- {
+void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy)
+{
- temp = x[ix] ;
- x[ix] = y[iy] ;
- y[iy] = temp ;
+#endif
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
+ if (n <= 0) return;
- }
- return(0);
+ FUNCTION_PROFILE_START();
+
+ if (incx < 0) x -= (n - 1) * incx;
+ if (incy < 0) y -= (n - 1) * incy;
+
+ AXPBY_K(n, alpha, x, incx, beta, y, incy);
+
+ FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
+
+ return;
}
-
+
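interface/axpby.c (adapted from the ARM swap kernel, as the copy detection above shows) provides the new ?axpby extension, y := alpha*x + beta*y; the interface Makefile earlier in this patch also builds cblas_ variants of it with -DCBLAS. A hedged usage example for the double-precision CBLAS entry point, with the prototype restated locally from the CNAME signature shown (FLOAT = double, blasint assumed to be int):

    #include <stdio.h>

    extern void cblas_daxpby(int n, double alpha, double *x, int incx,
                             double beta, double *y, int incy);

    int main(void) {
        double x[3] = { 1.0, 2.0, 3.0 };
        double y[3] = { 10.0, 10.0, 10.0 };

        /* y := 2*x + 0.5*y  ->  { 7, 9, 11 } */
        cblas_daxpby(3, 2.0, x, 1, 0.5, y, 1);

        printf("%g %g %g\n", y[0], y[1], y[2]);
        return 0;
    }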
diff --git a/interface/axpy.c b/interface/axpy.c
index 5e288e3..61b7b4d 100644
--- a/interface/axpy.c
+++ b/interface/axpy.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
@@ -85,12 +85,12 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;
-
+
	//Temporarily work around the low performance issue with small input size &
	//multiple threads.
if (n <= 10000)
nthreads = 1;
-
+
if (nthreads == 1) {
#endif
@@ -105,9 +105,9 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
-
- blas_level1_thread(mode, n, 0, 0, &alpha,
+#endif
+
+ blas_level1_thread(mode, n, 0, 0, &alpha,
x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads);
}
@@ -118,5 +118,5 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
IDEBUG_END;
return;
-
+
}
diff --git a/interface/copy.c b/interface/copy.c
index 6965682..3fb2182 100644
--- a/interface/copy.c
+++ b/interface/copy.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
@@ -70,11 +70,11 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
if (incy < 0) y -= (n - 1) * incy * COMPSIZE;
COPY_K(n, x, incx, y, incy);
-
+
FUNCTION_PROFILE_END(COMPSIZE, COMPSIZE * n, 0);
IDEBUG_END;
return;
-
+
}
diff --git a/interface/dot.c b/interface/dot.c
index 3744db5..3a91840 100644
--- a/interface/dot.c
+++ b/interface/dot.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
@@ -74,7 +74,7 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
-
+
FLOAT ret;
PRINT_DEBUG_CNAME;
diff --git a/interface/dsdot.c b/interface/dsdot.c
index 94237e0..32e4b49 100644
--- a/interface/dsdot.c
+++ b/interface/dsdot.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
@@ -69,7 +69,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
IDEBUG_END;
return ret;
-
+
}
#else
@@ -77,7 +77,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){
double ret = 0.0;
-
+
PRINT_DEBUG_CNAME;
if (n <= 0) return 0;
@@ -96,7 +96,7 @@ double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){
IDEBUG_END;
return ret;
-
+
}
#endif
diff --git a/interface/gbmv.c b/interface/gbmv.c
index a76c48d..096c9f6 100644
--- a/interface/gbmv.c
+++ b/interface/gbmv.c
@@ -123,7 +123,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
if (n < 0) info = 3;
if (m < 0) info = 2;
if (i < 0) info = 1;
-
+
trans = i;
if (info != 0){
@@ -160,7 +160,7 @@ void CNAME(enum CBLAS_ORDER order,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 0;
if (TransA == CblasConjTrans) trans = 1;
-
+
info = -1;
if (incy == 0) info = 13;
@@ -214,9 +214,9 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
-
+
if (alpha == ZERO) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -228,7 +228,7 @@ void CNAME(enum CBLAS_ORDER order,
#ifdef SMP
nthreads = num_cpu_avail(2);
-
+
if (nthreads == 1) {
#endif
diff --git a/interface/gemm.c b/interface/gemm.c
index 587175e..07fea15 100644
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -72,7 +72,7 @@
#endif
#ifndef GEMM_MULTITHREAD_THRESHOLD
-# define GEMM_MULTITHREAD_THRESHOLD 4
+#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
@@ -110,7 +110,7 @@ void NAME(char *TRANSA, char *TRANSB,
FLOAT *b, blasint *ldB,
FLOAT *beta,
FLOAT *c, blasint *ldC){
-
+
blas_arg_t args;
int transa, transb, nrowa, nrowb;
@@ -128,7 +128,7 @@ void NAME(char *TRANSA, char *TRANSB,
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -136,7 +136,7 @@ void NAME(char *TRANSA, char *TRANSB,
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -199,7 +199,7 @@ void NAME(char *TRANSA, char *TRANSB,
if (args.ldc < args.m) info = 13;
if (args.ldb < nrowb) info = 10;
- if (args.lda < nrowa) info = 8;
+ if (args.lda < nrowa) info = 8;
if (args.k < 0) info = 5;
if (args.n < 0) info = 4;
if (args.m < 0) info = 3;
@@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
FLOAT *alpha,
#endif
FLOAT *a, blasint lda,
- FLOAT *b, blasint ldb,
+ FLOAT *b, blasint ldb,
#ifndef COMPLEX
FLOAT beta,
#else
@@ -244,7 +244,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -252,7 +252,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -278,15 +278,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
args.m = m;
args.n = n;
args.k = k;
-
+
args.a = (void *)a;
args.b = (void *)b;
args.c = (void *)c;
-
+
args.lda = lda;
args.ldb = ldb;
args.ldc = ldc;
-
+
if (TransA == CblasNoTrans) transa = 0;
if (TransA == CblasTrans) transa = 1;
#ifndef COMPLEX
@@ -305,7 +305,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
if (TransB == CblasConjNoTrans) transb = 2;
if (TransB == CblasConjTrans) transb = 3;
#endif
-
+
nrowa = args.m;
if (transa & 1) nrowa = args.k;
nrowb = args.k;
@@ -315,7 +315,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
if (args.ldc < args.m) info = 13;
if (args.ldb < nrowb) info = 10;
- if (args.lda < nrowa) info = 8;
+ if (args.lda < nrowa) info = 8;
if (args.k < 0) info = 5;
if (args.n < 0) info = 4;
if (args.m < 0) info = 3;
@@ -327,11 +327,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
args.m = n;
args.n = m;
args.k = k;
-
+
args.a = (void *)b;
args.b = (void *)a;
args.c = (void *)c;
-
+
args.lda = ldb;
args.ldb = lda;
args.ldc = ldc;
@@ -354,7 +354,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
if (TransA == CblasConjNoTrans) transb = 2;
if (TransA == CblasConjTrans) transb = 3;
#endif
-
+
nrowa = args.m;
if (transa & 1) nrowa = args.k;
nrowb = args.k;
@@ -364,7 +364,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
if (args.ldc < args.m) info = 13;
if (args.ldb < nrowb) info = 10;
- if (args.lda < nrowa) info = 8;
+ if (args.lda < nrowa) info = 8;
if (args.k < 0) info = 5;
if (args.n < 0) info = 4;
if (args.m < 0) info = 3;
@@ -392,42 +392,91 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
FUNCTION_PROFILE_START();
buffer = (XFLOAT *)blas_memory_alloc(0);
-
+
sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-
+
#ifdef SMP
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);
- args.common = NULL;
+ int nthreads_max = num_cpu_avail(3);
+ int nthreads_avail = nthreads_max;
- if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
- || args.k <=GEMM_MULTITHREAD_THRESHOLD){
- args.nthreads = 1;
- }else{
- args.nthreads = num_cpu_avail(3);
+#ifndef COMPLEX
+ double MNK = (double) args.m * (double) args.n * (double) args.k;
+ if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+ nthreads_max = 1;
+ else
+ {
+ if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+ {
+ nthreads_max = 4;
+ if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
+ {
+ nthreads_max = 2;
+ if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
+ if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
+ if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
+ }
+ else
+ {
+ if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
+ }
+ }
+ }
+#else
+ double MNK = (double) args.m * (double) args.n * (double) args.k;
+ if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+ nthreads_max = 1;
+ else
+ {
+ if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+ {
+ nthreads_max = 4;
+ if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
+ {
+ nthreads_max = 2;
+ if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
+ if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
+ if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
+ }
+ else
+ {
+ if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
+ }
+ }
}
+
+#endif
+ args.common = NULL;
+
+ if ( nthreads_max > nthreads_avail )
+ args.nthreads = nthreads_avail;
+ else
+ args.nthreads = nthreads_max;
+
+
if (args.nthreads == 1) {
#endif
-
+
(gemm[(transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0);
-
+
#ifdef SMP
-
+
} else {
-
+
#ifndef USE_SIMPLE_THREADED_LEVEL3
#ifndef NO_AFFINITY
nodes = get_num_nodes();
-
+
if ((nodes > 1) && get_node_equal()) {
-
+
args.nthreads /= nodes;
-
+
gemm_thread_mn(mode, &args, NULL, NULL, gemm[16 | (transb << 2) | transa], sa, sb, nodes);
-
+
} else {
#endif
@@ -436,21 +485,21 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#else
GEMM_THREAD(mode, &args, NULL, NULL, gemm[(transb << 2) | transa], sa, sb, args.nthreads);
-
+
#endif
-
+
#ifndef USE_SIMPLE_THREADED_LEVEL3
#ifndef NO_AFFINITY
}
#endif
#endif
-
+
#endif
-
+
#ifdef SMP
}
#endif
-
+
blas_memory_free(buffer);
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.k + args.k * args.n + args.m * args.n, 2 * args.m * args.n * args.k);
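
The gemm.c hunk above replaces the old per-dimension cut-off (any of m, n, k at or below GEMM_MULTITHREAD_THRESHOLD forced a single thread) with a heuristic on the product m*n*k. A stand-alone sketch of the real (non-complex) branch, assuming the default GEMM_MULTITHREAD_THRESHOLD of 4 and leaving the complex variant aside, could look like this:

    #include <stdio.h>

    #define GEMM_MULTITHREAD_THRESHOLD 4

    /* Sketch of the real-precision thread cap added in interface/gemm.c:
       tiny problems stay single-threaded, mid-sized ones are capped at 2 or
       4 threads, and only large m*n*k products keep all available cores.   */
    static int gemm_thread_cap(long m, long n, long k, int nthreads_avail)
    {
        double MNK = (double)m * (double)n * (double)k;
        int nthreads_max = nthreads_avail;

        if (MNK <= 1024.0 * GEMM_MULTITHREAD_THRESHOLD) {
            nthreads_max = 1;
        } else if (MNK <= 65536.0 * GEMM_MULTITHREAD_THRESHOLD) {
            nthreads_max = 4;
            if (m < 16 * GEMM_MULTITHREAD_THRESHOLD) {
                nthreads_max = 2;
                if (m < 3 * GEMM_MULTITHREAD_THRESHOLD) nthreads_max = 1;
                if (n < 1 * GEMM_MULTITHREAD_THRESHOLD) nthreads_max = 1;
                if (k < 3 * GEMM_MULTITHREAD_THRESHOLD) nthreads_max = 1;
            } else if (n <= 1 * GEMM_MULTITHREAD_THRESHOLD) {
                nthreads_max = 2;
            }
        }
        return (nthreads_max > nthreads_avail) ? nthreads_avail : nthreads_max;
    }

    int main(void)
    {
        /* 64x64x64 lands exactly on the 65536*4 boundary, so it is capped at
           4 threads even on an 8-core machine; 512x512x512 keeps all 8.     */
        printf("%d %d\n", gemm_thread_cap(64, 64, 64, 8),
                          gemm_thread_cap(512, 512, 512, 8));
        return 0;
    }

With the default threshold, anything whose m*n*k product exceeds 262144 keeps the full num_cpu_avail(3) count, exactly as in the hunk above.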
diff --git a/interface/gemv.c b/interface/gemv.c
index 9ea8aa8..562ceee 100644
--- a/interface/gemv.c
+++ b/interface/gemv.c
@@ -85,7 +85,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
GEMV_N, GEMV_T,
};
-
+
blasint info;
blasint lenx, leny;
blasint i;
@@ -109,7 +109,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
if (n < 0) info = 3;
if (m < 0) info = 2;
if (i < 0) info = 1;
-
+
trans = i;
if (info != 0){
@@ -150,7 +150,7 @@ void CNAME(enum CBLAS_ORDER order,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 0;
if (TransA == CblasConjTrans) trans = 1;
-
+
info = -1;
if (incy == 0) info = 11;
@@ -159,7 +159,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n < 0) info = 3;
if (m < 0) info = 2;
if (trans < 0) info = 1;
-
+
}
if (order == CblasRowMajor) {
@@ -198,7 +198,7 @@ void CNAME(enum CBLAS_ORDER order,
if (trans) leny = n;
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);
-
+
if (alpha == ZERO) return;
IDEBUG_START;
@@ -215,17 +215,17 @@ void CNAME(enum CBLAS_ORDER order,
if (nthreads == 1) {
#endif
-
+
(gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer);
-
+
#ifdef SMP
} else {
-
+
(gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads);
-
+
}
#endif
-
+
blas_memory_free(buffer);
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
diff --git a/interface/ger.c b/interface/ger.c
index 0218d94..9857d24 100644
--- a/interface/ger.c
+++ b/interface/ger.c
@@ -42,6 +42,12 @@
#include "functable.h"
#endif
+#ifdef SMP
+#ifdef __64BIT__
+#define SMPTEST 1
+#endif
+#endif
+
#ifdef XDOUBLE
#define ERROR_NAME "QGER "
#elif defined DOUBLE
@@ -75,7 +81,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY;
blasint lda = *LDA;
FLOAT *buffer;
-#ifdef SMP
+#ifdef SMPTEST
int nthreads;
#endif
@@ -107,7 +113,7 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint info, t;
-#ifdef SMP
+#ifdef SMPTEST
int nthreads;
#endif
@@ -157,7 +163,7 @@ void CNAME(enum CBLAS_ORDER order,
/* Quick return if possible. */
if (m == 0 || n == 0) return;
if (alpha == 0.) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -167,19 +173,20 @@ void CNAME(enum CBLAS_ORDER order,
buffer = (FLOAT *)blas_memory_alloc(1);
-#ifdef SMP
+#ifdef SMPTEST
nthreads = num_cpu_avail(2);
+
if (nthreads == 1) {
#endif
GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer);
-#ifdef SMP
+#ifdef SMPTEST
} else {
-
+
GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);
-
+
}
#endif
diff --git a/interface/imatcopy.c b/interface/imatcopy.c
new file mode 100644
index 0000000..3bc886f
--- /dev/null
+++ b/interface/imatcopy.c
@@ -0,0 +1,142 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/***********************************************************
+ * 2014/06/10 Saar
+***********************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#if defined(DOUBLE)
+#define ERROR_NAME "DIMATCOPY"
+#else
+#define ERROR_NAME "SIMATCOPY"
+#endif
+
+#define BlasRowMajor 0
+#define BlasColMajor 1
+#define BlasNoTrans 0
+#define BlasTrans 1
+
+#undef malloc
+#undef free
+
+void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb)
+{
+
+ char Order, Trans;
+ int order=-1,trans=-1;
+ blasint info = -1;
+ FLOAT *b;
+ size_t msize;
+
+ Order = *ORDER;
+ Trans = *TRANS;
+
+ TOUPPER(Order);
+ TOUPPER(Trans);
+
+ if ( Order == 'C' ) order = BlasColMajor;
+ if ( Order == 'R' ) order = BlasRowMajor;
+ if ( Trans == 'N' ) trans = BlasNoTrans;
+ if ( Trans == 'R' ) trans = BlasNoTrans;
+ if ( Trans == 'T' ) trans = BlasTrans;
+ if ( Trans == 'C' ) trans = BlasTrans;
+
+ if ( order == BlasColMajor)
+ {
+ if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
+ if ( trans == BlasTrans && *ldb < *cols ) info = 9;
+ }
+ if ( order == BlasRowMajor)
+ {
+ if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
+ if ( trans == BlasTrans && *ldb < *rows ) info = 9;
+ }
+
+ if ( order == BlasColMajor && *lda < *rows ) info = 7;
+ if ( order == BlasRowMajor && *lda < *cols ) info = 7;
+ if ( *cols <= 0 ) info = 4;
+ if ( *rows <= 0 ) info = 3;
+ if ( trans < 0 ) info = 2;
+ if ( order < 0 ) info = 1;
+
+ if (info >= 0) {
+ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+ return;
+ }
+
+ if ( *lda > *ldb )
+ msize = (*lda) * (*ldb) * sizeof(FLOAT);
+ else
+ msize = (*ldb) * (*ldb) * sizeof(FLOAT);
+
+ b = malloc(msize);
+ if ( b == NULL )
+ {
+ printf("Memory alloc failed\n");
+ exit(1);
+ }
+
+ if ( order == BlasColMajor )
+ {
+ if ( trans == BlasNoTrans )
+ {
+ OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb );
+ OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb );
+ }
+ else
+ {
+ OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb );
+ OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
+ }
+ }
+ else
+ {
+ if ( trans == BlasNoTrans )
+ {
+ OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb );
+ OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
+ }
+ else
+ {
+ OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb );
+ OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
+ }
+ }
+
+ free(b);
+ return;
+
+}
+
+
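
interface/imatcopy.c is new: it provides the ?imatcopy extension (in-place scaling plus optional transposition) by routing the data through a temporary buffer, with one OMATCOPY_K_* call applying alpha on the way out and a second copying the result back with alpha = 1. A minimal caller sketch, assuming double precision, the default 32-bit blasint, and the usual Fortran-mangled symbol dimatcopy_ (all three are assumptions about the build, not part of the patch):

    #include <stdio.h>

    /* Fortran-style prototype for the new in-place routine.  The symbol name
       (trailing underscore) and the use of int for blasint are assumptions
       about how the library was configured.                                 */
    extern void dimatcopy_(char *order, char *trans, int *rows, int *cols,
                           double *alpha, double *a, int *lda, int *ldb);

    int main(void)
    {
        /* 2x2 column-major matrix, transposed and scaled by 2.0 in place */
        double a[4] = { 1.0, 2.0,     /* column 0 */
                        3.0, 4.0 };   /* column 1 */
        int rows = 2, cols = 2, lda = 2, ldb = 2;
        double alpha = 2.0;
        char order = 'C', trans = 'T';

        dimatcopy_(&order, &trans, &rows, &cols, &alpha, a, &lda, &ldb);

        printf("%g %g %g %g\n", a[0], a[1], a[2], a[3]);   /* 2 6 4 8 */
        return 0;
    }

Unlike the rest of the interface layer, the scratch buffer here is obtained with plain malloc/free rather than blas_memory_alloc, and a failed allocation prints a message and exits.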
diff --git a/interface/imax.c b/interface/imax.c
index 37396c7..55ffa7c 100644
--- a/interface/imax.c
+++ b/interface/imax.c
@@ -121,7 +121,7 @@
#ifndef CBLAS
blasint NAME(blasint *N, FLOAT *x, blasint *INCX){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
blasint ret;
@@ -146,7 +146,7 @@ blasint NAME(blasint *N, FLOAT *x, blasint *INCX){
#else
CBLAS_INDEX CNAME(blasint n, FLOAT *x, blasint incx){
-
+
CBLAS_INDEX ret;
PRINT_DEBUG_CNAME;
diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c
index ce6bcbd..721da97 100644
--- a/interface/lapack/gesv.c
+++ b/interface/lapack/gesv.c
@@ -71,7 +71,7 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.m = *N;
@@ -121,18 +121,18 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
args.n = *N;
info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0);
-
+
if (info == 0){
args.n = *NRHS;
GETRS_N_SINGLE(&args, NULL, NULL, sa, sb, 0);
}
-
+
#ifdef SMP
} else {
args.n = *N;
info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0);
-
+
if (info == 0){
args.n = *NRHS;
GETRS_N_PARALLEL(&args, NULL, NULL, sa, sb, 0);
diff --git a/interface/lapack/getf2.c b/interface/lapack/getf2.c
index cae1595..3e66c04 100644
--- a/interface/lapack/getf2.c
+++ b/interface/lapack/getf2.c
@@ -60,7 +60,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.m = *M;
@@ -81,7 +81,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
*Info = 0;
if (args.m == 0 || args.n == 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c
index aa799e8..44a92dd 100644
--- a/interface/lapack/getrf.c
+++ b/interface/lapack/getrf.c
@@ -53,14 +53,14 @@
int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){
blas_arg_t args;
-
+
blasint info;
FLOAT *buffer;
#ifdef PPC440
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.m = *M;
diff --git a/interface/lapack/getrs.c b/interface/lapack/getrs.c
index 761a001..1b8c83a 100644
--- a/interface/lapack/getrs.c
+++ b/interface/lapack/getrs.c
@@ -105,7 +105,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return 0;
}
-
+
args.alpha = NULL;
args.beta = NULL;
@@ -148,5 +148,5 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
IDEBUG_END;
return 0;
-
+
}
diff --git a/interface/lapack/larf.c.obsolete b/interface/lapack/larf.c.obsolete
index 3b538c4..5e62c40 100644
--- a/interface/lapack/larf.c.obsolete
+++ b/interface/lapack/larf.c.obsolete
@@ -58,7 +58,7 @@ int NAME(char *SIDE, blasint *M, blasint *N, FLOAT *v, blasint *incV, FLOAT *tau
char side_arg = *SIDE;
int side;
-
+
PRINT_DEBUG_NAME;
TOUPPER(side_arg);
@@ -77,7 +77,7 @@ int NAME(char *SIDE, blasint *M, blasint *N, FLOAT *v, blasint *incV, FLOAT *tau
if (side_arg == 'R') side = 1;
if (args.m == 0 || args.n == 0) return 0;
-
+
#ifndef COMPLEX
if (*tau == ZERO) return 0;
#else
diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c
index 026b515..ebeb103 100644
--- a/interface/lapack/laswp.c
+++ b/interface/lapack/laswp.c
@@ -53,7 +53,7 @@ static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FL
};
int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){
-
+
blasint n = *N;
blasint lda = *LDA;
blasint k1 = *K1;
@@ -93,10 +93,10 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
- blas_level1_thread(mode, n, k1, k2, dummyalpha,
- a, lda, NULL, 0, ipiv, incx,
+ blas_level1_thread(mode, n, k1, k2, dummyalpha,
+ a, lda, NULL, 0, ipiv, incx,
laswp[flag], nthreads);
}
#endif
@@ -106,5 +106,5 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
IDEBUG_END;
return 0;
-
+
}
diff --git a/interface/lapack/lauu2.c b/interface/lapack/lauu2.c
index 14417e9..3599a47 100644
--- a/interface/lapack/lauu2.c
+++ b/interface/lapack/lauu2.c
@@ -72,7 +72,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
@@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n <= 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/lapack/lauum.c b/interface/lapack/lauum.c
index e5b593f..2c49eb0 100644
--- a/interface/lapack/lauum.c
+++ b/interface/lapack/lauum.c
@@ -78,7 +78,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
args.n = *N;
args.a = (void *)a;
args.lda = *ldA;
-
+
TOUPPER(uplo_arg);
uplo = -1;
@@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n == 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -118,7 +118,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#endif
*Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0);
-
+
#ifdef SMP
} else {
*Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0);
diff --git a/interface/lapack/potf2.c b/interface/lapack/potf2.c
index 76822a4..8371922 100644
--- a/interface/lapack/potf2.c
+++ b/interface/lapack/potf2.c
@@ -72,7 +72,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
@@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n <= 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c
index 9a15012..0922722 100644
--- a/interface/lapack/potrf.c
+++ b/interface/lapack/potrf.c
@@ -72,7 +72,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
@@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n == 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c
index a4f3322..d623062 100644
--- a/interface/lapack/potri.c
+++ b/interface/lapack/potri.c
@@ -80,7 +80,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
@@ -107,7 +107,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n == 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -137,11 +137,11 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#ifdef SMP
} else {
info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0);
-
+
if (!info) {
info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0);
}
-
+
*Info = info;
}
#endif
diff --git a/interface/lapack/trti2.c b/interface/lapack/trti2.c
index e119b45..42c4c48 100644
--- a/interface/lapack/trti2.c
+++ b/interface/lapack/trti2.c
@@ -73,13 +73,13 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
args.a = (void *)a;
args.lda = *ldA;
-
+
TOUPPER(uplo_arg);
TOUPPER(diag_arg);
@@ -92,7 +92,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
info = 0;
if (args.lda < MAX(1,args.n)) info = 5;
- if (args.n < 0) info = 3;
+ if (args.n < 0) info = 3;
if (diag < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
@@ -104,7 +104,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
*Info = 0;
if (args.n <= 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/lapack/trtri.c b/interface/lapack/trtri.c
index 5aa5e9b..6724a67 100644
--- a/interface/lapack/trtri.c
+++ b/interface/lapack/trtri.c
@@ -74,7 +74,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
@@ -95,7 +95,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
info = 0;
if (args.lda < MAX(1,args.n)) info = 5;
- if (args.n < 0) info = 3;
+ if (args.n < 0) info = 3;
if (diag < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
@@ -107,7 +107,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
*Info = 0;
if (args.n == 0) return 0;
-
+
if (diag) {
if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) {
*Info = IAMIN_K(args.n, args.a, args.lda + 1);
@@ -133,12 +133,12 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
#endif
*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
-
+
#ifdef SMP
} else {
*Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
-
+
}
#endif
diff --git a/interface/lapack/zgetf2.c b/interface/lapack/zgetf2.c
index 950ef46..59ec487 100644
--- a/interface/lapack/zgetf2.c
+++ b/interface/lapack/zgetf2.c
@@ -60,7 +60,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.m = *M;
diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c
index 9f041d9..5031f58 100644
--- a/interface/lapack/zgetrf.c
+++ b/interface/lapack/zgetrf.c
@@ -60,7 +60,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.m = *M;
diff --git a/interface/lapack/zgetrs.c b/interface/lapack/zgetrs.c
index 81d50e3..54d4b09 100644
--- a/interface/lapack/zgetrs.c
+++ b/interface/lapack/zgetrs.c
@@ -105,7 +105,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return 0;
}
-
+
args.alpha = NULL;
args.beta = NULL;
@@ -139,7 +139,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
}
#endif
-
+
#ifndef PPC440
blas_memory_free(buffer);
#endif
@@ -149,5 +149,5 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
IDEBUG_END;
return 0;
-
+
}
diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c
index 85ead2c..31e0845 100644
--- a/interface/lapack/zlaswp.c
+++ b/interface/lapack/zlaswp.c
@@ -53,7 +53,7 @@ static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASL
};
int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){
-
+
blasint n = *N;
blasint lda = *LDA;
blasint k1 = *K1;
@@ -94,7 +94,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads);
}
diff --git a/interface/lapack/zlauu2.c b/interface/lapack/zlauu2.c
index 05603fe..b0698ef 100644
--- a/interface/lapack/zlauu2.c
+++ b/interface/lapack/zlauu2.c
@@ -79,7 +79,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
args.n = *N;
args.a = (void *)a;
args.lda = *ldA;
-
+
TOUPPER(uplo_arg);
uplo = -1;
@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n <= 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/lapack/zlauum.c b/interface/lapack/zlauum.c
index 23990e8..4a36cc1 100644
--- a/interface/lapack/zlauum.c
+++ b/interface/lapack/zlauum.c
@@ -78,7 +78,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
args.n = *N;
args.a = (void *)a;
args.lda = *ldA;
-
+
TOUPPER(uplo_arg);
uplo = -1;
@@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n == 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -118,7 +118,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#endif
*Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0);
-
+
#ifdef SMP
} else {
diff --git a/interface/lapack/zpotf2.c b/interface/lapack/zpotf2.c
index f8f81e2..27ee089 100644
--- a/interface/lapack/zpotf2.c
+++ b/interface/lapack/zpotf2.c
@@ -79,7 +79,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
args.n = *N;
args.a = (void *)a;
args.lda = *ldA;
-
+
TOUPPER(uplo_arg);
uplo = -1;
@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n <= 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c
index e2004d7..8cd3980 100644
--- a/interface/lapack/zpotrf.c
+++ b/interface/lapack/zpotrf.c
@@ -78,7 +78,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
args.n = *N;
args.a = (void *)a;
args.lda = *ldA;
-
+
TOUPPER(uplo_arg);
uplo = -1;
@@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n == 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -115,7 +115,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
args.nthreads = num_cpu_avail(4);
if (args.nthreads == 1) {
-#endif
+#endif
*Info = (potrf_single[uplo])(&args, NULL, NULL, sa, sb, 0);
diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c
index b777c11..7c72a7e 100644
--- a/interface/lapack/zpotri.c
+++ b/interface/lapack/zpotri.c
@@ -80,7 +80,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
@@ -107,7 +107,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
*Info = 0;
if (args.n == 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -136,11 +136,11 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#ifdef SMP
} else {
info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0);
-
+
if (!info) {
info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0);
}
-
+
*Info = info;
}
#endif
diff --git a/interface/lapack/ztrti2.c b/interface/lapack/ztrti2.c
index 017374c..a254766 100644
--- a/interface/lapack/ztrti2.c
+++ b/interface/lapack/ztrti2.c
@@ -73,13 +73,13 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
args.a = (void *)a;
args.lda = *ldA;
-
+
TOUPPER(uplo_arg);
TOUPPER(diag_arg);
@@ -92,7 +92,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
info = 0;
if (args.lda < MAX(1,args.n)) info = 5;
- if (args.n < 0) info = 3;
+ if (args.n < 0) info = 3;
if (diag < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
@@ -104,7 +104,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
*Info = 0;
if (args.n <= 0) return 0;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/lapack/ztrtri.c b/interface/lapack/ztrtri.c
index 89caf80..b3ce85b 100644
--- a/interface/lapack/ztrtri.c
+++ b/interface/lapack/ztrtri.c
@@ -73,13 +73,13 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
extern
#endif
FLOAT *sa, *sb;
-
+
PRINT_DEBUG_NAME;
args.n = *N;
args.a = (void *)a;
args.lda = *ldA;
-
+
TOUPPER(uplo_arg);
TOUPPER(diag_arg);
@@ -92,7 +92,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
info = 0;
if (args.lda < MAX(1,args.n)) info = 5;
- if (args.n < 0) info = 3;
+ if (args.n < 0) info = 3;
if (diag < 0) info = 2;
if (uplo < 0) info = 1;
if (info) {
@@ -104,7 +104,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
*Info = 0;
if (args.n == 0) return 0;
-
+
if (diag) {
if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) {
*Info = IAMIN_K(args.n, args.a, args.lda + 1);
@@ -131,12 +131,12 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
#endif
*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
-
+
#ifdef SMP
} else {
*Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
-
+
}
#endif
diff --git a/interface/max.c b/interface/max.c
index 9bedadd..f059774 100644
--- a/interface/max.c
+++ b/interface/max.c
@@ -121,7 +121,7 @@
#ifndef CBLAS
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
FLOATRET ret;
@@ -146,7 +146,7 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
-
+
FLOAT ret;
PRINT_DEBUG_CNAME;
diff --git a/interface/nrm2.c b/interface/nrm2.c
index ff8ef6d..cb4c8f6 100644
--- a/interface/nrm2.c
+++ b/interface/nrm2.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
FLOATRET ret;
@@ -70,7 +70,7 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
-
+
FLOAT ret;
PRINT_DEBUG_CNAME;
diff --git a/interface/omatcopy.c b/interface/omatcopy.c
new file mode 100644
index 0000000..0c418b3
--- /dev/null
+++ b/interface/omatcopy.c
@@ -0,0 +1,120 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/***********************************************************
+ * 2014/06/09 Saar
+***********************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#if defined(DOUBLE)
+#define ERROR_NAME "DOMATCOPY"
+#else
+#define ERROR_NAME "SOMATCOPY"
+#endif
+
+#define BlasRowMajor 0
+#define BlasColMajor 1
+#define BlasNoTrans 0
+#define BlasTrans 1
+
+void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb)
+{
+
+ char Order, Trans;
+ int order=-1,trans=-1;
+ blasint info = -1;
+
+ Order = *ORDER;
+ Trans = *TRANS;
+
+ TOUPPER(Order);
+ TOUPPER(Trans);
+
+ if ( Order == 'C' ) order = BlasColMajor;
+ if ( Order == 'R' ) order = BlasRowMajor;
+ if ( Trans == 'N' ) trans = BlasNoTrans;
+ if ( Trans == 'R' ) trans = BlasNoTrans;
+ if ( Trans == 'T' ) trans = BlasTrans;
+ if ( Trans == 'C' ) trans = BlasTrans;
+
+ if ( order == BlasColMajor)
+ {
+ if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
+ if ( trans == BlasTrans && *ldb < *cols ) info = 9;
+ }
+ if ( order == BlasRowMajor)
+ {
+ if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
+ if ( trans == BlasTrans && *ldb < *rows ) info = 9;
+ }
+
+ if ( order == BlasColMajor && *lda < *rows ) info = 7;
+ if ( order == BlasRowMajor && *lda < *cols ) info = 7;
+ if ( *cols <= 0 ) info = 4;
+ if ( *rows <= 0 ) info = 3;
+ if ( trans < 0 ) info = 2;
+ if ( order < 0 ) info = 1;
+
+ if (info >= 0) {
+ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+ return;
+ }
+
+ if ( order == BlasColMajor )
+ {
+ if ( trans == BlasNoTrans )
+ {
+ OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb );
+ }
+ else
+ {
+ OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb );
+ }
+ }
+ else
+ {
+ if ( trans == BlasNoTrans )
+ {
+ OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb );
+ }
+ else
+ {
+ OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb );
+ }
+ }
+
+ return;
+
+}
+
+
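
interface/omatcopy.c is the out-of-place counterpart: B := alpha * op(A), with the kernel chosen from OMATCOPY_K_{CN,CT,RN,RT} according to the storage order and whether op is a transpose. A short caller sketch under the same assumptions as above (double precision, 32-bit blasint, Fortran-style domatcopy_ symbol):

    #include <stdio.h>

    /* Assumed Fortran-style symbol for the out-of-place copy/transpose;
       int again stands in for the default 32-bit blasint.               */
    extern void domatcopy_(char *order, char *trans, int *rows, int *cols,
                           double *alpha, double *a, int *lda,
                           double *b, int *ldb);

    int main(void)
    {
        double a[6] = { 1, 2, 3, 4, 5, 6 };  /* 3x2, column-major */
        double b[6] = { 0 };                 /* receives the 2x3 result alpha*A^T */
        int rows = 3, cols = 2, lda = 3, ldb = 2;
        double alpha = 1.0;
        char order = 'C', trans = 'T';

        domatcopy_(&order, &trans, &rows, &cols, &alpha, a, &lda, b, &ldb);

        printf("%g %g %g %g %g %g\n", b[0], b[1], b[2], b[3], b[4], b[5]);
        /* expected: 1 4 2 5 3 6 */
        return 0;
    }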
diff --git a/interface/rot.c b/interface/rot.c
index 2e458b1..125275a 100644
--- a/interface/rot.c
+++ b/interface/rot.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
@@ -78,5 +78,5 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, F
IDEBUG_END;
return;
-
+
}
diff --git a/interface/rotm.c b/interface/rotm.c
index 4f026c7..9dc0835 100644
--- a/interface/rotm.c
+++ b/interface/rotm.c
@@ -18,13 +18,13 @@ void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *d
#endif
blasint i__1, i__2;
-
+
blasint i__;
FLOAT w, z__;
blasint kx, ky;
FLOAT dh11, dh12, dh22, dh21, dflag;
blasint nsteps;
-
+
#ifndef CBLAS
PRINT_DEBUG_CNAME;
#else
@@ -34,7 +34,7 @@ void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *d
--dparam;
--dy;
--dx;
-
+
dflag = dparam[1];
if (n <= 0 || dflag == - 2.0) goto L140;
diff --git a/interface/rotmg.c b/interface/rotmg.c
index 4dbb580..1c41e14 100644
--- a/interface/rotmg.c
+++ b/interface/rotmg.c
@@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**************************************************************************************
* 2014/05/02 Saar
-* fixed two bugs as reported by Brendan Tracey
-* Test with lapack-3.5.0 : OK
+* fixed two bugs as reported by Brendan Tracey
+* Test with lapack-3.5.0 : OK
*
**************************************************************************************/
@@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
#endif
- FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp;
+ FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;
if(*dd1 < ZERO)
{
@@ -108,7 +108,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
if(dq2 < ZERO)
{
dflag = -ONE;
-
+
dh11 = ZERO;
dh12 = ZERO;
dh21 = ZERO;
@@ -130,7 +130,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
*dd2 = *dd1 / du;
*dd1 = dtemp;
*dx1 = dy1 * du;
- }
+ }
}
@@ -169,7 +169,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
}
}
}
-
+
if(*dd2 != ZERO)
{
while( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) )
@@ -203,7 +203,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
}
}
}
-
+
}
if(dflag < ZERO)
diff --git a/interface/sbmv.c b/interface/sbmv.c
index c481d56..761a9a0 100644
--- a/interface/sbmv.c
+++ b/interface/sbmv.c
@@ -43,6 +43,14 @@
#include "functable.h"
#endif
+/*
+#ifdef SMP
+#ifdef __64BIT__
+#define SMPTEST 1
+#endif
+#endif
+*/
+
#ifdef XDOUBLE
#define ERROR_NAME "QSBMV "
#elif defined(DOUBLE)
@@ -61,7 +69,7 @@ static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLA
#endif
};
-#ifdef SMPBUG
+#ifdef SMPTEST
static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = {
#ifdef XDOUBLE
qsbmv_thread_U, qsbmv_thread_L,
@@ -75,7 +83,7 @@ static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
+void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){
char uplo_arg = *UPLO;
@@ -90,7 +98,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
blasint info;
int uplo;
FLOAT *buffer;
-#ifdef SMPBUG
+#ifdef SMPTEST
int nthreads;
#endif
@@ -101,7 +109,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 11;
@@ -115,7 +123,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order,
@@ -130,7 +138,7 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
int uplo;
blasint info;
-#ifdef SMPBUG
+#ifdef SMPTEST
int nthreads;
#endif
@@ -142,7 +150,7 @@ void CNAME(enum CBLAS_ORDER order,
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
-
+
info = -1;
if (incy == 0) info = 11;
@@ -189,7 +197,7 @@ void CNAME(enum CBLAS_ORDER order,
buffer = (FLOAT *)blas_memory_alloc(1);
-#ifdef SMPBUG
+#ifdef SMPTEST
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
@@ -197,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order,
(sbmv[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer);
-#ifdef SMPBUG
+#ifdef SMPTEST
} else {
(sbmv_thread[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer, nthreads);
diff --git a/interface/scal.c b/interface/scal.c
index 7b72ca0..3f468a2 100644
--- a/interface/scal.c
+++ b/interface/scal.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){
-
+
blasint n = *N;
blasint incx = *INCX;
FLOAT alpha = *ALPHA;
@@ -53,7 +53,7 @@ void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){
#else
void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
-
+
#endif
#ifdef SMP
@@ -78,6 +78,9 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
#ifdef SMP
nthreads = num_cpu_avail(1);
+ if (n <= 1048576 )
+ nthreads = 1;
+
if (nthreads == 1) {
#endif
@@ -90,11 +93,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
-
+#endif
+
blas_level1_thread(mode, n, 0, 0,
#ifndef CBLAS
- ALPHA,
+ ALPHA,
#else
&alpha,
#endif
@@ -108,5 +111,5 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
IDEBUG_END;
return;
-
+
}
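
The scal.c change adds the same kind of small-size guard that axpy.c carries at the top of this patch: x := alpha*x is memory-bound, so the threaded path is now skipped entirely unless the vector has more than 1048576 elements. Distilled into a tiny helper, with num_threads standing in for num_cpu_avail(1):

    #include <stdio.h>

    /* Thread gate added to interface/scal.c: the scaling stays on one thread
       below roughly one million elements.                                   */
    static int scal_nthreads(long n, int num_threads)
    {
        return (n <= 1048576) ? 1 : num_threads;
    }

    int main(void)
    {
        printf("%d %d\n", scal_nthreads(1000, 8), scal_nthreads(2000000, 8)); /* 1 8 */
        return 0;
    }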
diff --git a/interface/sdsdot.c b/interface/sdsdot.c
index 168468c..6c457fa 100644
--- a/interface/sdsdot.c
+++ b/interface/sdsdot.c
@@ -45,14 +45,14 @@
#ifndef CBLAS
FLOATRET NAME(blasint *N, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
FLOATRET ret;
PRINT_DEBUG_NAME;
-
+
if (n <= 0) return(*a) ;
IDEBUG_START;
@@ -69,13 +69,13 @@ FLOATRET NAME(blasint *N, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *y, blasint *
IDEBUG_END;
return ret;
-
+
}
#else
FLOAT CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
-
+
FLOAT ret;
PRINT_DEBUG_CNAME;
diff --git a/interface/spmv.c b/interface/spmv.c
index 3f853e5..403458b 100644
--- a/interface/spmv.c
+++ b/interface/spmv.c
@@ -99,7 +99,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 9;
@@ -111,7 +111,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order,
@@ -138,7 +138,7 @@ void CNAME(enum CBLAS_ORDER order,
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
-
+
info = -1;
if (incy == 0) info = 9;
diff --git a/interface/spr.c b/interface/spr.c
index aa2ff8f..1956986 100644
--- a/interface/spr.c
+++ b/interface/spr.c
@@ -75,7 +75,7 @@ static int (*spr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *,
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *a){
char uplo_arg = *UPLO;
@@ -97,7 +97,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incx == 0) info = 5;
@@ -108,7 +108,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order,
@@ -133,7 +133,7 @@ void CNAME(enum CBLAS_ORDER order,
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
-
+
info = -1;
if (incx == 0) info = 5;
diff --git a/interface/spr2.c b/interface/spr2.c
index e556d3f..73a811c 100644
--- a/interface/spr2.c
+++ b/interface/spr2.c
@@ -75,7 +75,7 @@ static int (*spr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLON
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){
char uplo_arg = *UPLO;
@@ -98,7 +98,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 7;
@@ -110,7 +110,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order,
@@ -136,7 +136,7 @@ void CNAME(enum CBLAS_ORDER order,
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
-
+
info = -1;
if (incy == 0) info = 7;
diff --git a/interface/swap.c b/interface/swap.c
index 271fa08..3baeb27 100644
--- a/interface/swap.c
+++ b/interface/swap.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
-
+
blasint n = *N;
blasint incx = *INCX;
blasint incy = *INCY;
@@ -78,12 +78,12 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
#ifdef SMP
nthreads = num_cpu_avail(1);
-
+
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;
-
+
if (nthreads == 1) {
#endif
@@ -91,15 +91,15 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
#ifdef SMP
} else {
-
+
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
-
+#endif
+
blas_level1_thread(mode, n, 0, 0, dummyalpha,
x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads);
}
@@ -111,5 +111,5 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
IDEBUG_END;
return;
-
+
}
diff --git a/interface/symm.c b/interface/symm.c
index b447f13..959a4eb 100644
--- a/interface/symm.c
+++ b/interface/symm.c
@@ -121,12 +121,12 @@ static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA
#ifndef CBLAS
-void NAME(char *SIDE, char *UPLO,
- blasint *M, blasint *N,
- FLOAT *alpha, FLOAT *a, blasint *ldA,
- FLOAT *b, blasint *ldB,
+void NAME(char *SIDE, char *UPLO,
+ blasint *M, blasint *N,
+ FLOAT *alpha, FLOAT *a, blasint *ldA,
+ FLOAT *b, blasint *ldB,
FLOAT *beta, FLOAT *c, blasint *ldC){
-
+
char side_arg = *SIDE;
char uplo_arg = *UPLO;
@@ -143,7 +143,7 @@ void NAME(char *SIDE, char *UPLO,
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -179,13 +179,13 @@ void NAME(char *SIDE, char *UPLO,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
args.m = *M;
args.n = *N;
args.c = (void *)c;
args.ldc = *ldC;
-
+
info = 0;
if (args.ldc < MAX(1, args.m)) info = 12;
@@ -193,17 +193,17 @@ void NAME(char *SIDE, char *UPLO,
if (!side) {
args.a = (void *)a;
args.b = (void *)b;
-
+
args.lda = *ldA;
args.ldb = *ldB;
-
+
if (args.ldb < MAX(1, args.m)) info = 9;
if (args.lda < MAX(1, args.m)) info = 7;
} else {
args.a = (void *)b;
args.b = (void *)a;
-
+
args.lda = *ldB;
args.ldb = *ldA;
@@ -220,7 +220,7 @@ void NAME(char *SIDE, char *UPLO,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
@@ -254,7 +254,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -262,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -304,24 +304,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
if (!side) {
args.a = (void *)a;
args.b = (void *)b;
-
+
args.lda = lda;
args.ldb = ldb;
-
+
if (args.ldb < MAX(1, args.m)) info = 9;
if (args.lda < MAX(1, args.m)) info = 7;
-
+
} else {
args.a = (void *)b;
args.b = (void *)a;
-
+
args.lda = ldb;
args.ldb = lda;
-
+
if (args.lda < MAX(1, args.m)) info = 9;
if (args.ldb < MAX(1, args.n)) info = 7;
}
-
+
if (args.n < 0) info = 4;
if (args.m < 0) info = 3;
if (uplo < 0) info = 2;
@@ -345,24 +345,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
if (!side) {
args.a = (void *)a;
args.b = (void *)b;
-
+
args.lda = lda;
args.ldb = ldb;
-
+
if (args.ldb < MAX(1, args.m)) info = 9;
if (args.lda < MAX(1, args.m)) info = 7;
-
+
} else {
args.a = (void *)b;
args.b = (void *)a;
-
+
args.lda = ldb;
args.ldb = lda;
-
+
if (args.lda < MAX(1, args.m)) info = 9;
if (args.ldb < MAX(1, args.n)) info = 7;
}
-
+
if (args.n < 0) info = 4;
if (args.m < 0) info = 3;
if (uplo < 0) info = 2;
@@ -383,10 +383,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
FUNCTION_PROFILE_START();
buffer = (FLOAT *)blas_memory_alloc(0);
-
+
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-
+
#ifdef SMP
args.common = NULL;
args.nthreads = num_cpu_avail(3);
@@ -402,25 +402,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
#ifndef NO_AFFINITY
nodes = get_num_nodes();
-
+
if (nodes > 1) {
-
+
args.nthreads /= nodes;
-
- gemm_thread_mn(mode, &args, NULL, NULL,
+
+ gemm_thread_mn(mode, &args, NULL, NULL,
symm[4 | (side << 1) | uplo ], sa, sb, nodes);
-
+
} else {
#endif
#ifndef USE_SIMPLE_THREADED_LEVEL3
-
+
(symm[4 | (side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0);
-
+
#else
-
+
GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
-
+
#endif
#ifndef NO_AFFINITY
diff --git a/interface/symv.c b/interface/symv.c
index e8c24df..e4e300e 100644
--- a/interface/symv.c
+++ b/interface/symv.c
@@ -53,7 +53,7 @@
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){
char uplo_arg = *UPLO;
@@ -67,7 +67,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
SYMV_U, SYMV_L,
};
-
+
#ifdef SMP
int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = {
SYMV_THREAD_U, SYMV_THREAD_L,
@@ -88,7 +88,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 10;
@@ -101,10 +101,10 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
-void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
+void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) {
FLOAT *buffer;
diff --git a/interface/syr.c b/interface/syr.c
index 2b2d3d1..b29a81e 100644
--- a/interface/syr.c
+++ b/interface/syr.c
@@ -75,7 +75,7 @@ static int (*syr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){
char uplo_arg = *UPLO;
@@ -98,7 +98,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (lda < MAX(1, n)) info = 7;
@@ -139,7 +139,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
if (incx == 0) info = 5;
if (n < 0) info = 2;
if (uplo < 0) info = 1;
-
+
}
if (order == CblasRowMajor) {
@@ -161,7 +161,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
}
#endif
-
+
if (n == 0) return;
if (alpha == ZERO) return;
diff --git a/interface/syr2.c b/interface/syr2.c
index 15dbae4..006567c 100644
--- a/interface/syr2.c
+++ b/interface/syr2.c
@@ -75,7 +75,7 @@ static int (*syr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLON
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){
char uplo_arg = *UPLO;
@@ -99,7 +99,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (lda < MAX(1, n)) info = 9;
@@ -164,7 +164,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
}
#endif
-
+
if (n == 0) return;
if (alpha == ZERO) return;
@@ -188,7 +188,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
#ifdef SMP
} else {
-
+
(syr2_thread[uplo])(n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);
}
diff --git a/interface/syr2k.c b/interface/syr2k.c
index 381e088..bfa5d8b 100644
--- a/interface/syr2k.c
+++ b/interface/syr2k.c
@@ -82,11 +82,11 @@ static int (*syr2k[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BL
#ifndef CBLAS
void NAME(char *UPLO, char *TRANS,
- blasint *N, blasint *K,
- FLOAT *alpha, FLOAT *a, blasint *ldA,
+ blasint *N, blasint *K,
+ FLOAT *alpha, FLOAT *a, blasint *ldA,
FLOAT *b, blasint *ldB,
FLOAT *beta, FLOAT *c, blasint *ldC){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
@@ -103,7 +103,7 @@ void NAME(char *UPLO, char *TRANS,
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -111,7 +111,7 @@ void NAME(char *UPLO, char *TRANS,
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -138,7 +138,7 @@ void NAME(char *UPLO, char *TRANS,
TOUPPER(uplo_arg);
TOUPPER(trans_arg);
-
+
uplo = -1;
trans = -1;
@@ -150,7 +150,7 @@ void NAME(char *UPLO, char *TRANS,
if (trans_arg == 'T') trans = 1;
if (trans_arg == 'C') trans = 1;
#else
-#ifdef HEMM
+#ifdef HEMM
if (trans_arg == 'N') trans = 0;
if (trans_arg == 'C') trans = 1;
#else
@@ -160,7 +160,7 @@ void NAME(char *UPLO, char *TRANS,
#endif
-
+
nrowa = args.n;
if (trans & 1) nrowa = args.k;
@@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -224,7 +224,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -273,10 +273,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
info = -1;
-
+
nrowa = args.n;
if (trans & 1) nrowa = args.k;
-
+
if (args.ldc < MAX(1,args.n)) info = 12;
if (args.ldb < MAX(1,nrowa)) info = 9;
if (args.lda < MAX(1,nrowa)) info = 7;
@@ -291,7 +291,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#ifdef HEMM
CAlpha[0] = alpha[0];
CAlpha[1] = -alpha[1];
-
+
args.alpha = (void *)CAlpha;
#endif
@@ -310,10 +310,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
info = -1;
-
+
nrowa = args.n;
if (trans & 1) nrowa = args.k;
-
+
if (args.ldc < MAX(1,args.n)) info = 12;
if (args.ldb < MAX(1,nrowa)) info = 9;
if (args.lda < MAX(1,nrowa)) info = 7;
@@ -331,16 +331,16 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
if (args.n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
buffer = (FLOAT *)blas_memory_alloc(0);
-
+
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-
+
#ifdef SMP
if (!trans){
mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T);
@@ -357,18 +357,18 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
(syr2k[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0);
-
+
#ifdef SMP
} else {
syrk_thread(mode, &args, NULL, NULL, syr2k[(uplo << 1) | trans ], sa, sb, args.nthreads);
-
+
}
#endif
-
+
blas_memory_free(buffer);
-
+
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, 2 * args.n * args.k + args.n * args.n, 2 * args.n * args.n * args.k);
IDEBUG_END;
diff --git a/interface/syrk.c b/interface/syrk.c
index 072cc86..f8c6970 100644
--- a/interface/syrk.c
+++ b/interface/syrk.c
@@ -88,10 +88,10 @@ static int (*syrk[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA
#ifndef CBLAS
void NAME(char *UPLO, char *TRANS,
- blasint *N, blasint *K,
- FLOAT *alpha, FLOAT *a, blasint *ldA,
+ blasint *N, blasint *K,
+ FLOAT *alpha, FLOAT *a, blasint *ldA,
FLOAT *beta, FLOAT *c, blasint *ldC){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
@@ -108,7 +108,7 @@ void NAME(char *UPLO, char *TRANS,
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -116,7 +116,7 @@ void NAME(char *UPLO, char *TRANS,
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -154,7 +154,7 @@ void NAME(char *UPLO, char *TRANS,
if (trans_arg == 'T') trans = 1;
if (trans_arg == 'C') trans = 1;
#else
-#ifdef HEMM
+#ifdef HEMM
if (trans_arg == 'N') trans = 0;
if (trans_arg == 'C') trans = 1;
#else
@@ -163,7 +163,7 @@ void NAME(char *UPLO, char *TRANS,
#endif
#endif
-
+
nrowa = args.n;
if (trans & 1) nrowa = args.k;
@@ -180,7 +180,7 @@ void NAME(char *UPLO, char *TRANS,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
@@ -192,9 +192,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
FLOAT *a, blasint lda,
#if !defined(COMPLEX) || defined(HEMM)
- FLOAT beta,
+ FLOAT beta,
#else
- FLOAT *beta,
+ FLOAT *beta,
#endif
FLOAT *c, blasint ldc) {
@@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -264,10 +264,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
info = -1;
-
+
nrowa = args.n;
if (trans & 1) nrowa = args.k;
-
+
if (args.ldc < MAX(1,args.n)) info = 10;
if (args.lda < MAX(1,nrowa)) info = 7;
if (args.k < 0) info = 4;
@@ -292,10 +292,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
info = -1;
-
+
nrowa = args.n;
if (trans & 1) nrowa = args.k;
-
+
if (args.ldc < MAX(1,args.n)) info = 10;
if (args.lda < MAX(1,nrowa)) info = 7;
if (args.k < 0) info = 4;
@@ -312,25 +312,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
if (args.n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
buffer = (FLOAT *)blas_memory_alloc(0);
-
+
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-
+
#ifdef SMP
if (!trans){
mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T);
} else {
mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N);
}
-
+
mode |= (uplo << BLAS_UPLO_SHIFT);
-
+
args.common = NULL;
args.nthreads = num_cpu_avail(3);
@@ -344,13 +344,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
} else {
#ifndef USE_SIMPLE_THREADED_LEVEL3
-
+
(syrk[4 | (uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0);
-
+
#else
-
+
syrk_thread(mode, &args, NULL, NULL, syrk[(uplo << 1) | trans ], sa, sb, args.nthreads);
-
+
#endif
}
diff --git a/interface/tbmv.c b/interface/tbmv.c
index cec2be4..b5f3ab7 100644
--- a/interface/tbmv.c
+++ b/interface/tbmv.c
@@ -82,13 +82,13 @@ static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLAS
#ifndef CBLAS
void NAME(char *UPLO, char *TRANS, char *DIAG,
- blasint *N, blasint *K,
+ blasint *N, blasint *K,
FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint k = *K;
blasint lda = *LDA;
@@ -167,7 +167,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 0;
if (TransA == CblasConjTrans) trans = 1;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -211,9 +211,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
#endif
-
+
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/tbsv.c b/interface/tbsv.c
index a07c4c5..12a1eb0 100644
--- a/interface/tbsv.c
+++ b/interface/tbsv.c
@@ -67,13 +67,13 @@ static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, v
#ifndef CBLAS
void NAME(char *UPLO, char *TRANS, char *DIAG,
- blasint *N, blasint *K,
+ blasint *N, blasint *K,
FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint k = *K;
blasint lda = *LDA;
@@ -146,7 +146,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 0;
if (TransA == CblasConjTrans) trans = 1;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -190,9 +190,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
#endif
-
+
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/tpmv.c b/interface/tpmv.c
index f0fc4f7..edf0104 100644
--- a/interface/tpmv.c
+++ b/interface/tpmv.c
@@ -83,11 +83,11 @@ static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int)
void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint incx = *INCX;
@@ -133,7 +133,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -162,7 +162,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 0;
if (TransA == CblasConjTrans) trans = 1;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -202,9 +202,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
#endif
-
+
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/tpsv.c b/interface/tpsv.c
index 9dafd0b..58be77c 100644
--- a/interface/tpsv.c
+++ b/interface/tpsv.c
@@ -68,11 +68,11 @@ static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = {
void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint incx = *INCX;
@@ -115,7 +115,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -141,7 +141,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 0;
if (TransA == CblasConjTrans) trans = 1;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -183,7 +183,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/trmv.c b/interface/trmv.c
index ed23ced..2e52527 100644
--- a/interface/trmv.c
+++ b/interface/trmv.c
@@ -83,11 +83,11 @@ static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOA
void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint lda = *LDA;
blasint incx = *INCX;
@@ -135,7 +135,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -164,7 +164,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 0;
if (TransA == CblasConjTrans) trans = 1;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -208,7 +208,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -222,12 +222,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (nthreads == 1) {
#endif
-
+
(trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
-
+
#ifdef SMP
} else {
-
+
(trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads);
}
diff --git a/interface/trsm.c b/interface/trsm.c
index 5836ce2..2663729 100644
--- a/interface/trsm.c
+++ b/interface/trsm.c
@@ -87,18 +87,18 @@ static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA
TRSM_LTUU, TRSM_LTUN, TRSM_LTLU, TRSM_LTLN,
TRSM_LRUU, TRSM_LRUN, TRSM_LRLU, TRSM_LRLN,
TRSM_LCUU, TRSM_LCUN, TRSM_LCLU, TRSM_LCLN,
- TRSM_RNUU, TRSM_RNUN, TRSM_RNLU, TRSM_RNLN,
+ TRSM_RNUU, TRSM_RNUN, TRSM_RNLU, TRSM_RNLN,
TRSM_RTUU, TRSM_RTUN, TRSM_RTLU, TRSM_RTLN,
- TRSM_RRUU, TRSM_RRUN, TRSM_RRLU, TRSM_RRLN,
+ TRSM_RRUU, TRSM_RRUN, TRSM_RRLU, TRSM_RRLN,
TRSM_RCUU, TRSM_RCUN, TRSM_RCLU, TRSM_RCLN,
#else
TRMM_LNUU, TRMM_LNUN, TRMM_LNLU, TRMM_LNLN,
TRMM_LTUU, TRMM_LTUN, TRMM_LTLU, TRMM_LTLN,
TRMM_LRUU, TRMM_LRUN, TRMM_LRLU, TRMM_LRLN,
TRMM_LCUU, TRMM_LCUN, TRMM_LCLU, TRMM_LCLN,
- TRMM_RNUU, TRMM_RNUN, TRMM_RNLU, TRMM_RNLN,
+ TRMM_RNUU, TRMM_RNUN, TRMM_RNLU, TRMM_RNLN,
TRMM_RTUU, TRMM_RTUN, TRMM_RTLU, TRMM_RTLN,
- TRMM_RRUU, TRMM_RRUN, TRMM_RRLU, TRMM_RRLN,
+ TRMM_RRUU, TRMM_RRUN, TRMM_RRLU, TRMM_RRLN,
TRMM_RCUU, TRMM_RCUN, TRMM_RCLU, TRMM_RCLN,
#endif
};
@@ -108,7 +108,7 @@ static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA
void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG,
blasint *M, blasint *N, FLOAT *alpha,
FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB){
-
+
char side_arg = *SIDE;
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
@@ -127,7 +127,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG,
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -135,7 +135,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG,
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -182,7 +182,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
nrowa = args.m;
if (side & 1) nrowa = args.n;
@@ -201,10 +201,10 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
-void CNAME(enum CBLAS_ORDER order,
+void CNAME(enum CBLAS_ORDER order,
enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE Trans, enum CBLAS_DIAG Diag,
blasint m, blasint n,
@@ -231,7 +231,7 @@ void CNAME(enum CBLAS_ORDER order,
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -239,7 +239,7 @@ void CNAME(enum CBLAS_ORDER order,
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
#endif
@@ -269,7 +269,7 @@ void CNAME(enum CBLAS_ORDER order,
if (Side == CblasLeft) side = 0;
if (Side == CblasRight) side = 1;
-
+
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
@@ -282,15 +282,15 @@ void CNAME(enum CBLAS_ORDER order,
if (Trans == CblasConjNoTrans) trans = 2;
if (Trans == CblasConjTrans) trans = 3;
#endif
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
info = -1;
-
+
nrowa = args.m;
if (side & 1) nrowa = args.n;
-
+
if (args.ldb < MAX(1,args.m)) info = 11;
if (args.lda < MAX(1,nrowa)) info = 9;
if (args.n < 0) info = 6;
@@ -307,7 +307,7 @@ void CNAME(enum CBLAS_ORDER order,
if (Side == CblasLeft) side = 1;
if (Side == CblasRight) side = 0;
-
+
if (Uplo == CblasUpper) uplo = 1;
if (Uplo == CblasLower) uplo = 0;
@@ -320,15 +320,15 @@ void CNAME(enum CBLAS_ORDER order,
if (Trans == CblasConjNoTrans) trans = 2;
if (Trans == CblasConjTrans) trans = 3;
#endif
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
info = -1;
-
+
nrowa = args.m;
if (side & 1) nrowa = args.n;
-
+
if (args.ldb < MAX(1,args.m)) info = 11;
if (args.lda < MAX(1,nrowa)) info = 9;
if (args.n < 0) info = 6;
@@ -353,10 +353,10 @@ void CNAME(enum CBLAS_ORDER order,
FUNCTION_PROFILE_START();
buffer = (FLOAT *)blas_memory_alloc(0);
-
+
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
-
+
#ifdef SMP
mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT);
@@ -367,7 +367,7 @@ void CNAME(enum CBLAS_ORDER order,
#endif
(trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit])(&args, NULL, NULL, sa, sb, 0);
-
+
#ifdef SMP
} else {
if (!side) {
@@ -377,10 +377,10 @@ void CNAME(enum CBLAS_ORDER order,
}
}
#endif
-
+
blas_memory_free(buffer);
- FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
+ FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
(!side) ? args.m * (args.m + args.n) : args.n * (args.m + args.n),
(!side) ? args.m * args.m * args.n : args.m * args.n * args.n);
diff --git a/interface/trsv.c b/interface/trsv.c
index 8ef6998..a054d8e 100644
--- a/interface/trsv.c
+++ b/interface/trsv.c
@@ -68,11 +68,11 @@ static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = {
void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint lda = *LDA;
blasint incx = *INCX;
@@ -103,7 +103,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incx == 0) info = 8;
@@ -143,7 +143,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 0;
if (TransA == CblasConjTrans) trans = 1;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -187,7 +187,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/kernel/arm/zswap.c b/interface/zaxpby.c
similarity index 70%
copy from kernel/arm/zswap.c
copy to interface/zaxpby.c
index 4e3e73d..9e83244 100644
--- a/kernel/arm/zswap.c
+++ b/interface/zaxpby.c
@@ -25,46 +25,50 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
+/**********************************************************************
+ 2014/06/07 Saar
+
+**********************************************************************/
+
-#include "common.h"
#include <stdio.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#ifndef CBLAS
+
+void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
+{
+
+ blasint n = *N;
+ blasint incx = *INCX;
+ blasint incy = *INCY;
+
+#else
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy)
{
- BLASLONG i=0;
- BLASLONG ix=0,iy=0;
- FLOAT temp[2];
- if ( n < 0 ) return(0);
+#endif
- BLASLONG inc_x2 = 2 * inc_x;
- BLASLONG inc_y2 = 2 * inc_y;
+ if (n <= 0) return;
- while(i < n)
- {
+ FLOAT alpha_r = *(ALPHA + 0);
+ FLOAT alpha_i = *(ALPHA + 1);
+ FLOAT beta_r = *(BETA + 0);
+ FLOAT beta_i = *(BETA + 1);
- temp[0] = x[ix] ;
- temp[1] = x[ix+1] ;
- x[ix] = y[iy] ;
- x[ix+1] = y[iy+1] ;
- y[iy] = temp[0] ;
- y[iy+1] = temp[1] ;
+ FUNCTION_PROFILE_START();
- ix += inc_x2 ;
- iy += inc_y2 ;
- i++ ;
+ if (incx < 0) x -= (n - 1) * incx * 2;
+ if (incy < 0) y -= (n - 1) * incy * 2;
- }
- return(0);
+ AXPBY_K (n, alpha_r, alpha_i, x, incx, beta_r, beta_i, y, incy);
-}
-
+ FUNCTION_PROFILE_END(4, 2 * n, 2 * n);
+
+ return;
+}
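
The new interface/zaxpby.c above wires the ?axpby extension into the complex interfaces: it computes y := alpha*x + beta*y by delegating to the AXPBY_K kernel, after adjusting the start pointers for negative increments. A minimal reference sketch of the same operation for interleaved complex data (standalone illustration with hypothetical names, not OpenBLAS code; positive strides assumed for brevity):

    #include <stddef.h>

    /* Reference semantics of zaxpby: y := alpha*x + beta*y for n complex
       elements stored as interleaved (re,im) doubles; incx/incy are element
       strides, assumed positive here. */
    static void zaxpby_ref(size_t n, const double alpha[2],
                           const double *x, size_t incx,
                           const double beta[2], double *y, size_t incy)
    {
        for (size_t i = 0; i < n; i++) {
            const double *xp = x + 2 * i * incx;
            double *yp = y + 2 * i * incy;
            double ax_re = alpha[0] * xp[0] - alpha[1] * xp[1];
            double ax_im = alpha[0] * xp[1] + alpha[1] * xp[0];
            double by_re = beta[0] * yp[0] - beta[1] * yp[1];
            double by_im = beta[0] * yp[1] + beta[1] * yp[0];
            yp[0] = ax_re + by_re;
            yp[1] = ax_im + by_im;
        }
    }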
diff --git a/interface/zaxpy.c b/interface/zaxpy.c
index 9ed72ef..daa12ba 100644
--- a/interface/zaxpy.c
+++ b/interface/zaxpy.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
-
+
blasint n = *N;
blasint incx = *INCX;
blasint incy = *INCY;
@@ -96,18 +96,18 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
#else
AXPYC_K(n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0);
#endif
-
+
#ifdef SMP
} else {
-
+
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
-
+#endif
+
blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0,
#ifndef CONJ
(void *)AXPYU_K,
@@ -117,11 +117,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
nthreads);
}
#endif
-
+
FUNCTION_PROFILE_END(4, 2 * n, 2 * n);
IDEBUG_END;
return;
-
+
}
diff --git a/interface/zgbmv.c b/interface/zgbmv.c
index ae1fd24..a18cede 100644
--- a/interface/zgbmv.c
+++ b/interface/zgbmv.c
@@ -135,7 +135,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
if (n < 0) info = 3;
if (m < 0) info = 2;
if (i < 0) info = 1;
-
+
trans = i;
if (info != 0){
@@ -178,7 +178,7 @@ void CNAME(enum CBLAS_ORDER order,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 2;
if (TransA == CblasConjTrans) trans = 3;
-
+
info = -1;
if (incy == 0) info = 13;
@@ -234,7 +234,7 @@ void CNAME(enum CBLAS_ORDER order,
if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/zgemv.c b/interface/zgemv.c
index fb47842..fcc2fda 100644
--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@@ -53,11 +53,11 @@
#ifdef SMP
static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = {
#ifdef XDOUBLE
- xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, xgemv_thread_d,
+ xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, xgemv_thread_d,
#elif defined DOUBLE
- zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, zgemv_thread_d,
+ zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, zgemv_thread_d,
#else
- cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, cgemv_thread_d,
+ cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, cgemv_thread_d,
#endif
};
#endif
@@ -68,7 +68,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
FLOAT *ALPHA, FLOAT *a, blasint *LDA,
FLOAT *x, blasint *INCX,
FLOAT *BETA, FLOAT *y, blasint *INCY){
-
+
char trans = *TRANS;
blasint m = *M;
blasint n = *N;
@@ -86,7 +86,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
GEMV_N, GEMV_T, GEMV_R, GEMV_C,
GEMV_O, GEMV_U, GEMV_S, GEMV_D,
};
-
+
blasint info;
blasint lenx, leny;
blasint i;
@@ -169,7 +169,7 @@ void CNAME(enum CBLAS_ORDER order,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 2;
if (TransA == CblasConjTrans) trans = 3;
-
+
info = -1;
if (incy == 0) info = 11;
@@ -178,7 +178,7 @@ void CNAME(enum CBLAS_ORDER order,
if (n < 0) info = 3;
if (m < 0) info = 2;
if (trans < 0) info = 1;
-
+
}
if (order == CblasRowMajor) {
@@ -208,7 +208,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
-
+
/* Quick return if possible. */
if (m == 0 || n == 0) return;
@@ -237,13 +237,13 @@ void CNAME(enum CBLAS_ORDER order,
if (nthreads == 1) {
#endif
-
+
(gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer);
#ifdef SMP
-
+
} else {
-
+
(gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads);
}
diff --git a/interface/zger.c b/interface/zger.c
index ad52f40..f46a462 100644
--- a/interface/zger.c
+++ b/interface/zger.c
@@ -42,6 +42,13 @@
#include "functable.h"
#endif
+#ifdef SMP
+#ifdef __64BIT__
+#define SMPTEST 1
+#endif
+#endif
+
+
#ifdef XDOUBLE
#ifndef CONJ
#define ERROR_NAME "XGERU "
@@ -109,7 +116,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY;
blasint lda = *LDA;
FLOAT *buffer;
-#ifdef SMP
+#ifdef SMPTEST
int nthreads;
#endif
@@ -129,7 +136,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order,
@@ -144,7 +151,7 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint info, t;
-#ifdef SMP
+#ifdef SMPTEST
int nthreads;
#endif
@@ -195,7 +202,7 @@ void CNAME(enum CBLAS_ORDER order,
if (m == 0 || n == 0) return;
if ((alpha_r == 0.) && (alpha_i == 0.)) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -205,7 +212,7 @@ void CNAME(enum CBLAS_ORDER order,
buffer = (FLOAT *)blas_memory_alloc(1);
-#ifdef SMP
+#ifdef SMPTEST
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
@@ -221,7 +228,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
-#ifdef SMP
+#ifdef SMPTEST
} else {
@@ -245,5 +252,5 @@ void CNAME(enum CBLAS_ORDER order,
IDEBUG_END;
return;
-
+
}
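
In interface/zger.c the threaded path is now guarded by an SMPTEST macro that is defined only when both SMP and __64BIT__ are set by the build system, so the multi-threaded ?geru/?gerc dispatch is compiled out on 32-bit builds while the single-threaded kernel call stays in place. A standalone illustration of how that gate resolves (a sketch, not the OpenBLAS source; whether the macros are defined depends on the build flags):

    #include <stdio.h>

    /* SMP and __64BIT__ come from the OpenBLAS build system; the threaded
       zger path exists only when both are present. */
    #if defined(SMP) && defined(__64BIT__)
    #define SMPTEST 1
    #endif

    int main(void)
    {
    #ifdef SMPTEST
        puts("64-bit SMP build: zger may dispatch to ger_thread[]");
    #else
        puts("zger always takes the single-threaded kernel");
    #endif
        return 0;
    }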
diff --git a/interface/zhbmv.c b/interface/zhbmv.c
index 00ba915..8a16bbe 100644
--- a/interface/zhbmv.c
+++ b/interface/zhbmv.c
@@ -75,7 +75,7 @@ static int (*hbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLO
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
+void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){
char uplo_arg = *UPLO;
@@ -105,7 +105,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
if (uplo_arg == 'L') uplo = 1;
if (uplo_arg == 'V') uplo = 2;
if (uplo_arg == 'M') uplo = 3;
-
+
info = 0;
if (incy == 0) info = 11;
@@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order,
@@ -150,7 +150,7 @@ void CNAME(enum CBLAS_ORDER order,
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
-
+
info = -1;
if (incy == 0) info = 11;
diff --git a/interface/zhemv.c b/interface/zhemv.c
index 3cba445..c60eedc 100644
--- a/interface/zhemv.c
+++ b/interface/zhemv.c
@@ -53,7 +53,7 @@
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){
char uplo_arg = *UPLO;
@@ -92,7 +92,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
if (uplo_arg == 'L') uplo = 1;
if (uplo_arg == 'V') uplo = 2;
if (uplo_arg == 'M') uplo = 3;
-
+
info = 0;
if (incy == 0) info = 10;
@@ -108,7 +108,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
#else
-void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA,
+void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA,
FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy) {
FLOAT alpha_r = ALPHA[0];
diff --git a/interface/zher.c b/interface/zher.c
index ad982dd..9bedb01 100644
--- a/interface/zher.c
+++ b/interface/zher.c
@@ -75,7 +75,7 @@ static int (*her_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){
char uplo_arg = *UPLO;
@@ -98,7 +98,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (lda < MAX(1, n)) info = 7;
@@ -139,7 +139,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
if (incx == 0) info = 5;
if (n < 0) info = 2;
if (uplo < 0) info = 1;
-
+
}
if (order == CblasRowMajor) {
@@ -161,7 +161,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
}
#endif
-
+
if (n == 0) return;
if (alpha == ZERO) return;
@@ -173,7 +173,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
if (incx < 0 ) x -= (n - 1) * incx * 2;
buffer = (FLOAT *)blas_memory_alloc(1);
-
+
#ifdef SMP
nthreads = num_cpu_avail(2);
diff --git a/interface/zher2.c b/interface/zher2.c
index 88fecec..b342457 100644
--- a/interface/zher2.c
+++ b/interface/zher2.c
@@ -75,7 +75,7 @@ static int (*her2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASL
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){
char uplo_arg = *UPLO;
@@ -100,7 +100,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (lda < MAX(1, n)) info = 9;
@@ -113,7 +113,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) {
@@ -193,7 +193,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
} else {
(her2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads);
-
+
}
#endif
diff --git a/interface/zhpmv.c b/interface/zhpmv.c
index d7013e6..bab6e55 100644
--- a/interface/zhpmv.c
+++ b/interface/zhpmv.c
@@ -101,7 +101,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 9;
@@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order,
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
-
+
info = -1;
if (incy == 0) info = 9;
diff --git a/interface/zhpr.c b/interface/zhpr.c
index c48e352..5159ba7 100644
--- a/interface/zhpr.c
+++ b/interface/zhpr.c
@@ -75,7 +75,7 @@ static int (*hpr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *,
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *a){
char uplo_arg = *UPLO;
@@ -97,7 +97,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incx == 0) info = 5;
@@ -133,7 +133,7 @@ void CNAME(enum CBLAS_ORDER order,
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
-
+
info = -1;
if (incx == 0) info = 5;
@@ -158,7 +158,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
-
+
if (n == 0) return;
if (alpha == ZERO) return;
@@ -170,7 +170,7 @@ void CNAME(enum CBLAS_ORDER order,
if (incx < 0 ) x -= (n - 1) * incx * 2;
buffer = (FLOAT *)blas_memory_alloc(1);
-
+
#ifdef SMP
nthreads = num_cpu_avail(2);
diff --git a/interface/zhpr2.c b/interface/zhpr2.c
index cf1d5f9..1712e5d 100644
--- a/interface/zhpr2.c
+++ b/interface/zhpr2.c
@@ -75,7 +75,7 @@ static int (*hpr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASL
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){
char uplo_arg = *UPLO;
@@ -99,7 +99,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 7;
@@ -111,7 +111,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order,
@@ -139,7 +139,7 @@ void CNAME(enum CBLAS_ORDER order,
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
-
+
info = -1;
if (incy == 0) info = 7;
@@ -188,7 +188,7 @@ void CNAME(enum CBLAS_ORDER order,
#endif
(hpr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer);
-
+
#ifdef SMP
} else {
diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c
new file mode 100644
index 0000000..79af6d7
--- /dev/null
+++ b/interface/zimatcopy.c
@@ -0,0 +1,185 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/***********************************************************
+ * 2014/06/10 Saar
+***********************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#if defined(DOUBLE)
+#define ERROR_NAME "ZIMATCOPY"
+#else
+#define ERROR_NAME "CIMATCOPY"
+#endif
+
+#define BlasRowMajor 0
+#define BlasColMajor 1
+#define BlasNoTrans 0
+#define BlasTrans 1
+#define BlasTransConj 2
+#define BlasConj 3
+
+void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb)
+{
+
+ char Order, Trans;
+ int order=-1,trans=-1;
+ blasint info = -1;
+ FLOAT *b;
+ size_t msize;
+
+ Order = *ORDER;
+ Trans = *TRANS;
+
+ TOUPPER(Order);
+ TOUPPER(Trans);
+
+ if ( Order == 'C' ) order = BlasColMajor;
+ if ( Order == 'R' ) order = BlasRowMajor;
+ if ( Trans == 'N' ) trans = BlasNoTrans;
+ if ( Trans == 'T' ) trans = BlasTrans;
+ if ( Trans == 'C' ) trans = BlasTransConj;
+ if ( Trans == 'R' ) trans = BlasConj;
+
+ if ( order == BlasColMajor)
+ {
+ if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
+ if ( trans == BlasConj && *ldb < *rows ) info = 9;
+ if ( trans == BlasTrans && *ldb < *cols ) info = 9;
+ if ( trans == BlasTransConj && *ldb < *cols ) info = 9;
+ }
+ if ( order == BlasRowMajor)
+ {
+ if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
+ if ( trans == BlasConj && *ldb < *cols ) info = 9;
+ if ( trans == BlasTrans && *ldb < *rows ) info = 9;
+ if ( trans == BlasTransConj && *ldb < *rows ) info = 9;
+ }
+
+ if ( order == BlasColMajor && *lda < *rows ) info = 7;
+ if ( order == BlasRowMajor && *lda < *cols ) info = 7;
+ if ( *cols <= 0 ) info = 4;
+ if ( *rows <= 0 ) info = 3;
+ if ( trans < 0 ) info = 2;
+ if ( order < 0 ) info = 1;
+
+ if (info >= 0) {
+ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+ return;
+ }
+
+ if ( *lda > *ldb )
+ msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2;
+ else
+ msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2;
+
+ b = malloc(msize);
+ if ( b == NULL )
+ {
+ printf("Memory alloc failed\n");
+ exit(1);
+ }
+
+
+ if ( order == BlasColMajor )
+ {
+
+ if ( trans == BlasNoTrans )
+ {
+ OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
+ free(b);
+ return;
+ }
+ if ( trans == BlasConj )
+ {
+ OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
+ free(b);
+ return;
+ }
+ if ( trans == BlasTrans )
+ {
+ OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
+ free(b);
+ return;
+ }
+ if ( trans == BlasTransConj )
+ {
+ OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
+ free(b);
+ return;
+ }
+
+ }
+ else
+ {
+
+ if ( trans == BlasNoTrans )
+ {
+ OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
+ free(b);
+ return;
+ }
+ if ( trans == BlasConj )
+ {
+ OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
+ free(b);
+ return;
+ }
+ if ( trans == BlasTrans )
+ {
+ OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
+ free(b);
+ return;
+ }
+ if ( trans == BlasTransConj )
+ {
+ OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
+ free(b);
+ return;
+ }
+
+ }
+
+ return;
+
+}
+
+
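
The new interface/zimatcopy.c adds the in-place variant of the complex ?omatcopy extension: A is overwritten with alpha*op(A), with op (none, transpose, conjugate, conjugate transpose) and the storage order chosen by the TRANS and ORDER characters. Because the operation is in place, the routine allocates a scratch matrix, runs the out-of-place OMATCOPY_K_* kernel into it, and copies the result back with an identity pass. A rough standalone sketch of that two-pass idea for the column-major, plain-transpose, square case (helper name and simplified scratch sizing are illustrative only):

    #include <stdlib.h>
    #include <string.h>

    /* In-place scaled transpose of an n x n column-major complex matrix,
       done the zimatcopy way: transform into a scratch buffer, then copy
       back. Interleaved (re,im) doubles; returns 0 on success. */
    static int imatcopy_transpose_sketch(int n, const double alpha[2],
                                         double *a, int lda)
    {
        double *b = malloc((size_t)n * n * 2 * sizeof(double));
        if (b == NULL) return -1;

        for (int j = 0; j < n; j++)          /* b = alpha * a^T */
            for (int i = 0; i < n; i++) {
                const double *s = a + 2 * ((size_t)j * lda + i);
                double *d = b + 2 * ((size_t)i * n + j);
                d[0] = alpha[0] * s[0] - alpha[1] * s[1];
                d[1] = alpha[0] * s[1] + alpha[1] * s[0];
            }

        for (int j = 0; j < n; j++)          /* copy the result back into a */
            memcpy(a + 2 * (size_t)j * lda, b + 2 * (size_t)j * n,
                   2 * (size_t)n * sizeof(double));

        free(b);
        return 0;
    }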
diff --git a/interface/zomatcopy.c b/interface/zomatcopy.c
new file mode 100644
index 0000000..eec4d3c
--- /dev/null
+++ b/interface/zomatcopy.c
@@ -0,0 +1,154 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/***********************************************************
+ * 2014/06/09 Saar
+***********************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#if defined(DOUBLE)
+#define ERROR_NAME "ZOMATCOPY"
+#else
+#define ERROR_NAME "COMATCOPY"
+#endif
+
+#define BlasRowMajor 0
+#define BlasColMajor 1
+#define BlasNoTrans 0
+#define BlasTrans 1
+#define BlasTransConj 2
+#define BlasConj 3
+
+void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb)
+{
+
+ char Order, Trans;
+ int order=-1,trans=-1;
+ blasint info = -1;
+
+ Order = *ORDER;
+ Trans = *TRANS;
+
+ TOUPPER(Order);
+ TOUPPER(Trans);
+
+ if ( Order == 'C' ) order = BlasColMajor;
+ if ( Order == 'R' ) order = BlasRowMajor;
+ if ( Trans == 'N' ) trans = BlasNoTrans;
+ if ( Trans == 'T' ) trans = BlasTrans;
+ if ( Trans == 'C' ) trans = BlasTransConj;
+ if ( Trans == 'R' ) trans = BlasConj;
+
+ if ( order == BlasColMajor)
+ {
+ if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
+ if ( trans == BlasConj && *ldb < *rows ) info = 9;
+ if ( trans == BlasTrans && *ldb < *cols ) info = 9;
+ if ( trans == BlasTransConj && *ldb < *cols ) info = 9;
+ }
+ if ( order == BlasRowMajor)
+ {
+ if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
+ if ( trans == BlasConj && *ldb < *cols ) info = 9;
+ if ( trans == BlasTrans && *ldb < *rows ) info = 9;
+ if ( trans == BlasTransConj && *ldb < *rows ) info = 9;
+ }
+
+ if ( order == BlasColMajor && *lda < *rows ) info = 7;
+ if ( order == BlasRowMajor && *lda < *cols ) info = 7;
+ if ( *cols <= 0 ) info = 4;
+ if ( *rows <= 0 ) info = 3;
+ if ( trans < 0 ) info = 2;
+ if ( order < 0 ) info = 1;
+
+ if (info >= 0) {
+ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+ return;
+ }
+
+ if ( order == BlasColMajor )
+ {
+
+ if ( trans == BlasNoTrans )
+ {
+ OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ return;
+ }
+ if ( trans == BlasConj )
+ {
+ OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ return;
+ }
+ if ( trans == BlasTrans )
+ {
+ OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ return;
+ }
+ if ( trans == BlasTransConj )
+ {
+ OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ return;
+ }
+
+ }
+ else
+ {
+
+ if ( trans == BlasNoTrans )
+ {
+ OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ return;
+ }
+ if ( trans == BlasConj )
+ {
+ OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ return;
+ }
+ if ( trans == BlasTrans )
+ {
+ OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ return;
+ }
+ if ( trans == BlasTransConj )
+ {
+ OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
+ return;
+ }
+
+ }
+
+ return;
+
+}
+
+
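
interface/zomatcopy.c is the out-of-place counterpart: B := alpha*op(A), with op again chosen by TRANS ('N' copy, 'T' transpose, 'C' conjugate transpose, 'R' conjugation) and the argument checks mirroring zimatcopy minus the scratch buffer. For reference, an element-wise sketch of the column-major conjugate-transpose case that OMATCOPY_K_CTC realises (a naive loop, not the tuned kernel):

    #include <stddef.h>

    /* Stores alpha * conj(a(i,j)) at b(j,i): A is rows x cols column-major
       with leading dimension lda, B is cols x rows column-major with leading
       dimension ldb; interleaved (re,im) doubles. */
    static void omatcopy_ctc_ref(int rows, int cols, const double alpha[2],
                                 const double *a, int lda, double *b, int ldb)
    {
        for (int j = 0; j < cols; j++)
            for (int i = 0; i < rows; i++) {
                const double *s = a + 2 * ((size_t)j * lda + i);  /* a(i,j) */
                double *d = b + 2 * ((size_t)i * ldb + j);        /* b(j,i) */
                double re = s[0], im = -s[1];                     /* conj   */
                d[0] = alpha[0] * re - alpha[1] * im;
                d[1] = alpha[0] * im + alpha[1] * re;
            }
    }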
diff --git a/interface/zrot.c b/interface/zrot.c
index f18bbc6..1c45f68 100644
--- a/interface/zrot.c
+++ b/interface/zrot.c
@@ -43,7 +43,7 @@
#endif
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){
-
+
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
@@ -68,5 +68,5 @@ void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C
IDEBUG_END;
return;
-
+
}
diff --git a/interface/zsbmv.c b/interface/zsbmv.c
index 6d445d7..b71d4c5 100644
--- a/interface/zsbmv.c
+++ b/interface/zsbmv.c
@@ -43,6 +43,14 @@
#include "functable.h"
#endif
+/*
+#ifdef SMP
+#ifdef __64BIT__
+#define SMPTEST 1
+#endif
+#endif
+*/
+
#ifdef XDOUBLE
#define ERROR_NAME "XSBMV "
#elif defined(DOUBLE)
@@ -61,7 +69,7 @@ static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT
#endif
};
-#ifdef SMPBUG
+#ifdef SMPTEST
static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = {
#ifdef XDOUBLE
xsbmv_thread_U, xsbmv_thread_L,
@@ -73,7 +81,7 @@ static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLO
};
#endif
-void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
+void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){
char uplo_arg = *UPLO;
@@ -90,7 +98,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
blasint info;
int uplo;
FLOAT *buffer;
-#ifdef SMPBUG
+#ifdef SMPTEST
int nthreads;
#endif
@@ -101,7 +109,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 11;
@@ -115,7 +123,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
@@ -123,7 +131,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
IDEBUG_START;
-
+
FUNCTION_PROFILE_START();
if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE;
@@ -131,7 +139,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
buffer = (FLOAT *)blas_memory_alloc(1);
-#ifdef SMPBUG
+#ifdef SMPTEST
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
@@ -139,7 +147,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *
(sbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer);
-#ifdef SMPBUG
+#ifdef SMPTEST
} else {
(sbmv_thread[uplo])(n, k, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads);
diff --git a/interface/zscal.c b/interface/zscal.c
index ad99874..507d649 100644
--- a/interface/zscal.c
+++ b/interface/zscal.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){
-
+
blasint n = *N;
blasint incx = *INCX;
@@ -90,6 +90,9 @@ void CNAME(blasint n, FLOAT alpha_r, FLOAT *x, blasint incx){
#ifdef SMP
nthreads = num_cpu_avail(1);
+ if ( n <= 1048576 )
+ nthreads = 1;
+
if (nthreads == 1) {
#endif
@@ -101,8 +104,8 @@ void CNAME(blasint n, FLOAT alpha_r, FLOAT *x, blasint incx){
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
-
+#endif
+
blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads);
}
@@ -113,5 +116,5 @@ void CNAME(blasint n, FLOAT alpha_r, FLOAT *x, blasint incx){
IDEBUG_END;
return;
-
+
}
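
interface/zscal.c gains a simple size cutoff: vectors of up to 1048576 complex elements are forced onto a single thread, so the threading overhead is only paid on large problems. A hedged sketch of that dispatch heuristic (standalone; the threshold is taken from the hunk above, the helper name is hypothetical and stands in for num_cpu_avail):

    /* Pick the thread count for zscal: small vectors stay single-threaded,
       larger ones may fan out to the available CPUs. Illustrative only. */
    static int choose_zscal_threads(long n, int cpus_available)
    {
        int nthreads = cpus_available;
        if (n <= 1048576)   /* threshold used by interface/zscal.c */
            nthreads = 1;
        return nthreads;
    }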
diff --git a/interface/zspmv.c b/interface/zspmv.c
index 6555087..be11463 100644
--- a/interface/zspmv.c
+++ b/interface/zspmv.c
@@ -99,7 +99,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 9;
@@ -111,7 +111,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
diff --git a/interface/zspr.c b/interface/zspr.c
index 0021bcd..574b59a 100644
--- a/interface/zspr.c
+++ b/interface/zspr.c
@@ -73,7 +73,7 @@ static int (*spr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, FLOAT
};
#endif
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *a){
char uplo_arg = *UPLO;
@@ -96,7 +96,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incx == 0) info = 5;
@@ -107,7 +107,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
if (n == 0) return;
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
@@ -125,9 +125,9 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (nthreads == 1) {
#endif
-
+
(spr[uplo])(n, alpha_r, alpha_i, x, incx, a, buffer);
-
+
#ifdef SMP
} else {
diff --git a/interface/zspr2.c b/interface/zspr2.c
index b54e165..44c36d5 100644
--- a/interface/zspr2.c
+++ b/interface/zspr2.c
@@ -73,7 +73,7 @@ static int (*spr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASL
};
#endif
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){
char uplo_arg = *UPLO;
@@ -97,7 +97,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 7;
@@ -109,7 +109,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
if (n == 0) return;
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
@@ -128,12 +128,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (nthreads == 1) {
#endif
-
+
(spr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer);
-
+
#ifdef SMP
} else {
-
+
(spr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, buffer, nthreads);
}
diff --git a/interface/zswap.c b/interface/zswap.c
index 06a8892..fc62f73 100644
--- a/interface/zswap.c
+++ b/interface/zswap.c
@@ -45,7 +45,7 @@
#ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
-
+
blasint n = *N;
blasint incx = *INCX;
blasint incy = *INCY;
@@ -99,8 +99,8 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
-
+#endif
+
blas_level1_thread(mode, n, 0, 0, dummyalpha,
x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads);
@@ -112,5 +112,5 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
IDEBUG_END;
return;
-
+
}
diff --git a/interface/zsymv.c b/interface/zsymv.c
index afb2c17..1d6ff1f 100644
--- a/interface/zsymv.c
+++ b/interface/zsymv.c
@@ -51,7 +51,7 @@
#define ERROR_NAME "CSYMV "
#endif
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){
char uplo_arg = *UPLO;
@@ -88,7 +88,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incy == 0) info = 10;
@@ -101,7 +101,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
if (n == 0) return;
if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
@@ -127,12 +127,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
#ifdef SMP
} else {
-
+
(symv_thread[uplo])(n, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads);
-
+
}
#endif
-
+
blas_memory_free(buffer);
FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n);
diff --git a/interface/zsyr.c b/interface/zsyr.c
index b6b5202..5d62e87 100644
--- a/interface/zsyr.c
+++ b/interface/zsyr.c
@@ -76,7 +76,7 @@ static int (*syr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLO
#ifndef CBLAS
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){
char uplo_arg = *UPLO;
@@ -100,7 +100,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (lda < MAX(1, n)) info = 7;
@@ -112,7 +112,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
@@ -142,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
if (incx == 0) info = 5;
if (n < 0) info = 2;
if (uplo < 0) info = 1;
-
+
}
if (order == CblasRowMajor) {
diff --git a/interface/zsyr2.c b/interface/zsyr2.c
index 0c705cb..7c81c20 100644
--- a/interface/zsyr2.c
+++ b/interface/zsyr2.c
@@ -73,7 +73,7 @@ static int (*syr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASL
};
#endif
-void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
+void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){
char uplo_arg = *UPLO;
@@ -98,7 +98,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (lda < MAX(1, n)) info = 9;
@@ -111,7 +111,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
if (n == 0) return;
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
diff --git a/interface/ztbmv.c b/interface/ztbmv.c
index 85f53c4..0b62431 100644
--- a/interface/ztbmv.c
+++ b/interface/ztbmv.c
@@ -94,13 +94,13 @@ static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLAS
#ifndef CBLAS
void NAME(char *UPLO, char *TRANS, char *DIAG,
- blasint *N, blasint *K,
+ blasint *N, blasint *K,
FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint k = *K;
blasint lda = *LDA;
@@ -150,7 +150,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -179,7 +179,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 2;
if (TransA == CblasConjTrans) trans = 3;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/ztbsv.c b/interface/ztbsv.c
index 3846a4b..8afd2af 100644
--- a/interface/ztbsv.c
+++ b/interface/ztbsv.c
@@ -73,13 +73,13 @@ static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, v
#ifndef CBLAS
void NAME(char *UPLO, char *TRANS, char *DIAG,
- blasint *N, blasint *K,
+ blasint *N, blasint *K,
FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint k = *K;
blasint lda = *LDA;
@@ -126,7 +126,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -152,7 +152,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 2;
if (TransA == CblasConjTrans) trans = 3;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -198,7 +198,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/ztpmv.c b/interface/ztpmv.c
index 2f9c48f..f9dfa75 100644
--- a/interface/ztpmv.c
+++ b/interface/ztpmv.c
@@ -95,7 +95,7 @@ static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int)
void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
@@ -132,7 +132,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
if (incx == 0) info = 7;
@@ -145,7 +145,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -174,7 +174,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 2;
if (TransA == CblasConjTrans) trans = 3;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/ztpsv.c b/interface/ztpsv.c
index fde500e..c63e4d0 100644
--- a/interface/ztpsv.c
+++ b/interface/ztpsv.c
@@ -74,11 +74,11 @@ static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = {
void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint incx = *INCX;
@@ -121,7 +121,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -147,7 +147,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 2;
if (TransA == CblasConjTrans) trans = 3;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -189,7 +189,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/interface/ztrmv.c b/interface/ztrmv.c
index 5a18a85..1abaac9 100644
--- a/interface/ztrmv.c
+++ b/interface/ztrmv.c
@@ -95,7 +95,7 @@ static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOA
void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
@@ -147,7 +147,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
@@ -176,7 +176,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 2;
if (TransA == CblasConjTrans) trans = 3;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -220,7 +220,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
@@ -239,9 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#ifdef SMP
} else {
-
+
(trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads);
-
+
}
#endif
diff --git a/interface/ztrsv.c b/interface/ztrsv.c
index 08f7dc6..ceac172 100644
--- a/interface/ztrsv.c
+++ b/interface/ztrsv.c
@@ -74,11 +74,11 @@ static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = {
void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){
-
+
char uplo_arg = *UPLO;
char trans_arg = *TRANS;
char diag_arg = *DIAG;
-
+
blasint n = *N;
blasint lda = *LDA;
blasint incx = *INCX;
@@ -109,7 +109,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
-
+
info = 0;
@@ -124,7 +124,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
-
+
#else
@@ -151,7 +151,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (TransA == CblasTrans) trans = 1;
if (TransA == CblasConjNoTrans) trans = 2;
if (TransA == CblasConjTrans) trans = 3;
-
+
if (Diag == CblasUnit) unit = 0;
if (Diag == CblasNonUnit) unit = 1;
@@ -195,7 +195,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if (n == 0) return;
-
+
IDEBUG_START;
FUNCTION_PROFILE_START();
diff --git a/kernel/Makefile b/kernel/Makefile
index 55edcd2..a0a8fcd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -53,9 +53,9 @@ SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX)
CCOMMON_OPT += -DTS=$(TSUFFIX)
endif
-KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h
+KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h
ifneq ($(NO_LAPACK), 1)
-KERNEL_INTERFACE += ../common_lapack.h
+KERNEL_INTERFACE += ../common_lapack.h
endif
ifeq ($(ARCH), x86)
@@ -93,7 +93,7 @@ setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
setparam$(TSUFFIX).c : setparam-ref.c
sed 's/TS/$(TSUFFIX)/g' $< > $(@F)
-kernel$(TSUFFIX).h : $(KERNEL_INTERFACE)
+kernel$(TSUFFIX).h : $(KERNEL_INTERFACE)
sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F)
diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1
index 667145c..7c7cb27 100644
--- a/kernel/Makefile.L1
+++ b/kernel/Makefile.L1
@@ -432,18 +432,38 @@ ifndef LSAME_KERNEL
LSAME_KERNEL = lsame.S
endif
+### AXPBY ###
+
+ifndef SAXPBYKERNEL
+SAXPBYKERNEL = ../arm/axpby.c
+endif
+
+ifndef DAXPBYKERNEL
+DAXPBYKERNEL = ../arm/axpby.c
+endif
+
+ifndef CAXPBYKERNEL
+CAXPBYKERNEL = ../arm/zaxpby.c
+endif
+
+ifndef ZAXPBYKERNEL
+ZAXPBYKERNEL = ../arm/zaxpby.c
+endif
+
SBLASOBJS += \
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
- snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX)
+ snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
+ saxpby_k$(TSUFFIX).$(SUFFIX)
DBLASOBJS += \
damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
- dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX)
+ dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
+ daxpby_k$(TSUFFIX).$(SUFFIX)
QBLASOBJS += \
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
@@ -455,13 +475,13 @@ CBLASOBJS += \
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
- cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX)
+ cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX)
ZBLASOBJS += \
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
- zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX)
+ zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX)
XBLASOBJS += \
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
@@ -474,294 +494,320 @@ XBLASOBJS += \
-$(KDIR)samax_k$(TSUFFIX).$(SUFFIX) $(KDIR)samax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMAXKERNEL)
+$(KDIR)samax_k$(TSUFFIX).$(SUFFIX) $(KDIR)samax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)damax_k$(TSUFFIX).$(SUFFIX) $(KDIR)damax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMAXKERNEL)
+$(KDIR)damax_k$(TSUFFIX).$(SUFFIX) $(KDIR)damax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)qamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMAXKERNEL)
+$(KDIR)qamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)camax_k$(TSUFFIX).$(SUFFIX) $(KDIR)camax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMAXKERNEL)
+$(KDIR)camax_k$(TSUFFIX).$(SUFFIX) $(KDIR)camax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMAXKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)zamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMAXKERNEL)
+$(KDIR)zamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMAXKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)xamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMAXKERNEL)
+$(KDIR)xamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMAXKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
### AMIN ###
-$(KDIR)samin_k$(TSUFFIX).$(SUFFIX) $(KDIR)samin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMINKERNEL)
+$(KDIR)samin_k$(TSUFFIX).$(SUFFIX) $(KDIR)samin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)damin_k$(TSUFFIX).$(SUFFIX) $(KDIR)damin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMINKERNEL)
+$(KDIR)damin_k$(TSUFFIX).$(SUFFIX) $(KDIR)damin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)qamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMINKERNEL)
+$(KDIR)qamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)camin_k$(TSUFFIX).$(SUFFIX) $(KDIR)camin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMINKERNEL)
+$(KDIR)camin_k$(TSUFFIX).$(SUFFIX) $(KDIR)camin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMINKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)zamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMINKERNEL)
+$(KDIR)zamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMINKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)xamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMINKERNEL)
+$(KDIR)xamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMINKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
### MAX ###
-$(KDIR)smax_k$(TSUFFIX).$(SUFFIX) $(KDIR)smax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMAXKERNEL)
+$(KDIR)smax_k$(TSUFFIX).$(SUFFIX) $(KDIR)smax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)dmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMAXKERNEL)
+$(KDIR)dmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)qmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMAXKERNEL)
+$(KDIR)qmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@
### MIN ###
-$(KDIR)smin_k$(TSUFFIX).$(SUFFIX) $(KDIR)smin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMINKERNEL)
+$(KDIR)smin_k$(TSUFFIX).$(SUFFIX) $(KDIR)smin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)dmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMINKERNEL)
+$(KDIR)dmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)qmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMINKERNEL)
+$(KDIR)qmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
### IAMAX ###
-$(KDIR)isamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMAXKERNEL)
+$(KDIR)isamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)idamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMAXKERNEL)
+$(KDIR)idamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)iqamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMAXKERNEL)
+$(KDIR)iqamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)icamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMAXKERNEL)
+$(KDIR)icamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMAXKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)izamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMAXKERNEL)
+$(KDIR)izamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMAXKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)ixamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMAXKERNEL)
+$(KDIR)ixamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMAXKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@
### IAMIN ###
-$(KDIR)isamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMINKERNEL)
+$(KDIR)isamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)idamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMINKERNEL)
+$(KDIR)idamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)iqamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMINKERNEL)
+$(KDIR)iqamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)icamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMINKERNEL)
+$(KDIR)icamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMINKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)izamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMINKERNEL)
+$(KDIR)izamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMINKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)ixamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMINKERNEL)
+$(KDIR)ixamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMINKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@
### IMAX ###
-$(KDIR)ismax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMAXKERNEL)
+$(KDIR)ismax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)idmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMAXKERNEL)
+$(KDIR)idmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@
-$(KDIR)iqmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMAXKERNEL)
+$(KDIR)iqmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMAXKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@
### IMIN ###
-$(KDIR)ismin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMINKERNEL)
+$(KDIR)ismin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMINKERNEL)
+$(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
+$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
-$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
+$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)dasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DASUMKERNEL)
+$(KDIR)dasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DASUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)qasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QASUMKERNEL)
+$(KDIR)qasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QASUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)casum_k$(TSUFFIX).$(SUFFIX) $(KDIR)casum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CASUMKERNEL)
+$(KDIR)casum_k$(TSUFFIX).$(SUFFIX) $(KDIR)casum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CASUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZASUMKERNEL)
+$(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZASUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
+$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
+$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL)
+$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL)
+$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
+$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
-$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
+$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
-$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
+$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@
-$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
+$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@
-$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
+$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@
-$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
+$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@
-$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL)
+$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
-$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL)
+$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
-$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL)
+$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
-$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL)
+$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
-$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL)
+$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
-$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL)
+$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
-$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL)
+$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
+$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
+$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
+
+ifdef DSDOTKERNEL
+
+$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
-$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
- $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
+$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL)
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
+
+else
-$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
+$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
-$(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL)
+$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
+
+endif
+
+$(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@
-$(KDIR)zdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL)
+$(KDIR)zdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ $< -o $@
-$(KDIR)xdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL)
+$(KDIR)xdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ $< -o $@
-$(KDIR)xdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL)
+$(KDIR)xdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ $< -o $@
-$(KDIR)cdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL)
+$(KDIR)cdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ $< -o $@
-$(KDIR)cdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL)
+$(KDIR)cdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ $< -o $@
-$(KDIR)snrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)snrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SNRM2KERNEL)
+$(KDIR)snrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)snrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SNRM2KERNEL)
$(CC) $(CFLAGS) -UCOMPLEX -c -UDOUBLE $< -o $@
-$(KDIR)dnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)dnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DNRM2KERNEL)
+$(KDIR)dnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)dnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DNRM2KERNEL)
$(CC) $(CFLAGS) -UCOMPLEX -c -DDOUBLE $< -o $@
-$(KDIR)qnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)qnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QNRM2KERNEL)
+$(KDIR)qnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)qnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QNRM2KERNEL)
$(CC) $(CFLAGS) -UCOMPLEX -c -DXDOUBLE $< -o $@
-$(KDIR)cnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)cnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CNRM2KERNEL)
+$(KDIR)cnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)cnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CNRM2KERNEL)
$(CC) $(CFLAGS) -DCOMPLEX -c -UDOUBLE $< -o $@
-$(KDIR)znrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)znrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZNRM2KERNEL)
+$(KDIR)znrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)znrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZNRM2KERNEL)
$(CC) $(CFLAGS) -DCOMPLEX -c -DDOUBLE $< -o $@
-$(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XNRM2KERNEL)
+$(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XNRM2KERNEL)
$(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@
-$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL)
+$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL)
+$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL)
+$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL)
+$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)zdrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZROTKERNEL)
+$(KDIR)zdrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZROTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)xqrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)xqrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XROTKERNEL)
+$(KDIR)xqrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)xqrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XROTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)sscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)sscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSCALKERNEL)
+$(KDIR)sscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)sscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSCALKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)dscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)dscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSCALKERNEL)
+$(KDIR)dscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)dscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSCALKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)qscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)qscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSCALKERNEL)
+$(KDIR)qscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)qscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSCALKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)cscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)cscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSCALKERNEL)
+$(KDIR)cscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)cscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSCALKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)zscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)zscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSCALKERNEL)
+$(KDIR)zscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)zscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSCALKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)xscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)xscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSCALKERNEL)
+$(KDIR)xscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)xscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSCALKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)sswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)sswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSWAPKERNEL)
+$(KDIR)sswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)sswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSWAPKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)dswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)dswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSWAPKERNEL)
+$(KDIR)dswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)dswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSWAPKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)qswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)qswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSWAPKERNEL)
+$(KDIR)qswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)qswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSWAPKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)cswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)cswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSWAPKERNEL)
+$(KDIR)cswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)cswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSWAPKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@
-$(KDIR)zswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)zswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSWAPKERNEL)
+$(KDIR)zswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)zswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSWAPKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@
-$(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSWAPKERNEL)
+$(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSWAPKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
+
+$(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL)
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
+
+$(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL)
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
+
+$(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL)
+ $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
+
+$(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL)
+ $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
+
+
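
[Note on the Makefile.L1 additions above: they wire up the new ?axpby extension kernels (y := alpha*x + beta*y) for all four precisions, with generic ../arm/axpby.c and ../arm/zaxpby.c fallbacks, and split the mixed-precision dsdot/sdsdot builds behind an optional DSDOTKERNEL. A minimal usage sketch for the new extension follows; the cblas_daxpby prototype is assumed from the Fortran-style daxpby(n, alpha, x, incx, beta, y, incy) argument order and may differ in detail.]

  /* Minimal sketch of the ?axpby extension built by the new rules above:
   * y := alpha*x + beta*y.  The cblas_daxpby prototype is an assumption. */
  #include <stdio.h>
  #include <cblas.h>

  int main(void) {
      double x[4] = {1.0, 2.0, 3.0, 4.0};
      double y[4] = {10.0, 10.0, 10.0, 10.0};

      /* y = 2*x + 0.5*y  ->  {7, 9, 11, 13} */
      cblas_daxpby(4, 2.0, x, 1, 0.5, y, 1);

      for (int i = 0; i < 4; i++) printf("%g ", y[i]);
      printf("\n");
      return 0;
  }
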
diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2
index ae46411..2aeb8f0 100644
--- a/kernel/Makefile.L2
+++ b/kernel/Makefile.L2
@@ -219,210 +219,210 @@ XBLASOBJS += \
xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \
xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX)
-$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@
-$(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DTRANS $< -o $@
-$(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@
-
-$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+
+$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
-$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL)
+$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -UTRANS $< -o $@
-
-$(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL)
+
+$(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DTRANS $< -o $@
-$(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@
-$(KDIR)cgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)cgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@
-$(KDIR)cgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)cgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@
-$(KDIR)cgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)cgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@
-$(KDIR)cgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)cgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@
-$(KDIR)cgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)cgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@
-$(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@
-$(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@
-$(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@
-$(KDIR)zgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)zgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@
-$(KDIR)zgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)zgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@
-$(KDIR)zgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)zgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@
-$(KDIR)zgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)zgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@
-$(KDIR)zgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)zgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@
-$(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@
-$(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
+$(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@
-$(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
+$(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@
-$(KDIR)xgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
+$(KDIR)xgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@
-$(KDIR)xgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
+$(KDIR)xgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@
-$(KDIR)xgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
+$(KDIR)xgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@
-$(KDIR)xgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
+$(KDIR)xgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@
-$(KDIR)xgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
+$(KDIR)xgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@
-$(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
+$(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@
-$(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
+$(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@
-$(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM)
+$(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@
-$(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM)
+$(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $@
-$(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM)
+$(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $@
-$(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM)
+$(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $@
-$(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL)
+$(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $@
-$(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL)
+$(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $@
-$(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM)
+$(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $@
-$(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM)
+$(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $@
-$(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM)
+$(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $@
-$(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM)
+$(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $@
-$(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL)
+$(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $@
-$(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL)
+$(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $@
-$(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM)
+$(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM)
$(CC) -c $(CFLAGS) -UDOUBLE $< -o $@
-$(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM)
+$(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM)
$(CC) -c $(CFLAGS) -DDOUBLE $< -o $@
-$(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM)
+$(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE $< -o $@
-$(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM)
+$(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM)
$(CC) -c $(CFLAGS) -UDOUBLE -UCONJ $< -o $@
-$(KDIR)cgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM)
+$(KDIR)cgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM)
$(CC) -c $(CFLAGS) -UDOUBLE -DCONJ $< -o $@
-$(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM)
+$(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM)
$(CC) -c $(CFLAGS) -UDOUBLE -UCONJ -DXCONJ $< -o $@
-$(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM)
+$(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM)
$(CC) -c $(CFLAGS) -UDOUBLE -DCONJ -DXCONJ $< -o $@
-$(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM)
+$(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM)
$(CC) -c $(CFLAGS) -DDOUBLE -UCONJ $< -o $@
-$(KDIR)zgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM)
+$(KDIR)zgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM)
$(CC) -c $(CFLAGS) -DDOUBLE -DCONJ $< -o $@
-$(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM)
+$(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM)
$(CC) -c $(CFLAGS) -DDOUBLE -UCONJ -DXCONJ $< -o $@
-$(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM)
+$(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM)
$(CC) -c $(CFLAGS) -DDOUBLE -DCONJ -DXCONJ $< -o $@
-$(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM)
+$(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ $< -o $@
-$(KDIR)xgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
+$(KDIR)xgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ $< -o $@
-$(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM)
+$(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@
-$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
+$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@
-$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM)
+$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@
-$(KDIR)chemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_L_KERNEL) $(CHEMV_L_PARAM)
+$(KDIR)chemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_L_KERNEL) $(CHEMV_L_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $@
-$(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_V_KERNEL) $(CHEMV_U_PARAM) ../symcopy.h
+$(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_V_KERNEL) $(CHEMV_U_PARAM) ../symcopy.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@
-$(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h
+$(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
-$(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM)
+$(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $@
-$(KDIR)zhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_L_KERNEL) $(ZHEMV_L_PARAM)
+$(KDIR)zhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_L_KERNEL) $(ZHEMV_L_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $@
-$(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_V_KERNEL) $(ZHEMV_U_PARAM) ../symcopy.h
+$(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_V_KERNEL) $(ZHEMV_U_PARAM) ../symcopy.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@
-$(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h
+$(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
-$(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL)
+$(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $@
-$(KDIR)xhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_L_KERNEL)
+$(KDIR)xhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_L_KERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $@
-$(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_V_KERNEL) ../symcopy.h
+$(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_V_KERNEL) ../symcopy.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@
-$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h
+$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
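
[Note on the Makefile.L2 rules above: the hunk is whitespace-only, but it shows the build pattern used throughout this file: one kernel source is compiled many times, and the -UDOUBLE/-DDOUBLE, -UTRANS/-DTRANS, -DCONJ and -DXCONJ switches on each $(CC) line select which variant the object implements. The stripped-down sketch below mirrors that one-source/many-objects pattern; the file and function names are illustrative, not the real kernel code.]

  /* gemv_sketch.c -- illustrative only.  Compiled once per variant, e.g.
   *   cc -c -UDOUBLE -UTRANS gemv_sketch.c -o sgemv_n.o
   *   cc -c -DDOUBLE -DTRANS gemv_sketch.c -o dgemv_t.o
   * mirroring the -D/-U switches on the $(CC) lines above. */
  #include <stddef.h>

  #ifdef DOUBLE
  typedef double FLOAT;
  #else
  typedef float FLOAT;
  #endif

  /* y := alpha*op(A)*x + y, column-major, unit strides; a toy stand-in. */
  void gemv_k(size_t m, size_t n, FLOAT alpha,
              const FLOAT *a, size_t lda, const FLOAT *x, FLOAT *y)
  {
  #ifndef TRANS
      for (size_t j = 0; j < n; j++)          /* op(A) = A   */
          for (size_t i = 0; i < m; i++)
              y[i] += alpha * a[i + j * lda] * x[j];
  #else
      for (size_t j = 0; j < n; j++)          /* op(A) = A^T */
          for (size_t i = 0; i < m; i++)
              y[j] += alpha * a[i + j * lda] * x[i];
  #endif
  }
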
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index b9b4bef..268177c 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -1,3 +1,5 @@
+USE_GEMM3M = 0
+
ifeq ($(ARCH), x86)
USE_GEMM3M = 1
endif
@@ -22,7 +24,7 @@ ifeq ($(ARCH), arm64)
USE_TRMM = 1
endif
-ifeq ($(TARGET), LOONGSON3B)
+ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1
endif
@@ -122,7 +124,7 @@ XBLASOBJS += \
xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \
-ifdef USE_GEMM3M
+ifeq ($(USE_GEMM3M), 1)
CBLASOBJS += cgemm3m_kernel$(TSUFFIX).$(SUFFIX)
ZBLASOBJS += zgemm3m_kernel$(TSUFFIX).$(SUFFIX)
@@ -256,7 +258,7 @@ XBLASOBJS += \
xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \
xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX)
-ifdef USE_GEMM3M
+ifeq ($(USE_GEMM3M), 1)
CBLASOBJS += \
cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
@@ -320,6 +322,28 @@ XBLASOBJS += \
endif
+###### BLAS extensions #####
+SBLASOBJS += \
+ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
+ somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX)
+
+DBLASOBJS += \
+ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
+ domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX)
+
+CBLASOBJS += \
+ comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
+ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
+ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
+ comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX)
+
+ZBLASOBJS += \
+ zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
+ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
+ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
+ zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX)
+
+
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
@@ -513,7 +537,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
-ifdef USE_TRMM
+ifdef USE_TRMM
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -3237,3 +3261,178 @@ $(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL
$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+
+##### BLAS extensions ######
+
+ifndef DOMATCOPY_CN
+DOMATCOPY_CN = ../arm/omatcopy_cn.c
+endif
+
+$(KDIR)domatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CN)
+ $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
+
+ifndef DOMATCOPY_RN
+DOMATCOPY_RN = ../arm/omatcopy_rn.c
+endif
+
+$(KDIR)domatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RN)
+ $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
+
+ifndef DOMATCOPY_CT
+DOMATCOPY_CT = ../arm/omatcopy_ct.c
+endif
+
+$(KDIR)domatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CT)
+ $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
+
+ifndef DOMATCOPY_RT
+DOMATCOPY_RT = ../arm/omatcopy_rt.c
+endif
+
+$(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT)
+ $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
+
+ifndef SOMATCOPY_CN
+SOMATCOPY_CN = ../arm/omatcopy_cn.c
+endif
+
+$(KDIR)somatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CN)
+ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
+
+ifndef SOMATCOPY_RN
+SOMATCOPY_RN = ../arm/omatcopy_rn.c
+endif
+
+$(KDIR)somatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RN)
+ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
+
+ifndef SOMATCOPY_CT
+SOMATCOPY_CT = ../arm/omatcopy_ct.c
+endif
+
+$(KDIR)somatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CT)
+ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
+
+ifndef SOMATCOPY_RT
+SOMATCOPY_RT = ../arm/omatcopy_rt.c
+endif
+
+$(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT)
+ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
+
+
+ifndef COMATCOPY_CN
+COMATCOPY_CN = ../arm/zomatcopy_cn.c
+endif
+
+$(KDIR)comatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CN)
+ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
+
+ifndef COMATCOPY_RN
+COMATCOPY_RN = ../arm/zomatcopy_rn.c
+endif
+
+$(KDIR)comatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RN)
+ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
+
+ifndef COMATCOPY_CT
+COMATCOPY_CT = ../arm/zomatcopy_ct.c
+endif
+
+$(KDIR)comatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CT)
+ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
+
+ifndef COMATCOPY_RT
+COMATCOPY_RT = ../arm/zomatcopy_rt.c
+endif
+
+$(KDIR)comatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RT)
+ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
+
+ifndef COMATCOPY_CNC
+COMATCOPY_CNC = ../arm/zomatcopy_cnc.c
+endif
+
+$(KDIR)comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CNC)
+ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
+
+ifndef COMATCOPY_RNC
+COMATCOPY_RNC = ../arm/zomatcopy_rnc.c
+endif
+
+$(KDIR)comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RNC)
+ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
+
+ifndef COMATCOPY_CTC
+COMATCOPY_CTC = ../arm/zomatcopy_ctc.c
+endif
+
+$(KDIR)comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CTC)
+ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
+
+ifndef COMATCOPY_RTC
+COMATCOPY_RTC = ../arm/zomatcopy_rtc.c
+endif
+
+$(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC)
+ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
+
+
+ifndef ZOMATCOPY_CN
+ZOMATCOPY_CN = ../arm/zomatcopy_cn.c
+endif
+
+$(KDIR)zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CN)
+ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
+
+ifndef ZOMATCOPY_RN
+ZOMATCOPY_RN = ../arm/zomatcopy_rn.c
+endif
+
+$(KDIR)zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RN)
+ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
+
+ifndef ZOMATCOPY_CT
+ZOMATCOPY_CT = ../arm/zomatcopy_ct.c
+endif
+
+$(KDIR)zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CT)
+ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
+
+ifndef ZOMATCOPY_RT
+ZOMATCOPY_RT = ../arm/zomatcopy_rt.c
+endif
+
+$(KDIR)zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RT)
+ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
+
+ifndef ZOMATCOPY_CNC
+ZOMATCOPY_CNC = ../arm/zomatcopy_cnc.c
+endif
+
+$(KDIR)zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CNC)
+ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
+
+ifndef ZOMATCOPY_RNC
+ZOMATCOPY_RNC = ../arm/zomatcopy_rnc.c
+endif
+
+$(KDIR)zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RNC)
+ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
+
+ifndef ZOMATCOPY_CTC
+ZOMATCOPY_CTC = ../arm/zomatcopy_ctc.c
+endif
+
+$(KDIR)zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CTC)
+ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
+
+ifndef ZOMATCOPY_RTC
+ZOMATCOPY_RTC = ../arm/zomatcopy_rtc.c
+endif
+
+$(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC)
+ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
+
+
diff --git a/kernel/alpha/KERNEL b/kernel/alpha/KERNEL
index a39ccd5..01734bf 100644
--- a/kernel/alpha/KERNEL
+++ b/kernel/alpha/KERNEL
@@ -74,8 +74,8 @@ SGEMMKERNEL = gemm_kernel_4x4.S
SGEMM_BETA = gemm_beta.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX)
-SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4.S
DGEMM_BETA = gemm_beta.S
diff --git a/kernel/alpha/cnrm2.S b/kernel/alpha/cnrm2.S
index 03343b2..bd1ab87 100644
--- a/kernel/alpha/cnrm2.S
+++ b/kernel/alpha/cnrm2.S
@@ -75,7 +75,7 @@
.mask 0x4000000,-16
ldah $29, 0($27) !gpdisp!1
lda $29, 0($29) !gpdisp!1
-
+
lda $sp, -16($sp)
ldq $27, sqrt($29) !literal!2
stq $26, 0($sp)
@@ -85,7 +85,7 @@
#else
PROFCODE
#endif
-
+
fclr a0
sll INCX, ZBASE_SHIFT, INCX
fclr a1
diff --git a/kernel/alpha/dnrm2.S b/kernel/alpha/dnrm2.S
index b8ccc75..0dfb649 100644
--- a/kernel/alpha/dnrm2.S
+++ b/kernel/alpha/dnrm2.S
@@ -75,7 +75,7 @@
.mask 0x4000000,-16
ldah $29, 0($27) !gpdisp!1
lda $29, 0($29) !gpdisp!1
-
+
lda $sp, -16($sp)
ldq $27, sqrt($29) !literal!2
stq $26, 0($sp)
@@ -85,7 +85,7 @@
#else
PROFCODE
#endif
-
+
fclr a0
SXADDQ INCX, 0, INCX
fclr a1
diff --git a/kernel/alpha/gemm_kernel_4x4.S b/kernel/alpha/gemm_kernel_4x4.S
index 4e92534..c55d817 100644
--- a/kernel/alpha/gemm_kernel_4x4.S
+++ b/kernel/alpha/gemm_kernel_4x4.S
@@ -167,7 +167,7 @@
sra N, 2, J
ble J, $L40
.align 4
-
+
$L01:
mov C, C1
addq C, LDC, C2
@@ -291,7 +291,7 @@ $L11:
fclr c09
lda AO, 4 * SIZE(AO)
fclr c10
-#endif
+#endif
lds $f31, 7 * SIZE(C4)
fclr c14
@@ -1456,7 +1456,7 @@ $L40:
fclr t1
addq C2, LDC, C
fclr t2
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
diff --git a/kernel/alpha/gemv_n.S b/kernel/alpha/gemv_n.S
index 665b217..3e9d1d7 100644
--- a/kernel/alpha/gemv_n.S
+++ b/kernel/alpha/gemv_n.S
@@ -621,7 +621,7 @@ $L16:
LD a1, 1 * SIZE(A1)
LD a2, 0 * SIZE(A2)
LD a3, 1 * SIZE(A2)
-
+
LD y0, 0 * SIZE(Y1)
LD y1, 1 * SIZE(Y1)
@@ -854,7 +854,7 @@ $L22:
lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
lda A1, 8 * SIZE(A1)
- lda Y1, 8 * SIZE(Y1)
+ lda Y1, 8 * SIZE(Y1)
bgt I, $L22
.align 4
@@ -954,7 +954,7 @@ $L26:
LD a1, 1 * SIZE(A1)
LD a2, 0 * SIZE(A2)
LD a3, 1 * SIZE(A2)
-
+
LD y0, 0 * SIZE(Y1)
LD y1, 1 * SIZE(Y1)
@@ -1173,7 +1173,7 @@ $L36:
LD a0, 0 * SIZE(A1)
LD a1, 1 * SIZE(A1)
-
+
LD y0, 0 * SIZE(Y1)
MUL alpha1, a0, a0
LD y1, 1 * SIZE(Y1)
diff --git a/kernel/alpha/iamax.S b/kernel/alpha/iamax.S
index cb87632..2be5d5d 100644
--- a/kernel/alpha/iamax.S
+++ b/kernel/alpha/iamax.S
@@ -313,7 +313,7 @@ $L22:
LD $f10, 0 * SIZE(XX)
fabs $f14, $f22
addq XX, INCX, XX
- cmpteq $f0, $f18, $f2
+ cmpteq $f0, $f18, $f2
LD $f11, 0 * SIZE(XX)
fabs $f15, $f23
@@ -376,7 +376,7 @@ $L22:
$L23:
fabs $f14, $f22
- cmpteq $f0, $f18, $f2
+ cmpteq $f0, $f18, $f2
fabs $f15, $f23
cmpteq $f0, $f19, $f3
diff --git a/kernel/alpha/imax.S b/kernel/alpha/imax.S
index b0cf5c8..d8958c8 100644
--- a/kernel/alpha/imax.S
+++ b/kernel/alpha/imax.S
@@ -44,7 +44,7 @@
#define X $17
#define INCX $18
#define XX $19
-
+
#ifndef USE_MIN
#define CMPLT(a, b) cmptlt a, b
#else
diff --git a/kernel/alpha/izamax.S b/kernel/alpha/izamax.S
index 2269b12..c932581 100644
--- a/kernel/alpha/izamax.S
+++ b/kernel/alpha/izamax.S
@@ -235,7 +235,7 @@ $L13:
fcmovne $f6, $f18, $f2
fcmovne $f7, $f19, $f3
.align 4
-
+
$L14:
addt $f8, $f9, $f16
addt $f10, $f11, $f17
diff --git a/kernel/alpha/snrm2.S b/kernel/alpha/snrm2.S
index b8ccc75..0dfb649 100644
--- a/kernel/alpha/snrm2.S
+++ b/kernel/alpha/snrm2.S
@@ -75,7 +75,7 @@
.mask 0x4000000,-16
ldah $29, 0($27) !gpdisp!1
lda $29, 0($29) !gpdisp!1
-
+
lda $sp, -16($sp)
ldq $27, sqrt($29) !literal!2
stq $26, 0($sp)
@@ -85,7 +85,7 @@
#else
PROFCODE
#endif
-
+
fclr a0
SXADDQ INCX, 0, INCX
fclr a1
diff --git a/kernel/alpha/trsm_kernel_4x4_LN.S b/kernel/alpha/trsm_kernel_4x4_LN.S
index a1760c6..600b4e2 100644
--- a/kernel/alpha/trsm_kernel_4x4_LN.S
+++ b/kernel/alpha/trsm_kernel_4x4_LN.S
@@ -178,7 +178,7 @@
sra N, 2, J
ble J, $L40
.align 4
-
+
$L01:
#ifdef RT
sll K, 2 + BASE_SHIFT, TMP1
@@ -382,7 +382,7 @@ $L38:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
@@ -392,7 +392,7 @@ $L38:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
@@ -413,7 +413,7 @@ $L38:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c05, t1, c05
@@ -425,7 +425,7 @@ $L38:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b2, c05, t1
SUB c09, t1, c09
@@ -435,7 +435,7 @@ $L38:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a2, c09, t1
SUB c13, t1, c13
@@ -447,7 +447,7 @@ $L38:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a2, c13, t1
SUB c09, t1, c09
@@ -459,7 +459,7 @@ $L38:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b2, c09, t1
SUB c05, t1, c05
@@ -775,7 +775,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -796,7 +796,7 @@ $L28:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -843,7 +843,7 @@ $L28:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
@@ -870,7 +870,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
@@ -895,7 +895,7 @@ $L28:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b1, c06, c06
@@ -914,7 +914,7 @@ $L28:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a1, c10, c10
@@ -933,7 +933,7 @@ $L28:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a1, c14, c14
@@ -958,7 +958,7 @@ $L28:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b1, c10, c10
@@ -1163,7 +1163,7 @@ $L11:
fclr c14
fclr c07
ble TMP1, $L18
-#endif
+#endif
ble L, $L15
.align 5
@@ -1490,7 +1490,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -1530,7 +1530,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -1572,7 +1572,7 @@ $L18:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a1, c08, c08
MUL a1, c12, c12
@@ -1611,7 +1611,7 @@ $L18:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b1, c07, c07
MUL b1, c11, c11
@@ -1667,7 +1667,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
@@ -1706,7 +1706,7 @@ $L18:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b1, c06, c06
MUL b1, c10, c10
@@ -1735,7 +1735,7 @@ $L18:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a1, c07, c07
MUL a1, c11, c11
@@ -1762,7 +1762,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
@@ -1801,7 +1801,7 @@ $L18:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b1, c06, c06
MUL b1, c07, c07
@@ -1830,7 +1830,7 @@ $L18:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a1, c10, c10
MUL a1, c11, c11
@@ -1857,7 +1857,7 @@ $L18:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a1, c14, c14
MUL a1, c15, c15
@@ -1896,7 +1896,7 @@ $L18:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b1, c10, c10
MUL b1, c11, c11
@@ -2093,7 +2093,7 @@ $L40:
addq C2, LDC, C
#endif
fclr t2
-
+
#ifdef LN
addq M, OFFSET, KK
#endif
@@ -2257,13 +2257,13 @@ $L78:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
#endif
@@ -2279,7 +2279,7 @@ $L78:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c05, t1, c05
@@ -2520,7 +2520,7 @@ $L68:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c02, c02
@@ -2530,7 +2530,7 @@ $L68:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c05, c05
@@ -2559,7 +2559,7 @@ $L68:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
@@ -2577,7 +2577,7 @@ $L68:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
@@ -2914,7 +2914,7 @@ $L58:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -2934,7 +2934,7 @@ $L58:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -2956,7 +2956,7 @@ $L58:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a1, c08, c08
@@ -2981,7 +2981,7 @@ $L58:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b1, c07, c07
@@ -3019,7 +3019,7 @@ $L58:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
@@ -3044,7 +3044,7 @@ $L58:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b1, c06, c06
@@ -3063,7 +3063,7 @@ $L58:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a1, c07, c07
@@ -3081,7 +3081,7 @@ $L58:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
@@ -3382,11 +3382,11 @@ $L118:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
-
+
SUB a1, c01, c01
#else
LD a1, 0 * SIZE(AO)
-
+
SUB a1, c01, c01
#endif
@@ -3398,7 +3398,7 @@ $L118:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
#endif
@@ -3593,13 +3593,13 @@ $L108:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#endif
@@ -3619,7 +3619,7 @@ $L108:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c02, t1, c02
@@ -3628,7 +3628,7 @@ $L108:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
#endif
@@ -3886,7 +3886,7 @@ $L98:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -3896,7 +3896,7 @@ $L98:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -3908,7 +3908,7 @@ $L98:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a2, c04, t1
SUB c03, t1, c03
@@ -3920,7 +3920,7 @@ $L98:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b2, c03, t1
SUB c02, t1, c02
@@ -3942,7 +3942,7 @@ $L98:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c02, t1, c02
@@ -3954,7 +3954,7 @@ $L98:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b2, c02, t1
SUB c03, t1, c03
@@ -3964,7 +3964,7 @@ $L98:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a2, c03, t1
SUB c04, t1, c04
@@ -3973,7 +3973,7 @@ $L98:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
diff --git a/kernel/alpha/trsm_kernel_4x4_LT.S b/kernel/alpha/trsm_kernel_4x4_LT.S
index 2848d26..81436d0 100644
--- a/kernel/alpha/trsm_kernel_4x4_LT.S
+++ b/kernel/alpha/trsm_kernel_4x4_LT.S
@@ -178,7 +178,7 @@
sra N, 2, J
ble J, $L40
.align 4
-
+
$L01:
#ifdef RT
sll K, 2 + BASE_SHIFT, TMP1
@@ -313,7 +313,7 @@ $L11:
fclr c14
fclr c07
ble TMP1, $L18
-#endif
+#endif
ble L, $L15
.align 5
@@ -640,7 +640,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -680,7 +680,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -722,7 +722,7 @@ $L18:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a1, c08, c08
MUL a1, c12, c12
@@ -761,7 +761,7 @@ $L18:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b1, c07, c07
MUL b1, c11, c11
@@ -817,7 +817,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
@@ -856,7 +856,7 @@ $L18:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b1, c06, c06
MUL b1, c10, c10
@@ -885,7 +885,7 @@ $L18:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a1, c07, c07
MUL a1, c11, c11
@@ -912,7 +912,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
@@ -951,7 +951,7 @@ $L18:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b1, c06, c06
MUL b1, c07, c07
@@ -980,7 +980,7 @@ $L18:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a1, c10, c10
MUL a1, c11, c11
@@ -1007,7 +1007,7 @@ $L18:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a1, c14, c14
MUL a1, c15, c15
@@ -1046,7 +1046,7 @@ $L18:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b1, c10, c10
MUL b1, c11, c11
@@ -1456,7 +1456,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -1477,7 +1477,7 @@ $L28:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -1524,7 +1524,7 @@ $L28:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
@@ -1551,7 +1551,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
@@ -1576,7 +1576,7 @@ $L28:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b1, c06, c06
@@ -1595,7 +1595,7 @@ $L28:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a1, c10, c10
@@ -1614,7 +1614,7 @@ $L28:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a1, c14, c14
@@ -1639,7 +1639,7 @@ $L28:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b1, c10, c10
@@ -1912,7 +1912,7 @@ $L38:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
@@ -1922,7 +1922,7 @@ $L38:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
@@ -1943,7 +1943,7 @@ $L38:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c05, t1, c05
@@ -1955,7 +1955,7 @@ $L38:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b2, c05, t1
SUB c09, t1, c09
@@ -1965,7 +1965,7 @@ $L38:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a2, c09, t1
SUB c13, t1, c13
@@ -1977,7 +1977,7 @@ $L38:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a2, c13, t1
SUB c09, t1, c09
@@ -1989,7 +1989,7 @@ $L38:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b2, c09, t1
SUB c05, t1, c05
@@ -2092,7 +2092,7 @@ $L40:
addq C2, LDC, C
#endif
fclr t2
-
+
#ifdef LN
addq M, OFFSET, KK
#endif
@@ -2359,7 +2359,7 @@ $L58:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -2379,7 +2379,7 @@ $L58:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -2401,7 +2401,7 @@ $L58:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a1, c08, c08
@@ -2426,7 +2426,7 @@ $L58:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b1, c07, c07
@@ -2464,7 +2464,7 @@ $L58:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
@@ -2489,7 +2489,7 @@ $L58:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b1, c06, c06
@@ -2508,7 +2508,7 @@ $L58:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a1, c07, c07
@@ -2526,7 +2526,7 @@ $L58:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
@@ -2827,7 +2827,7 @@ $L68:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c02, c02
@@ -2837,7 +2837,7 @@ $L68:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c05, c05
@@ -2866,7 +2866,7 @@ $L68:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
@@ -2884,7 +2884,7 @@ $L68:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
@@ -3117,13 +3117,13 @@ $L78:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
#endif
@@ -3139,7 +3139,7 @@ $L78:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c05, t1, c05
@@ -3455,7 +3455,7 @@ $L98:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -3465,7 +3465,7 @@ $L98:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -3477,7 +3477,7 @@ $L98:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a2, c04, t1
SUB c03, t1, c03
@@ -3489,7 +3489,7 @@ $L98:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b2, c03, t1
SUB c02, t1, c02
@@ -3511,7 +3511,7 @@ $L98:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c02, t1, c02
@@ -3523,7 +3523,7 @@ $L98:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b2, c02, t1
SUB c03, t1, c03
@@ -3533,7 +3533,7 @@ $L98:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a2, c03, t1
SUB c04, t1, c04
@@ -3542,7 +3542,7 @@ $L98:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
@@ -3759,13 +3759,13 @@ $L108:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#endif
@@ -3785,7 +3785,7 @@ $L108:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c02, t1, c02
@@ -3794,7 +3794,7 @@ $L108:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
#endif
@@ -3977,11 +3977,11 @@ $L118:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
-
+
SUB a1, c01, c01
#else
LD a1, 0 * SIZE(AO)
-
+
SUB a1, c01, c01
#endif
@@ -3993,7 +3993,7 @@ $L118:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
#endif
diff --git a/kernel/alpha/trsm_kernel_4x4_RT.S b/kernel/alpha/trsm_kernel_4x4_RT.S
index 6d3d2e3..71d6c43 100644
--- a/kernel/alpha/trsm_kernel_4x4_RT.S
+++ b/kernel/alpha/trsm_kernel_4x4_RT.S
@@ -410,7 +410,7 @@ $L98:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -420,7 +420,7 @@ $L98:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -432,7 +432,7 @@ $L98:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a2, c04, t1
SUB c03, t1, c03
@@ -444,7 +444,7 @@ $L98:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b2, c03, t1
SUB c02, t1, c02
@@ -466,7 +466,7 @@ $L98:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c02, t1, c02
@@ -478,7 +478,7 @@ $L98:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b2, c02, t1
SUB c03, t1, c03
@@ -488,7 +488,7 @@ $L98:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a2, c03, t1
SUB c04, t1, c04
@@ -497,7 +497,7 @@ $L98:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
@@ -714,13 +714,13 @@ $L108:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#endif
@@ -740,7 +740,7 @@ $L108:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c02, t1, c02
@@ -749,7 +749,7 @@ $L108:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
#endif
@@ -932,11 +932,11 @@ $L118:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
-
+
SUB a1, c01, c01
#else
LD a1, 0 * SIZE(AO)
-
+
SUB a1, c01, c01
#endif
@@ -948,7 +948,7 @@ $L118:
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
-
+
MUL a1, c01, c01
#endif
@@ -1025,7 +1025,7 @@ $L40:
addq C2, LDC, C
#endif
fclr t2
-
+
#ifdef LN
addq M, OFFSET, KK
#endif
@@ -1292,7 +1292,7 @@ $L58:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -1312,7 +1312,7 @@ $L58:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -1334,7 +1334,7 @@ $L58:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a1, c08, c08
@@ -1359,7 +1359,7 @@ $L58:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b1, c07, c07
@@ -1397,7 +1397,7 @@ $L58:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
@@ -1422,7 +1422,7 @@ $L58:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b1, c06, c06
@@ -1441,7 +1441,7 @@ $L58:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a1, c07, c07
@@ -1459,7 +1459,7 @@ $L58:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
@@ -1760,7 +1760,7 @@ $L68:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c02, c02
@@ -1770,7 +1770,7 @@ $L68:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c05, c05
@@ -1799,7 +1799,7 @@ $L68:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
@@ -1817,7 +1817,7 @@ $L68:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
@@ -2050,13 +2050,13 @@ $L78:
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
#endif
@@ -2072,7 +2072,7 @@ $L78:
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c05, t1, c05
@@ -2156,7 +2156,7 @@ $L80:
sra N, 2, J
ble J, $L999
.align 4
-
+
$L01:
#ifdef RT
sll K, 2 + BASE_SHIFT, TMP1
@@ -2291,7 +2291,7 @@ $L11:
fclr c14
fclr c07
ble TMP1, $L18
-#endif
+#endif
ble L, $L15
.align 5
@@ -2618,7 +2618,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -2658,7 +2658,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -2700,7 +2700,7 @@ $L18:
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
-
+
MUL a1, c04, c04
MUL a1, c08, c08
MUL a1, c12, c12
@@ -2739,7 +2739,7 @@ $L18:
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
-
+
MUL b1, c03, c03
MUL b1, c07, c07
MUL b1, c11, c11
@@ -2795,7 +2795,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
@@ -2834,7 +2834,7 @@ $L18:
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
-
+
MUL b1, c02, c02
MUL b1, c06, c06
MUL b1, c10, c10
@@ -2863,7 +2863,7 @@ $L18:
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
-
+
MUL a1, c03, c03
MUL a1, c07, c07
MUL a1, c11, c11
@@ -2890,7 +2890,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
@@ -2929,7 +2929,7 @@ $L18:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b1, c06, c06
MUL b1, c07, c07
@@ -2958,7 +2958,7 @@ $L18:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a1, c10, c10
MUL a1, c11, c11
@@ -2985,7 +2985,7 @@ $L18:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a1, c14, c14
MUL a1, c15, c15
@@ -3024,7 +3024,7 @@ $L18:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b1, c10, c10
MUL b1, c11, c11
@@ -3434,7 +3434,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -3455,7 +3455,7 @@ $L28:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -3502,7 +3502,7 @@ $L28:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
-
+
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
@@ -3529,7 +3529,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a1, c02, c02
@@ -3554,7 +3554,7 @@ $L28:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b1, c06, c06
@@ -3573,7 +3573,7 @@ $L28:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a1, c10, c10
@@ -3592,7 +3592,7 @@ $L28:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a1, c14, c14
@@ -3617,7 +3617,7 @@ $L28:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b1, c10, c10
@@ -3890,7 +3890,7 @@ $L38:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
@@ -3900,7 +3900,7 @@ $L38:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
@@ -3921,7 +3921,7 @@ $L38:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
MUL a1, c01, c01
MUL a2, c01, t1
SUB c05, t1, c05
@@ -3933,7 +3933,7 @@ $L38:
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
-
+
MUL b1, c05, c05
MUL b2, c05, t1
SUB c09, t1, c09
@@ -3943,7 +3943,7 @@ $L38:
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
-
+
MUL a1, c09, c09
MUL a2, c09, t1
SUB c13, t1, c13
@@ -3955,7 +3955,7 @@ $L38:
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
-
+
MUL a1, c13, c13
MUL a2, c13, t1
SUB c09, t1, c09
@@ -3967,7 +3967,7 @@ $L38:
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
-
+
MUL b1, c09, c09
MUL b2, c09, t1
SUB c05, t1, c05
diff --git a/kernel/alpha/zamax.S b/kernel/alpha/zamax.S
index 01fb4e1..f1ea18d 100644
--- a/kernel/alpha/zamax.S
+++ b/kernel/alpha/zamax.S
@@ -234,7 +234,7 @@ $L13:
fcmovne $f6, $f18, $f2
fcmovne $f7, $f19, $f3
.align 4
-
+
$L14:
addt $f8, $f9, $f16
addt $f10, $f11, $f17
diff --git a/kernel/alpha/zaxpy.S b/kernel/alpha/zaxpy.S
index a6f3c1d..1416769 100644
--- a/kernel/alpha/zaxpy.S
+++ b/kernel/alpha/zaxpy.S
@@ -126,7 +126,7 @@ $MainLoop:
LD $f0, 0*SIZE($18)
MUL $f29, $f1, $f23
LD $f1, 1*SIZE($18)
-
+
MUL $f29, $f2, $f24
unop
MUL $f30, $f3, $f25
@@ -151,7 +151,7 @@ $MainLoop:
addq $20, 8*SIZE, $20
MUL $f29, $f5, $f23
LD $f5, 5*SIZE($18)
-
+
ADD $f16, $f8, $f16
LD $f8, 0*SIZE($20)
MUL $f29, $f6, $f24
@@ -181,7 +181,7 @@ $MainLoop:
ADD1 $f24, $f25, $f18
ST $f19,-5*SIZE($20)
ADD2 $f26, $f27, $f19
-
+
ADD $f16, $f12, $f16
LD $f12, 4*SIZE($20)
ADD $f17, $f13, $f17
@@ -207,7 +207,7 @@ $MainLoopEnd:
MUL $f30, $f1, $f21
MUL $f30, $f0, $f22
MUL $f29, $f1, $f23
-
+
MUL $f29, $f2, $f24
MUL $f30, $f3, $f25
MUL $f30, $f2, $f26
@@ -222,7 +222,7 @@ $MainLoopEnd:
MUL $f30, $f4, $f22
ADD2 $f26, $f27, $f19
MUL $f29, $f5, $f23
-
+
ADD $f16, $f8, $f16
MUL $f29, $f6, $f24
ADD $f17, $f28, $f17
@@ -242,7 +242,7 @@ $MainLoopEnd:
ADD1 $f24, $f25, $f18
ST $f19, 3*SIZE($20)
ADD2 $f26, $f27, $f19
-
+
ADD $f16, $f12, $f16
ADD $f17, $f13, $f17
ADD $f18, $f14, $f18
@@ -281,7 +281,7 @@ $RemainLoop:
LD $f0, 0*SIZE($18)
MUL $f29, $f1, $f23
LD $f1, 1*SIZE($18)
-
+
ADD1 $f20, $f21, $f16
ADD2 $f22, $f23, $f17
ADD $f16, $f8, $f16
@@ -300,7 +300,7 @@ $RemainLoopEnd:
MUL $f30, $f1, $f21
MUL $f30, $f0, $f22
MUL $f29, $f1, $f23
-
+
ADD1 $f20, $f21, $f16
ADD2 $f22, $f23, $f17
ADD $f16, $f8, $f16
@@ -326,11 +326,11 @@ $End:
$Sub:
SXSUBL $16, SIZE, $22
- addq $22, $22, $22 # Complex
+ addq $22, $22, $22 # Complex
.align 4
- addq $19, $19, $19 # Complex
- addq $21, $21, $21 # Complex
+ addq $19, $19, $19 # Complex
+ addq $21, $21, $21 # Complex
ble $4, $SubRemain
LD $f0, 0*SIZE($18)
@@ -409,7 +409,7 @@ $SubMainLoop:
unop
MUL $f29, $f5, $f23
LD $f5, 1*SIZE($18)
-
+
ADD $f16, $f8, $f16
LD $f8, 0*SIZE($24)
MUL $f29, $f6, $f24
@@ -486,7 +486,7 @@ $SubMainLoopEnd:
MUL $f30, $f1, $f21
MUL $f30, $f0, $f22
MUL $f29, $f1, $f23
-
+
MUL $f29, $f2, $f24
MUL $f30, $f3, $f25
MUL $f30, $f2, $f26
@@ -501,7 +501,7 @@ $SubMainLoopEnd:
MUL $f30, $f4, $f22
ADD2 $f26, $f27, $f19
MUL $f29, $f5, $f23
-
+
ADD $f16, $f8, $f16
MUL $f29, $f6, $f24
ADD $f17, $f28, $f17
@@ -586,7 +586,7 @@ $SubRemainLoopEnd:
MUL $f30, $f1, $f21
MUL $f30, $f0, $f22
MUL $f29, $f1, $f23
-
+
ADD1 $f20, $f21, $f16
ADD2 $f22, $f23, $f17
ADD $f16, $f8, $f16
diff --git a/kernel/alpha/zgemm_kernel_2x2.S b/kernel/alpha/zgemm_kernel_2x2.S
index 33c50dd..67ba6d1 100644
--- a/kernel/alpha/zgemm_kernel_2x2.S
+++ b/kernel/alpha/zgemm_kernel_2x2.S
@@ -211,7 +211,7 @@ CNAME:
sra N, 1, J
ble J, $L30
.align 4
-
+
$L01:
mov C, C1
addq C, LDC, C2
diff --git a/kernel/alpha/znrm2.S b/kernel/alpha/znrm2.S
index 03343b2..bd1ab87 100644
--- a/kernel/alpha/znrm2.S
+++ b/kernel/alpha/znrm2.S
@@ -75,7 +75,7 @@
.mask 0x4000000,-16
ldah $29, 0($27) !gpdisp!1
lda $29, 0($29) !gpdisp!1
-
+
lda $sp, -16($sp)
ldq $27, sqrt($29) !literal!2
stq $26, 0($sp)
@@ -85,7 +85,7 @@
#else
PROFCODE
#endif
-
+
fclr a0
sll INCX, ZBASE_SHIFT, INCX
fclr a1
diff --git a/kernel/alpha/ztrsm_kernel_2x2_LN.S b/kernel/alpha/ztrsm_kernel_2x2_LN.S
index 2921f9e..dcbe4e2 100644
--- a/kernel/alpha/ztrsm_kernel_2x2_LN.S
+++ b/kernel/alpha/ztrsm_kernel_2x2_LN.S
@@ -235,7 +235,7 @@ CNAME:
sra N, 1, J
ble J, $L30
.align 4
-
+
$L01:
#ifdef RT
sll K, ZBASE_SHIFT + 1, TMP1
@@ -524,7 +524,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
@@ -534,7 +534,7 @@ $L28:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
@@ -620,7 +620,7 @@ $L28:
MUL a4, c09, t2
ADD6 c01, t1, c01
ADD5 c02, t2, c02
-
+
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
@@ -1116,7 +1116,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -1136,7 +1136,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -1193,7 +1193,7 @@ $L18:
ADD5 c02, t2, c02
ADD6 c09, t3, c09
ADD5 c10, t4, c10
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -1373,7 +1373,7 @@ $L18:
ADD5 c02, t2, c02
ADD6 c03, t3, c03
ADD5 c04, t4, c04
-
+
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
@@ -1709,7 +1709,7 @@ $L58:
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#endif
@@ -2043,7 +2043,7 @@ $L48:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -2053,7 +2053,7 @@ $L48:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -2083,7 +2083,7 @@ $L48:
ADD6 c01, t1, c01
ADD5 c02, t2, c02
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
diff --git a/kernel/alpha/ztrsm_kernel_2x2_LT.S b/kernel/alpha/ztrsm_kernel_2x2_LT.S
index e6ffc0f..e0c8202 100644
--- a/kernel/alpha/ztrsm_kernel_2x2_LT.S
+++ b/kernel/alpha/ztrsm_kernel_2x2_LT.S
@@ -235,7 +235,7 @@ CNAME:
sra N, 1, J
ble J, $L30
.align 4
-
+
$L01:
#ifdef RT
sll K, ZBASE_SHIFT + 1, TMP1
@@ -697,7 +697,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -717,7 +717,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -774,7 +774,7 @@ $L18:
ADD5 c02, t2, c02
ADD6 c09, t3, c09
ADD5 c10, t4, c10
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -954,7 +954,7 @@ $L18:
ADD5 c02, t2, c02
ADD6 c03, t3, c03
ADD5 c04, t4, c04
-
+
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
@@ -1301,7 +1301,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
@@ -1311,7 +1311,7 @@ $L28:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
@@ -1397,7 +1397,7 @@ $L28:
MUL a4, c09, t2
ADD6 c01, t1, c01
ADD5 c02, t2, c02
-
+
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
@@ -1771,7 +1771,7 @@ $L48:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -1781,7 +1781,7 @@ $L48:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -1811,7 +1811,7 @@ $L48:
ADD6 c01, t1, c01
ADD5 c02, t2, c02
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -2123,7 +2123,7 @@ $L58:
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#endif
diff --git a/kernel/alpha/ztrsm_kernel_2x2_RT.S b/kernel/alpha/ztrsm_kernel_2x2_RT.S
index 4c490fc..e890f59 100644
--- a/kernel/alpha/ztrsm_kernel_2x2_RT.S
+++ b/kernel/alpha/ztrsm_kernel_2x2_RT.S
@@ -521,7 +521,7 @@ $L48:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -531,7 +531,7 @@ $L48:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
@@ -561,7 +561,7 @@ $L48:
ADD6 c01, t1, c01
ADD5 c02, t2, c02
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -873,7 +873,7 @@ $L58:
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
#endif
@@ -968,7 +968,7 @@ $L30:
sra N, 1, J
ble J, $L999
.align 4
-
+
$L01:
#ifdef RT
sll K, ZBASE_SHIFT + 1, TMP1
@@ -1430,7 +1430,7 @@ $L18:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
@@ -1450,7 +1450,7 @@ $L18:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
@@ -1507,7 +1507,7 @@ $L18:
ADD5 c02, t2, c02
ADD6 c09, t3, c09
ADD5 c10, t4, c10
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -1687,7 +1687,7 @@ $L18:
ADD5 c02, t2, c02
ADD6 c03, t3, c03
ADD5 c04, t4, c04
-
+
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
@@ -2034,7 +2034,7 @@ $L28:
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
@@ -2044,7 +2044,7 @@ $L28:
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
-
+
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
@@ -2130,7 +2130,7 @@ $L28:
MUL a4, c09, t2
ADD6 c01, t1, c01
ADD5 c02, t2, c02
-
+
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
diff --git a/kernel/arm/KERNEL.ARMV5 b/kernel/arm/KERNEL.ARMV5
index ecf278c..27157da 100644
--- a/kernel/arm/KERNEL.ARMV5
+++ b/kernel/arm/KERNEL.ARMV5
@@ -85,13 +85,13 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
-SGEMMKERNEL = ../generic/gemmkernel_2x2.c
+SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
-DGEMMKERNEL = ../generic/gemmkernel_2x2.c
+DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6
index 6edcf1c..7132ca7 100644
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@@ -1,5 +1,7 @@
SGEMVNKERNEL = ../arm/gemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
+CGEMVNKERNEL = ../arm/zgemv_n.c
+CGEMVTKERNEL = ../arm/zgemv_t.c
DGEMVNKERNEL = ../arm/gemv_n.c
DGEMVTKERNEL = ../arm/gemv_t.c
@@ -16,7 +18,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
#STRMMKERNEL = ../generic/trmmkernel_2x2.c
-#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
+#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
#SGEMMONCOPY = ../generic/gemm_ncopy_2.c
#SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
@@ -96,12 +98,12 @@ ZSWAPKERNEL = swap_vfp.S
# BAD SGEMVNKERNEL = gemv_n_vfp.S
# BAD DGEMVNKERNEL = gemv_n_vfp.S
-CGEMVNKERNEL = cgemv_n_vfp.S
+# CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
# BAD SGEMVTKERNEL = gemv_t_vfp.S
# BAD DGEMVTKERNEL = gemv_t_vfp.S
-CGEMVTKERNEL = cgemv_t_vfp.S
+# CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x2_vfp.S
@@ -109,7 +111,7 @@ DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
#CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
-SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
+SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ = sgemm_incopy.o
@@ -119,7 +121,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
-DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
+DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ = dgemm_incopy.o
diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7
index 790883e..c435486 100644
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -1,5 +1,7 @@
SGEMVNKERNEL = ../arm/gemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
+CGEMVNKERNEL = ../arm/zgemv_n.c
+CGEMVTKERNEL = ../arm/zgemv_t.c
#################################################################################
@@ -77,37 +79,37 @@ ZSCALKERNEL = zscal.c
# BAD SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
-CGEMVNKERNEL = cgemv_n_vfp.S
+#CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
# BAD SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
-CGEMVTKERNEL = cgemv_t_vfp.S
+#CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
-DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
+DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
-#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
-SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
+SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
SGEMMOTCOPY = sgemm_tcopy_4_vfp.S
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
-DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S
-DGEMMINCOPY =
-DGEMMITCOPY =
+DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S
+DGEMMINCOPY =
+DGEMMITCOPY =
DGEMMONCOPY = dgemm_ncopy_4_vfp.S
DGEMMOTCOPY = dgemm_tcopy_4_vfp.S
-DGEMMINCOPYOBJ =
-DGEMMITCOPYOBJ =
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
diff --git a/kernel/arm/amax.c b/kernel/arm/amax.c
index 55107ca..ec6b111 100644
--- a/kernel/arm/amax.c
+++ b/kernel/arm/amax.c
@@ -60,7 +60,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( ABS(x[ix]) > ABS(maxf) )
+ if( ABS(x[ix]) > ABS(maxf) )
{
maxf = ABS(x[ix]);
}
@@ -69,5 +69,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(maxf);
}
-
+
diff --git a/kernel/arm/amin.c b/kernel/arm/amin.c
index 3f7e97b..fc89604 100644
--- a/kernel/arm/amin.c
+++ b/kernel/arm/amin.c
@@ -60,7 +60,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( ABS(x[ix]) < ABS(minf) )
+ if( ABS(x[ix]) < ABS(minf) )
{
minf = ABS(x[ix]);
}
@@ -69,5 +69,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(minf);
}
-
+
diff --git a/kernel/arm/asum.c b/kernel/arm/asum.c
index 5ac6936..5b6e6eb 100644
--- a/kernel/arm/asum.c
+++ b/kernel/arm/asum.c
@@ -63,5 +63,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(sumf);
}
-
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/axpby.c
similarity index 74%
copy from kernel/arm/axpy.c
copy to kernel/arm/axpby.c
index dceddf7..278747f 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/axpby.c
@@ -25,40 +25,72 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix,iy;
if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
ix = 0;
iy = 0;
- while(i < n)
+ if ( beta == 0.0 )
{
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
+ if ( alpha == 0.0 )
+ {
+ while(i < n)
+ {
+ y[iy] = 0.0 ;
+ iy += inc_y ;
+ i++ ;
+ }
+ }
+ else
+ {
+ while(i < n)
+ {
+ y[iy] = alpha * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+ }
+
+
+ }
}
+ else
+ {
+
+ if ( alpha == 0.0 )
+ {
+ while(i < n)
+ {
+ y[iy] = beta * y[iy] ;
+ iy += inc_y ;
+ i++ ;
+ }
+ }
+ else
+ {
+ while(i < n)
+ {
+ y[iy] = alpha * x[ix] + beta * y[iy] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+ }
+ }
+
+ }
+
return(0);
}
-
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/axpy.c
index dceddf7..fb1094d 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/axpy.c
@@ -60,5 +60,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return(0);
}
-
+
diff --git a/kernel/arm/ccopy_vfp.S b/kernel/arm/ccopy_vfp.S
index aaba782..874fcab 100644
--- a/kernel/arm/ccopy_vfp.S
+++ b/kernel/arm/ccopy_vfp.S
@@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+
cmp N, #0
ble ccopy_kernel_L999
diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S
index b653888..2ccda33 100644
--- a/kernel/arm/cdot_vfp.S
+++ b/kernel/arm/cdot_vfp.S
@@ -187,7 +187,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+
vsub.f32 s0 , s0 , s0
vsub.f32 s1 , s1 , s1
vsub.f32 s2 , s2 , s2
@@ -269,11 +269,11 @@ cdot_kernel_L999:
vldm r3, { s8 - s15} // restore floating point registers
#if !defined(CONJ)
- vsub.f32 s0 , s0, s2
- vadd.f32 s1 , s1, s3
+ vsub.f32 s0 , s0, s2
+ vadd.f32 s1 , s1, s3
#else
- vadd.f32 s0 , s0, s2
- vsub.f32 s1 , s1, s3
+ vadd.f32 s0 , s0, s2
+ vsub.f32 s1 , s1, s3
#endif
sub sp, fp, #24
diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S
index 75fbf09..a059ef5 100644
--- a/kernel/arm/cgemm_kernel_2x2_vfp.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfp.S
@@ -88,7 +88,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs
#define KMAC_I fmacs
@@ -834,7 +834,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble cgemm_kernel_L1_BEGIN
cgemm_kernel_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -903,7 +903,7 @@ cgemm_kernel_L2_M2_22:
b cgemm_kernel_L2_M2_44
-
+
cgemm_kernel_L2_M2_30:
tst L, #3
ble cgemm_kernel_L2_M2_40
@@ -968,7 +968,7 @@ cgemm_kernel_L2_M2_46:
subs L, L, #1
bne cgemm_kernel_L2_M2_46
-
+
cgemm_kernel_L2_M2_100:
SAVE2x2
@@ -1007,10 +1007,10 @@ cgemm_kernel_L2_M1_22:
subs L, L, #1
bgt cgemm_kernel_L2_M1_22
-
+
cgemm_kernel_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble cgemm_kernel_L2_M1_100
@@ -1020,7 +1020,7 @@ cgemm_kernel_L2_M1_42:
subs L, L, #1
bgt cgemm_kernel_L2_M1_42
-
+
cgemm_kernel_L2_M1_100:
SAVE1x2
@@ -1033,7 +1033,7 @@ cgemm_kernel_L2_END:
lsl r4, r4, #4 // k * 2 * 4 * 2
add r3, r3, r4 // B = B + K * 2 * 8
mov BC, r3
-
+
subs J , #1 // j--
bgt cgemm_kernel_L2_BEGIN
@@ -1047,7 +1047,7 @@ cgemm_kernel_L1_BEGIN:
tst J , #1
ble cgemm_kernel_L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1111,7 +1111,7 @@ cgemm_kernel_L1_M2_22:
b cgemm_kernel_L1_M2_44
-
+
cgemm_kernel_L1_M2_30:
tst L, #3
ble cgemm_kernel_L1_M2_40
@@ -1176,7 +1176,7 @@ cgemm_kernel_L1_M2_46:
subs L, L, #1
bne cgemm_kernel_L1_M2_46
-
+
cgemm_kernel_L1_M2_100:
SAVE2x1
@@ -1215,10 +1215,10 @@ cgemm_kernel_L1_M1_22:
subs L, L, #1
bgt cgemm_kernel_L1_M1_22
-
+
cgemm_kernel_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble cgemm_kernel_L1_M1_100
@@ -1228,7 +1228,7 @@ cgemm_kernel_L1_M1_42:
subs L, L, #1
bgt cgemm_kernel_L1_M1_42
-
+
cgemm_kernel_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S
index 3aba68d..8bc200c 100644
--- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define B_PRE 96
#define C_PRE 64
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FADD_R fsubs
#define FADD_I fadds
@@ -891,7 +891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble cgemm_kernel_L1_BEGIN
cgemm_kernel_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -960,7 +960,7 @@ cgemm_kernel_L2_M2_22:
b cgemm_kernel_L2_M2_44
-
+
cgemm_kernel_L2_M2_30:
tst L, #3
ble cgemm_kernel_L2_M2_40
@@ -1025,7 +1025,7 @@ cgemm_kernel_L2_M2_46:
subs L, L, #1
bne cgemm_kernel_L2_M2_46
-
+
cgemm_kernel_L2_M2_100:
SAVE2x2
@@ -1064,10 +1064,10 @@ cgemm_kernel_L2_M1_22:
subs L, L, #1
bgt cgemm_kernel_L2_M1_22
-
+
cgemm_kernel_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble cgemm_kernel_L2_M1_100
@@ -1077,7 +1077,7 @@ cgemm_kernel_L2_M1_42:
subs L, L, #1
bgt cgemm_kernel_L2_M1_42
-
+
cgemm_kernel_L2_M1_100:
SAVE1x2
@@ -1090,7 +1090,7 @@ cgemm_kernel_L2_END:
lsl r4, r4, #4 // k * 2 * 4 * 2
add r3, r3, r4 // B = B + K * 2 * 8
mov BC, r3
-
+
subs J , #1 // j--
bgt cgemm_kernel_L2_BEGIN
@@ -1104,7 +1104,7 @@ cgemm_kernel_L1_BEGIN:
tst J , #1
ble cgemm_kernel_L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1168,7 +1168,7 @@ cgemm_kernel_L1_M2_22:
b cgemm_kernel_L1_M2_44
-
+
cgemm_kernel_L1_M2_30:
tst L, #3
ble cgemm_kernel_L1_M2_40
@@ -1233,7 +1233,7 @@ cgemm_kernel_L1_M2_46:
subs L, L, #1
bne cgemm_kernel_L1_M2_46
-
+
cgemm_kernel_L1_M2_100:
SAVE2x1
@@ -1272,10 +1272,10 @@ cgemm_kernel_L1_M1_22:
subs L, L, #1
bgt cgemm_kernel_L1_M1_22
-
+
cgemm_kernel_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble cgemm_kernel_L1_M1_100
@@ -1285,7 +1285,7 @@ cgemm_kernel_L1_M1_42:
subs L, L, #1
bgt cgemm_kernel_L1_M1_42
-
+
cgemm_kernel_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/cgemm_ncopy_2_vfp.S b/kernel/arm/cgemm_ncopy_2_vfp.S
index 08fbd55..29eeab4 100644
--- a/kernel/arm/cgemm_ncopy_2_vfp.S
+++ b/kernel/arm/cgemm_ncopy_2_vfp.S
@@ -150,7 +150,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
- ldr BO, B
+ ldr BO, B
/*********************************************************************************************/
@@ -181,8 +181,8 @@ cgemm_ncopy_L2_M2_20:
COPY2x2
subs I , I , #1
bne cgemm_ncopy_L2_M2_20
-
-
+
+
cgemm_ncopy_L2_M2_40:
ands I, M , #1
@@ -194,7 +194,7 @@ cgemm_ncopy_L2_M2_60:
subs I , I , #1
bne cgemm_ncopy_L2_M2_60
-
+
cgemm_ncopy_L2_M2_END:
@@ -225,8 +225,8 @@ cgemm_ncopy_L1_M2_20:
subs I , I , #1
bne cgemm_ncopy_L1_M2_20
-
-
+
+
cgemm_ncopy_L1_M2_40:
ands I, M , #1
@@ -238,7 +238,7 @@ cgemm_ncopy_L1_M2_60:
subs I , I , #1
bne cgemm_ncopy_L1_M2_60
-
+
cgemm_ncopy_L1_M2_END:
diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S
index 522c4c7..712e7f0 100644
--- a/kernel/arm/cgemv_n_vfp.S
+++ b/kernel/arm/cgemv_n_vfp.S
@@ -551,7 +551,7 @@ cgemvn_kernel_F1X1:
ldr AO1, A
add r3, AO1, #8
str r3, A
-
+
ldr XO , X
INIT_F1
@@ -651,7 +651,7 @@ cgemvn_kernel_S1X1:
ldr AO1, A
add r3, AO1, #8
str r3, A
-
+
ldr XO , X
INIT_S1
diff --git a/kernel/arm/copy.c b/kernel/arm/copy.c
index f742a4a..7b4f04f 100644
--- a/kernel/arm/copy.c
+++ b/kernel/arm/copy.c
@@ -55,5 +55,5 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
return(0);
}
-
+
diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S
index a68434f..a48c860 100644
--- a/kernel/arm/ctrmm_kernel_2x2_vfp.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S
@@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs
#define KMAC_I fmacs
@@ -848,7 +848,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble _L1_BEGIN
_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -951,7 +951,7 @@ _L2_M2_22:
b _L2_M2_44
-
+
_L2_M2_30:
tst L, #3
ble _L2_M2_40
@@ -1016,7 +1016,7 @@ _L2_M2_46:
subs L, L, #1
bne _L2_M2_46
-
+
_L2_M2_100:
SAVE2x2
@@ -1103,10 +1103,10 @@ _L2_M1_22:
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
@@ -1116,7 +1116,7 @@ _L2_M1_42:
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
@@ -1147,7 +1147,7 @@ _L2_END:
lsl r4, r4, #4 // k * 2 * 4 * 2
add r3, r3, r4 // B = B + K * 2 * 8
mov BC, r3
-
+
#if !defined(LEFT)
ldr r3 , KK
add r3 , r3 , #2 // number of values in BO
@@ -1167,7 +1167,7 @@ _L1_BEGIN:
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1265,7 +1265,7 @@ _L1_M2_22:
b _L1_M2_44
-
+
_L1_M2_30:
tst L, #3
ble _L1_M2_40
@@ -1330,7 +1330,7 @@ _L1_M2_46:
subs L, L, #1
bne _L1_M2_46
-
+
_L1_M2_100:
SAVE2x1
@@ -1418,10 +1418,10 @@ _L1_M1_22:
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
@@ -1431,7 +1431,7 @@ _L1_M1_42:
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
index 28e555c..f06e260 100644
--- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
@@ -84,7 +84,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define B_PRE 96
#define C_PRE 64
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FADD_R fsubs
#define FADD_I fadds
@@ -869,7 +869,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble _L1_BEGIN
_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -972,7 +972,7 @@ _L2_M2_22:
b _L2_M2_44
-
+
_L2_M2_30:
tst L, #3
ble _L2_M2_40
@@ -1037,7 +1037,7 @@ _L2_M2_46:
subs L, L, #1
bne _L2_M2_46
-
+
_L2_M2_100:
SAVE2x2
@@ -1124,10 +1124,10 @@ _L2_M1_22:
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
@@ -1137,7 +1137,7 @@ _L2_M1_42:
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
@@ -1168,7 +1168,7 @@ _L2_END:
lsl r4, r4, #4 // k * 2 * 4 * 2
add r3, r3, r4 // B = B + K * 2 * 8
mov BC, r3
-
+
#if !defined(LEFT)
ldr r3 , KK
add r3 , r3 , #2 // number of values in BO
@@ -1188,7 +1188,7 @@ _L1_BEGIN:
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1286,7 +1286,7 @@ _L1_M2_22:
b _L1_M2_44
-
+
_L1_M2_30:
tst L, #3
ble _L1_M2_40
@@ -1351,7 +1351,7 @@ _L1_M2_46:
subs L, L, #1
bne _L1_M2_46
-
+
_L1_M2_100:
SAVE2x1
@@ -1439,10 +1439,10 @@ _L1_M1_22:
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
@@ -1452,7 +1452,7 @@ _L1_M1_42:
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/dcopy_vfp.S b/kernel/arm/dcopy_vfp.S
index 0fad3c4..da23992 100644
--- a/kernel/arm/dcopy_vfp.S
+++ b/kernel/arm/dcopy_vfp.S
@@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+
cmp N, #0
ble dcopy_kernel_L999
diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S
index ab819ec..71b3c1c 100644
--- a/kernel/arm/ddot_vfp.S
+++ b/kernel/arm/ddot_vfp.S
@@ -151,7 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+
vsub.f64 d0 , d0 , d0
vsub.f64 d1 , d1 , d1
diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S
index 55409a5..9fb881d 100644
--- a/kernel/arm/dgemm_kernel_4x2_vfp.S
+++ b/kernel/arm/dgemm_kernel_4x2_vfp.S
@@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
-
+
pld [ CO1, #C_PRE ]
fmacd d4 , d0 , d8
fldd d6 , [CO1, #16 ]
@@ -208,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
-
+
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9
@@ -262,7 +262,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d0, ALPHA
fldd d4 , [CO1]
-
+
fmacd d4 , d0 , d8
fstd d4 , [CO1]
@@ -319,7 +319,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d5 , [CO1, #8 ]
fldd d6 , [CO1, #16 ]
fldd d7 , [CO1, #24 ]
-
+
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9
fmacd d6 , d0 , d10
@@ -364,7 +364,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
-
+
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9
@@ -402,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d0, ALPHA
fldd d4 , [CO1]
-
+
fmacd d4 , d0 , d8
fstd d4 , [CO1]
@@ -490,10 +490,10 @@ dgemm_kernel_L2_M4_22:
subs L, L, #1
bgt dgemm_kernel_L2_M4_22
-
+
dgemm_kernel_L2_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M4_100
@@ -503,7 +503,7 @@ dgemm_kernel_L2_M4_42:
subs L, L, #1
bgt dgemm_kernel_L2_M4_42
-
+
dgemm_kernel_L2_M4_100:
SAVE4x2
@@ -545,10 +545,10 @@ dgemm_kernel_L2_M2_22:
subs L, L, #1
bgt dgemm_kernel_L2_M2_22
-
+
dgemm_kernel_L2_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M2_100
@@ -558,7 +558,7 @@ dgemm_kernel_L2_M2_42:
subs L, L, #1
bgt dgemm_kernel_L2_M2_42
-
+
dgemm_kernel_L2_M2_100:
SAVE2x2
@@ -592,10 +592,10 @@ dgemm_kernel_L2_M1_22:
subs L, L, #1
bgt dgemm_kernel_L2_M1_22
-
+
dgemm_kernel_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M1_100
@@ -605,7 +605,7 @@ dgemm_kernel_L2_M1_42:
subs L, L, #1
bgt dgemm_kernel_L2_M1_42
-
+
dgemm_kernel_L2_M1_100:
SAVE1x2
@@ -630,7 +630,7 @@ dgemm_kernel_L1_BEGIN:
tst J , #1
ble dgemm_kernel_L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -668,10 +668,10 @@ dgemm_kernel_L1_M4_22:
subs L, L, #1
bgt dgemm_kernel_L1_M4_22
-
+
dgemm_kernel_L1_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M4_100
@@ -681,7 +681,7 @@ dgemm_kernel_L1_M4_42:
subs L, L, #1
bgt dgemm_kernel_L1_M4_42
-
+
dgemm_kernel_L1_M4_100:
SAVE4x1
@@ -723,10 +723,10 @@ dgemm_kernel_L1_M2_22:
subs L, L, #1
bgt dgemm_kernel_L1_M2_22
-
+
dgemm_kernel_L1_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M2_100
@@ -736,7 +736,7 @@ dgemm_kernel_L1_M2_42:
subs L, L, #1
bgt dgemm_kernel_L1_M2_42
-
+
dgemm_kernel_L1_M2_100:
SAVE2x1
@@ -770,10 +770,10 @@ dgemm_kernel_L1_M1_22:
subs L, L, #1
bgt dgemm_kernel_L1_M1_22
-
+
dgemm_kernel_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M1_100
@@ -783,7 +783,7 @@ dgemm_kernel_L1_M1_42:
subs L, L, #1
bgt dgemm_kernel_L1_M1_42
-
+
dgemm_kernel_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S
index 3b6af19..7c1dbae 100644
--- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S
@@ -321,7 +321,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad CO1, { d8 - d11 }
pld [ r4 , #C_PRE ]
-
+
fmacd d8 , d0 , d16
fldd d12, [CO2]
fmacd d9 , d0 , d17
@@ -341,7 +341,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstd d11, [CO1, #24 ]
fldmiad r4, { d8 - d11 }
-
+
fmacd d8 , d0 , d24
fstd d12, [CO2]
fmacd d9 , d0 , d25
@@ -425,7 +425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d8 , [CO1]
fldd d9 , [CO1, #8 ]
-
+
fmacd d8 , d0 , d16
fmacd d9 , d0 , d17
@@ -443,7 +443,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d8 , [r4 ]
fldd d9 , [r4 , #8 ]
-
+
fmacd d8 , d0 , d24
fmacd d9 , d0 , d25
@@ -582,7 +582,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d9 , [CO1, #8 ]
fldd d10, [CO1, #16 ]
fldd d11, [CO1, #24 ]
-
+
fmacd d8 , d0 , d16
fmacd d9 , d0 , d17
fmacd d10, d0 , d18
@@ -654,7 +654,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d8 , [CO1]
fldd d9 , [CO1, #8 ]
-
+
fmacd d8 , d0 , d16
fmacd d9 , d0 , d17
@@ -760,7 +760,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d9 , [CO1, #8 ]
fldd d10, [CO1, #16 ]
fldd d11, [CO1, #24 ]
-
+
fmacd d8 , d0 , d16
fmacd d9 , d0 , d17
fmacd d10, d0 , d18
@@ -811,7 +811,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d8 , [CO1]
fldd d9 , [CO1, #8 ]
-
+
fmacd d8 , d0 , d16
fmacd d9 , d0 , d17
@@ -895,7 +895,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble dgemm_kernel_L2_BEGIN
dgemm_kernel_L4_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #2 // LDC * 4
@@ -1000,7 +1000,7 @@ dgemm_kernel_L4_M4_46:
subs L, L, #1
bne dgemm_kernel_L4_M4_46
-
+
dgemm_kernel_L4_M4_100:
SAVE4x4
@@ -1042,10 +1042,10 @@ dgemm_kernel_L4_M2_22:
subs L, L, #1
bgt dgemm_kernel_L4_M2_22
-
+
dgemm_kernel_L4_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L4_M2_100
@@ -1055,7 +1055,7 @@ dgemm_kernel_L4_M2_42:
subs L, L, #1
bgt dgemm_kernel_L4_M2_42
-
+
dgemm_kernel_L4_M2_100:
SAVE2x4
@@ -1089,10 +1089,10 @@ dgemm_kernel_L4_M1_22:
subs L, L, #1
bgt dgemm_kernel_L4_M1_22
-
+
dgemm_kernel_L4_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L4_M1_100
@@ -1102,7 +1102,7 @@ dgemm_kernel_L4_M1_42:
subs L, L, #1
bgt dgemm_kernel_L4_M1_42
-
+
dgemm_kernel_L4_M1_100:
SAVE1x4
@@ -1115,7 +1115,7 @@ dgemm_kernel_L4_END:
lsl r4, r4, #5 // k * 4 * 8
add r3, r3, r4 // B = B + K * 4 * 8
mov BC, r3
-
+
subs J , #1 // j--
bgt dgemm_kernel_L4_BEGIN
@@ -1131,7 +1131,7 @@ dgemm_kernel_L2_BEGIN:
tst J , #2
ble dgemm_kernel_L1_BEGIN
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -1170,10 +1170,10 @@ dgemm_kernel_L2_M4_22:
subs L, L, #1
bgt dgemm_kernel_L2_M4_22
-
+
dgemm_kernel_L2_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M4_100
@@ -1183,7 +1183,7 @@ dgemm_kernel_L2_M4_42:
subs L, L, #1
bgt dgemm_kernel_L2_M4_42
-
+
dgemm_kernel_L2_M4_100:
SAVE4x2
@@ -1225,10 +1225,10 @@ dgemm_kernel_L2_M2_22:
subs L, L, #1
bgt dgemm_kernel_L2_M2_22
-
+
dgemm_kernel_L2_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M2_100
@@ -1238,7 +1238,7 @@ dgemm_kernel_L2_M2_42:
subs L, L, #1
bgt dgemm_kernel_L2_M2_42
-
+
dgemm_kernel_L2_M2_100:
SAVE2x2
@@ -1272,10 +1272,10 @@ dgemm_kernel_L2_M1_22:
subs L, L, #1
bgt dgemm_kernel_L2_M1_22
-
+
dgemm_kernel_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L2_M1_100
@@ -1285,7 +1285,7 @@ dgemm_kernel_L2_M1_42:
subs L, L, #1
bgt dgemm_kernel_L2_M1_42
-
+
dgemm_kernel_L2_M1_100:
SAVE1x2
@@ -1307,7 +1307,7 @@ dgemm_kernel_L1_BEGIN:
tst J , #1
ble dgemm_kernel_L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1345,10 +1345,10 @@ dgemm_kernel_L1_M4_22:
subs L, L, #1
bgt dgemm_kernel_L1_M4_22
-
+
dgemm_kernel_L1_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M4_100
@@ -1358,7 +1358,7 @@ dgemm_kernel_L1_M4_42:
subs L, L, #1
bgt dgemm_kernel_L1_M4_42
-
+
dgemm_kernel_L1_M4_100:
SAVE4x1
@@ -1400,10 +1400,10 @@ dgemm_kernel_L1_M2_22:
subs L, L, #1
bgt dgemm_kernel_L1_M2_22
-
+
dgemm_kernel_L1_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M2_100
@@ -1413,7 +1413,7 @@ dgemm_kernel_L1_M2_42:
subs L, L, #1
bgt dgemm_kernel_L1_M2_42
-
+
dgemm_kernel_L1_M2_100:
SAVE2x1
@@ -1447,10 +1447,10 @@ dgemm_kernel_L1_M1_22:
subs L, L, #1
bgt dgemm_kernel_L1_M1_22
-
+
dgemm_kernel_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble dgemm_kernel_L1_M1_100
@@ -1460,7 +1460,7 @@ dgemm_kernel_L1_M1_42:
subs L, L, #1
bgt dgemm_kernel_L1_M1_42
-
+
dgemm_kernel_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S
index 763c032..6266c61 100644
--- a/kernel/arm/dgemm_ncopy_2_vfp.S
+++ b/kernel/arm/dgemm_ncopy_2_vfp.S
@@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl LDA, OLD_LDA, #3 // lda = lda * 8
- ldr BO, B
+ ldr BO, B
/*********************************************************************************************/
@@ -152,8 +152,8 @@ dgemm_ncopy_L2_M2_20:
subs I , I , #1
bne dgemm_ncopy_L2_M2_20
-
-
+
+
dgemm_ncopy_L2_M2_40:
ands I, M , #1
@@ -165,7 +165,7 @@ dgemm_ncopy_L2_M2_60:
subs I , I , #1
bne dgemm_ncopy_L2_M2_60
-
+
dgemm_ncopy_L2_M2_END:
@@ -194,8 +194,8 @@ dgemm_ncopy_L1_M2_20:
subs I , I , #1
bne dgemm_ncopy_L1_M2_20
-
-
+
+
dgemm_ncopy_L1_M2_40:
ands I, M , #1
@@ -207,7 +207,7 @@ dgemm_ncopy_L1_M2_60:
subs I , I , #1
bne dgemm_ncopy_L1_M2_60
-
+
dgemm_ncopy_L1_M2_END:
diff --git a/kernel/arm/dgemm_ncopy_4_vfp.S b/kernel/arm/dgemm_ncopy_4_vfp.S
index ad6692e..ffc19a9 100644
--- a/kernel/arm/dgemm_ncopy_4_vfp.S
+++ b/kernel/arm/dgemm_ncopy_4_vfp.S
@@ -202,7 +202,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
- ldr BO, B
+ ldr BO, B
dgemm_ncopy_L4_BEGIN:
@@ -227,8 +227,8 @@ dgemm_ncopy_L4_M4_20:
subs I , I , #1
bne dgemm_ncopy_L4_M4_20
-
-
+
+
dgemm_ncopy_L4_M4_40:
ands I, M , #3
@@ -240,7 +240,7 @@ dgemm_ncopy_L4_M4_60:
subs I , I , #1
bne dgemm_ncopy_L4_M4_60
-
+
dgemm_ncopy_L4_M4_END:
@@ -275,8 +275,8 @@ dgemm_ncopy_L2_M4_20:
subs I , I , #1
bne dgemm_ncopy_L2_M4_20
-
-
+
+
dgemm_ncopy_L2_M4_40:
ands I, M , #3
@@ -288,7 +288,7 @@ dgemm_ncopy_L2_M4_60:
subs I , I , #1
bne dgemm_ncopy_L2_M4_60
-
+
dgemm_ncopy_L2_M4_END:
@@ -316,8 +316,8 @@ dgemm_ncopy_L1_M4_20:
subs I , I , #1
bne dgemm_ncopy_L1_M4_20
-
-
+
+
dgemm_ncopy_L1_M4_40:
ands I, M , #3
@@ -329,7 +329,7 @@ dgemm_ncopy_L1_M4_60:
subs I , I , #1
bne dgemm_ncopy_L1_M4_60
-
+
dgemm_ncopy_L1_M4_END:
diff --git a/kernel/arm/dgemm_tcopy_4_vfp.S b/kernel/arm/dgemm_tcopy_4_vfp.S
index 88a139a..937f439 100644
--- a/kernel/arm/dgemm_tcopy_4_vfp.S
+++ b/kernel/arm/dgemm_tcopy_4_vfp.S
@@ -271,15 +271,15 @@ dgemm_tcopy_L4_M4_20:
subs I , I , #1
bne dgemm_tcopy_L4_M4_20
-
-
+
+
dgemm_tcopy_L4_M4_40:
tst N , #2
ble dgemm_tcopy_L4_M4_60
COPY2x4
-
+
dgemm_tcopy_L4_M4_60:
@@ -287,7 +287,7 @@ dgemm_tcopy_L4_M4_60:
ble dgemm_tcopy_L4_M4_END
COPY1x4
-
+
dgemm_tcopy_L4_M4_END:
@@ -326,8 +326,8 @@ dgemm_tcopy_L2_M4_20:
subs I , I , #1
bne dgemm_tcopy_L2_M4_20
-
-
+
+
dgemm_tcopy_L2_M4_40:
tst N , #2
@@ -373,8 +373,8 @@ dgemm_tcopy_L1_M4_20:
subs I , I , #1
bne dgemm_tcopy_L1_M4_20
-
-
+
+
dgemm_tcopy_L1_M4_40:
tst N , #2
diff --git a/kernel/arm/dot.c b/kernel/arm/dot.c
index 30490e2..46a84ad 100644
--- a/kernel/arm/dot.c
+++ b/kernel/arm/dot.c
@@ -60,5 +60,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
return(dot);
}
-
+
diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S
index 762b9c5..3528e08 100644
--- a/kernel/arm/dtrmm_kernel_4x2_vfp.S
+++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S
@@ -198,7 +198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d0, ALPHA
-
+
fmuld d4 , d0 , d8
fmuld d5 , d0 , d9
@@ -248,7 +248,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d0, ALPHA
-
+
fmuld d4 , d0 , d8
fstd d4 , [CO1]
@@ -508,10 +508,10 @@ _L2_M4_22:
subs L, L, #1
bgt _L2_M4_22
-
+
_L2_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M4_100
@@ -521,7 +521,7 @@ _L2_M4_42:
subs L, L, #1
bgt _L2_M4_42
-
+
_L2_M4_100:
SAVE4x2
@@ -613,10 +613,10 @@ _L2_M2_22:
subs L, L, #1
bgt _L2_M2_22
-
+
_L2_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M2_100
@@ -626,7 +626,7 @@ _L2_M2_42:
subs L, L, #1
bgt _L2_M2_42
-
+
_L2_M2_100:
SAVE2x2
@@ -710,10 +710,10 @@ _L2_M1_22:
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
@@ -723,7 +723,7 @@ _L2_M1_42:
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
@@ -774,7 +774,7 @@ _L1_BEGIN:
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -851,10 +851,10 @@ _L1_M4_22:
subs L, L, #1
bgt _L1_M4_22
-
+
_L1_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M4_100
@@ -864,7 +864,7 @@ _L1_M4_42:
subs L, L, #1
bgt _L1_M4_42
-
+
_L1_M4_100:
SAVE4x1
@@ -956,10 +956,10 @@ _L1_M2_22:
subs L, L, #1
bgt _L1_M2_22
-
+
_L1_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M2_100
@@ -969,7 +969,7 @@ _L1_M2_42:
subs L, L, #1
bgt _L1_M2_42
-
+
_L1_M2_100:
SAVE2x1
@@ -1053,10 +1053,10 @@ _L1_M1_22:
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
@@ -1066,7 +1066,7 @@ _L1_M1_42:
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
index 0f8a929..04cc451 100644
--- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
@@ -340,7 +340,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d0, ALPHA
add r4 , CO2, r3
-
+
fmuld d8 , d0 , d16
fmuld d9 , d0 , d17
fmuld d10, d0 , d18
@@ -355,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmuld d15, d0 , d23
fstd d11, [CO1, #24 ]
-
+
fmuld d8 , d0 , d24
fstd d12, [CO2]
fmuld d9 , d0 , d25
@@ -432,7 +432,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add r4 , CO2, r3
fldd d0, ALPHA
-
+
fmuld d8 , d0 , d16
fmuld d9 , d0 , d17
@@ -444,7 +444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstd d12, [CO2]
fstd d13, [CO2, #8 ]
-
+
fmuld d8 , d0 , d24
fmuld d9 , d0 , d25
@@ -571,7 +571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add CO2 , CO1, r3
fldd d0, ALPHA
-
+
fmuld d8 , d0 , d16
fmuld d9 , d0 , d17
fmuld d10, d0 , d18
@@ -731,7 +731,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d0, ALPHA
-
+
fmuld d8 , d0 , d16
fmuld d9 , d0 , d17
fmuld d10, d0 , d18
@@ -779,7 +779,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldd d0, ALPHA
-
+
fmuld d8 , d0 , d16
fmuld d9 , d0 , d17
@@ -870,7 +870,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble _L2_BEGIN
_L4_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #2 // LDC * 4
@@ -1026,14 +1026,14 @@ _L4_M4_22:
ble _L4_M4_41
b _L4_M4_22
-
+
_L4_M4_40:
INIT4x4
_L4_M4_41:
-
+
ands L , K1, #31 // L = L % 8
ble _L4_M4_100
@@ -1043,7 +1043,7 @@ _L4_M4_42:
subs L, L, #1
bgt _L4_M4_42
-
+
_L4_M4_100:
SAVE4x4
@@ -1135,10 +1135,10 @@ _L4_M2_22:
subs L, L, #1
bgt _L4_M2_22
-
+
_L4_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L4_M2_100
@@ -1148,7 +1148,7 @@ _L4_M2_42:
subs L, L, #1
bgt _L4_M2_42
-
+
_L4_M2_100:
SAVE2x4
@@ -1231,10 +1231,10 @@ _L4_M1_22:
subs L, L, #1
bgt _L4_M1_22
-
+
_L4_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L4_M1_100
@@ -1244,7 +1244,7 @@ _L4_M1_42:
subs L, L, #1
bgt _L4_M1_42
-
+
_L4_M1_100:
SAVE1x4
@@ -1297,7 +1297,7 @@ _L2_BEGIN:
tst J , #2
ble _L1_BEGIN
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -1375,10 +1375,10 @@ _L2_M4_22:
subs L, L, #1
bgt _L2_M4_22
-
+
_L2_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M4_100
@@ -1388,7 +1388,7 @@ _L2_M4_42:
subs L, L, #1
bgt _L2_M4_42
-
+
_L2_M4_100:
SAVE4x2
@@ -1480,10 +1480,10 @@ _L2_M2_22:
subs L, L, #1
bgt _L2_M2_22
-
+
_L2_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M2_100
@@ -1493,7 +1493,7 @@ _L2_M2_42:
subs L, L, #1
bgt _L2_M2_42
-
+
_L2_M2_100:
SAVE2x2
@@ -1577,10 +1577,10 @@ _L2_M1_22:
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
@@ -1590,7 +1590,7 @@ _L2_M1_42:
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
@@ -1638,7 +1638,7 @@ _L1_BEGIN:
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1715,10 +1715,10 @@ _L1_M4_22:
subs L, L, #1
bgt _L1_M4_22
-
+
_L1_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M4_100
@@ -1728,7 +1728,7 @@ _L1_M4_42:
subs L, L, #1
bgt _L1_M4_42
-
+
_L1_M4_100:
SAVE4x1
@@ -1820,10 +1820,10 @@ _L1_M2_22:
subs L, L, #1
bgt _L1_M2_22
-
+
_L1_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M2_100
@@ -1833,7 +1833,7 @@ _L1_M2_42:
subs L, L, #1
bgt _L1_M2_42
-
+
_L1_M2_100:
SAVE2x1
@@ -1917,10 +1917,10 @@ _L1_M1_22:
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
@@ -1930,7 +1930,7 @@ _L1_M1_42:
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/gemv_n.c b/kernel/arm/gemv_n.c
index aedcca9..ef61b24 100644
--- a/kernel/arm/gemv_n.c
+++ b/kernel/arm/gemv_n.c
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr += lda;
ix += inc_x;
}
-
+ return(0);
}
-
+
diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S
index f1cf9a0..505033c 100644
--- a/kernel/arm/gemv_n_vfp.S
+++ b/kernel/arm/gemv_n_vfp.S
@@ -594,7 +594,7 @@ gemvn_kernel_F1X1:
ldr AO1, A
add r3, AO1, #SIZE
str r3, A
-
+
ldr XO , X
INIT_F1
@@ -694,7 +694,7 @@ gemvn_kernel_S1X1:
ldr AO1, A
add r3, AO1, #SIZE
str r3, A
-
+
ldr XO , X
INIT_S1
diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S
index e031c33..0e9ba0c 100644
--- a/kernel/arm/gemv_n_vfpv3.S
+++ b/kernel/arm/gemv_n_vfpv3.S
@@ -635,7 +635,7 @@ gemvn_kernel_F1X1:
ldr AO1, A
add r3, AO1, #SIZE
str r3, A
-
+
ldr XO , X
INIT_F1
@@ -735,7 +735,7 @@ gemvn_kernel_S1X1:
ldr AO1, A
add r3, AO1, #SIZE
str r3, A
-
+
ldr XO , X
INIT_S1
diff --git a/kernel/arm/gemv_t.c b/kernel/arm/gemv_t.c
index 8fd6a66..169047b 100644
--- a/kernel/arm/gemv_t.c
+++ b/kernel/arm/gemv_t.c
@@ -61,7 +61,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
iy += inc_y;
a_ptr += lda;
}
+ return(0);
}
-
+
diff --git a/kernel/arm/iamax.c b/kernel/arm/iamax.c
index 3b7fe1c..d211776 100644
--- a/kernel/arm/iamax.c
+++ b/kernel/arm/iamax.c
@@ -61,7 +61,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( ABS(x[ix]) > ABS(maxf) )
+ if( ABS(x[ix]) > ABS(maxf) )
{
max = i;
maxf = ABS(x[ix]);
@@ -71,5 +71,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(max+1);
}
-
+
diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S
index 1d73448..f50c28e 100644
--- a/kernel/arm/iamax_vfp.S
+++ b/kernel/arm/iamax_vfp.S
@@ -354,7 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, #0
beq iamax_kernel_L999
-
+
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c
index fdb5d7a..7efce19 100644
--- a/kernel/arm/iamin.c
+++ b/kernel/arm/iamin.c
@@ -61,7 +61,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( ABS(x[ix]) < ABS(minf) )
+ if( ABS(x[ix]) < ABS(minf) )
{
min = i;
minf = ABS(x[ix]);
@@ -71,5 +71,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(min+1);
}
-
+
diff --git a/kernel/arm/imax.c b/kernel/arm/imax.c
index e3e4b9a..28022f6 100644
--- a/kernel/arm/imax.c
+++ b/kernel/arm/imax.c
@@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( x[ix] > maxf )
+ if( x[ix] > maxf )
{
max = i;
maxf = x[ix];
@@ -63,5 +63,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(max+1);
}
-
+
diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c
index fbcadc2..fe8aa96 100644
--- a/kernel/arm/imin.c
+++ b/kernel/arm/imin.c
@@ -28,8 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**************************************************************************************
* 2013/08/19 Saar
-* BLASTEST float
-* BLASTEST double
+* BLASTEST float
+* BLASTEST double
*
**************************************************************************************/
@@ -51,7 +51,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( x[ix] > minf )
+ if( x[ix] > minf )
{
min = i;
minf = x[ix];
@@ -61,5 +61,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(min+1);
}
-
+
diff --git a/kernel/arm/izamax.c b/kernel/arm/izamax.c
index a6ba863..54bb351 100644
--- a/kernel/arm/izamax.c
+++ b/kernel/arm/izamax.c
@@ -66,7 +66,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( CABS1(x,ix) > CABS1(maxf,0) )
+ if( CABS1(x,ix) > CABS1(maxf,0) )
{
max = i;
maxf[0] = ABS(x[ix]);
@@ -77,5 +77,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(max+1);
}
-
+
diff --git a/kernel/arm/izamin.c b/kernel/arm/izamin.c
index 45c2a7c..448b3cb 100644
--- a/kernel/arm/izamin.c
+++ b/kernel/arm/izamin.c
@@ -66,7 +66,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( CABS1(x,ix) < CABS1(minf,0) )
+ if( CABS1(x,ix) < CABS1(minf,0) )
{
min = i;
minf[0] = ABS(x[ix]);
@@ -77,5 +77,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(min+1);
}
-
+
diff --git a/kernel/arm/max.c b/kernel/arm/max.c
index 3239e34..04529db 100644
--- a/kernel/arm/max.c
+++ b/kernel/arm/max.c
@@ -50,7 +50,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( x[ix] > maxf )
+ if( x[ix] > maxf )
{
maxf = x[ix];
}
@@ -59,5 +59,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(maxf);
}
-
+
diff --git a/kernel/arm/min.c b/kernel/arm/min.c
index de4c471..63c704c 100644
--- a/kernel/arm/min.c
+++ b/kernel/arm/min.c
@@ -50,7 +50,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( x[ix] < minf )
+ if( x[ix] < minf )
{
minf = x[ix];
}
@@ -59,5 +59,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(minf);
}
-
+
diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c
index d65c5a4..b4d810d 100644
--- a/kernel/arm/nrm2.c
+++ b/kernel/arm/nrm2.c
@@ -63,7 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
n *= inc_x;
while(i < n)
{
-
+
if ( x[i] != 0.0 )
{
absxi = ABS( x[i] );
@@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
else
{
ssq += ( absxi/scale ) * ( absxi/scale );
- }
+ }
}
i += inc_x;
@@ -84,5 +84,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return(scale);
}
-
+
diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S
index 4c62917..d80179a 100644
--- a/kernel/arm/nrm2_vfp.S
+++ b/kernel/arm/nrm2_vfp.S
@@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X!, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_NEXT_\@
- vabs.f64 d4, d4
+ beq KERNEL_F1_NEXT_\@
+ vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
@@ -98,8 +98,8 @@ KERNEL_F1_NEXT_\@:
fldmiad X, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_NEXT
- vabs.f64 d4, d4
+ beq KERNEL_S1_NEXT
+ vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
@@ -124,8 +124,8 @@ KERNEL_S1_NEXT:
fldmias X!, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_NEXT_\@
- vabs.f32 s4, s4
+ beq KERNEL_F1_NEXT_\@
+ vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
@@ -161,8 +161,8 @@ KERNEL_F1_NEXT_\@:
fldmias X, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_NEXT
- vabs.f32 s4, s4
+ beq KERNEL_S1_NEXT
+ vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
@@ -195,8 +195,8 @@ KERNEL_S1_NEXT:
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_NEXT_\@
- vabs.f64 d4, d4
+ beq KERNEL_F1_NEXT_\@
+ vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
@@ -212,8 +212,8 @@ KERNEL_F1_NEXT_\@:
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_END_\@
- vabs.f64 d5, d5
+ beq KERNEL_F1_END_\@
+ vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
@@ -253,8 +253,8 @@ KERNEL_F1_END_\@:
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_NEXT_\@
- vabs.f64 d4, d4
+ beq KERNEL_S1_NEXT_\@
+ vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
@@ -270,8 +270,8 @@ KERNEL_S1_NEXT_\@:
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_END_\@
- vabs.f64 d5, d5
+ beq KERNEL_S1_END_\@
+ vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
@@ -298,8 +298,8 @@ KERNEL_S1_END_\@:
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_NEXT_\@
- vabs.f32 s4, s4
+ beq KERNEL_F1_NEXT_\@
+ vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
@@ -315,8 +315,8 @@ KERNEL_F1_NEXT_\@:
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_END_\@
- vabs.f32 s5, s5
+ beq KERNEL_F1_END_\@
+ vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
@@ -354,8 +354,8 @@ KERNEL_F1_END_\@:
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_NEXT_\@
- vabs.f32 s4, s4
+ beq KERNEL_S1_NEXT_\@
+ vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
@@ -371,8 +371,8 @@ KERNEL_S1_NEXT_\@:
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_END_\@
- vabs.f32 s5, s5
+ beq KERNEL_S1_END_\@
+ vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
@@ -448,13 +448,13 @@ nrm2_begin:
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0 // scale=0.0
vldr.64 d1 , znrm2_one // ssq=1.0
- vmov.f64 d7 , d1 // value 1.0
- vmov.f64 d6 , d0 // value 0.0
+ vmov.f64 d7 , d1 // value 1.0
+ vmov.f64 d6 , d0 // value 0.0
#else
vsub.f32 s0 , s0 , s0 // scale=0.0
vldr.32 s1 , cnrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
- vmov.f32 s6 , s0 // value 0.0
+ vmov.f32 s6 , s0 // value 0.0
#endif
#else
@@ -462,13 +462,13 @@ nrm2_begin:
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0 // scale=0.0
vldr.64 d1 , dnrm2_one // ssq=1.0
- vmov.f64 d7 , d1 // value 1.0
- vmov.f64 d6 , d0 // value 0.0
+ vmov.f64 d7 , d1 // value 1.0
+ vmov.f64 d6 , d0 // value 0.0
#else
vsub.f32 s0 , s0 , s0 // scale=0.0
vldr.32 s1 , snrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
- vmov.f32 s6 , s0 // value 0.0
+ vmov.f32 s6 , s0 // value 0.0
#endif
@@ -481,7 +481,7 @@ nrm2_begin:
cmp INC_X, #0
beq nrm2_kernel_L999
-
+
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S
index b56f8b0..34b251e 100644
--- a/kernel/arm/nrm2_vfpv3.S
+++ b/kernel/arm/nrm2_vfpv3.S
@@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X!, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_NEXT_\@
- vabs.f64 d4, d4
+ beq KERNEL_F1_NEXT_\@
+ vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
@@ -98,8 +98,8 @@ KERNEL_F1_NEXT_\@:
fldmiad X, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_NEXT
- vabs.f64 d4, d4
+ beq KERNEL_S1_NEXT
+ vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
@@ -124,8 +124,8 @@ KERNEL_S1_NEXT:
fldmias X!, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_NEXT_\@
- vabs.f32 s4, s4
+ beq KERNEL_F1_NEXT_\@
+ vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
@@ -161,8 +161,8 @@ KERNEL_F1_NEXT_\@:
fldmias X, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_NEXT
- vabs.f32 s4, s4
+ beq KERNEL_S1_NEXT
+ vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
@@ -195,8 +195,8 @@ KERNEL_S1_NEXT:
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_NEXT_\@
- vabs.f64 d4, d4
+ beq KERNEL_F1_NEXT_\@
+ vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
@@ -212,8 +212,8 @@ KERNEL_F1_NEXT_\@:
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_END_\@
- vabs.f64 d5, d5
+ beq KERNEL_F1_END_\@
+ vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
@@ -253,8 +253,8 @@ KERNEL_F1_END_\@:
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_NEXT_\@
- vabs.f64 d4, d4
+ beq KERNEL_S1_NEXT_\@
+ vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
@@ -270,8 +270,8 @@ KERNEL_S1_NEXT_\@:
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_END_\@
- vabs.f64 d5, d5
+ beq KERNEL_S1_END_\@
+ vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
@@ -298,8 +298,8 @@ KERNEL_S1_END_\@:
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_NEXT_\@
- vabs.f32 s4, s4
+ beq KERNEL_F1_NEXT_\@
+ vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
@@ -315,8 +315,8 @@ KERNEL_F1_NEXT_\@:
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_F1_END_\@
- vabs.f32 s5, s5
+ beq KERNEL_F1_END_\@
+ vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
@@ -354,8 +354,8 @@ KERNEL_F1_END_\@:
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_NEXT_\@
- vabs.f32 s4, s4
+ beq KERNEL_S1_NEXT_\@
+ vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
@@ -371,8 +371,8 @@ KERNEL_S1_NEXT_\@:
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
- beq KERNEL_S1_END_\@
- vabs.f32 s5, s5
+ beq KERNEL_S1_END_\@
+ vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
@@ -407,13 +407,13 @@ KERNEL_S1_END_\@:
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0 // scale=0.0
vmov.f64 d1 , #1.0 // ssq=1.0
- vmov.f64 d7 , d1 // value 1.0
- vmov.f64 d6 , d0 // value 0.0
+ vmov.f64 d7 , d1 // value 1.0
+ vmov.f64 d6 , d0 // value 0.0
#else
vsub.f32 s0 , s0 , s0 // scale=0.0
vmov.f32 s1 , #1.0 // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
- vmov.f32 s6 , s0 // value 0.0
+ vmov.f32 s6 , s0 // value 0.0
#endif
@@ -424,7 +424,7 @@ KERNEL_S1_END_\@:
cmp INC_X, #0
beq nrm2_kernel_L999
-
+
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
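
Note on the nrm2 hunks above: the vcmpe/vabs/vdivge sequences in nrm2_vfp.S and nrm2_vfpv3.S (with their "scale=0.0", "ssq=1.0", "scale >= x ? x / scale" comments) appear to implement the same overflow-safe scale/ssq recurrence as the generic C kernel in kernel/arm/nrm2.c shown earlier in this patch. One update step of that recurrence, written out as a reference sketch in plain C (not part of the patch; the helper name is hypothetical):

    /* Running state: scale = largest |x| seen so far, ssq chosen so that
       the accumulated norm equals scale * sqrt(ssq).
       Both start as scale = 0.0, ssq = 1.0, matching the kernel's init. */
    static void nrm2_update(double *scale, double *ssq, double xi)
    {
        if (xi == 0.0) return;                       /* kernel skips zeros  */
        double absxi = (xi < 0.0) ? -xi : xi;        /* vabs                */
        if (*scale < absxi) {                        /* new maximum: rescale */
            double r = *scale / absxi;
            *ssq   = 1.0 + *ssq * r * r;
            *scale = absxi;
        } else {                                     /* accumulate at scale  */
            double r = absxi / *scale;
            *ssq += r * r;
        }
    }
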
diff --git a/kernel/arm/zswap.c b/kernel/arm/omatcopy_cn.c
similarity index 67%
copy from kernel/arm/zswap.c
copy to kernel/arm/omatcopy_cn.c
index 4e3e73d..4d11b91 100644
--- a/kernel/arm/zswap.c
+++ b/kernel/arm/omatcopy_cn.c
@@ -25,46 +25,66 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
#include "common.h"
-#include <stdio.h>
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * No Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix=0,iy=0;
- FLOAT temp[2];
+ BLASLONG i,j;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- BLASLONG inc_x2 = 2 * inc_x;
- BLASLONG inc_y2 = 2 * inc_y;
+ aptr = a;
+ bptr = b;
- while(i < n)
+ if ( alpha == 0.0 )
{
+ for ( i=0; i<cols ; i++ )
+ {
+ for(j=0; j<rows; j++)
+ {
+ bptr[j] = 0.0;
+ }
+ bptr += ldb;
+ }
+ return(0);
+ }
- temp[0] = x[ix] ;
- temp[1] = x[ix+1] ;
- x[ix] = y[iy] ;
- x[ix+1] = y[iy+1] ;
- y[iy] = temp[0] ;
- y[iy+1] = temp[1] ;
-
- ix += inc_x2 ;
- iy += inc_y2 ;
- i++ ;
+ if ( alpha == 1.0 )
+ {
+ for ( i=0; i<cols ; i++ )
+ {
+ for(j=0; j<rows; j++)
+ {
+ bptr[j] = aptr[j];
+ }
+ aptr += lda;
+ bptr += ldb;
+ }
+ return(0);
+ }
+ for ( i=0; i<cols ; i++ )
+ {
+ for(j=0; j<rows; j++)
+ {
+ bptr[j] = alpha * aptr[j];
+ }
+ aptr += lda;
+ bptr += ldb;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/zswap.c b/kernel/arm/omatcopy_ct.c
similarity index 67%
copy from kernel/arm/zswap.c
copy to kernel/arm/omatcopy_ct.c
index 4e3e73d..b258781 100644
--- a/kernel/arm/zswap.c
+++ b/kernel/arm/omatcopy_ct.c
@@ -25,46 +25,65 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
#include "common.h"
-#include <stdio.h>
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix=0,iy=0;
- FLOAT temp[2];
+ BLASLONG i,j;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- BLASLONG inc_x2 = 2 * inc_x;
- BLASLONG inc_y2 = 2 * inc_y;
+ aptr = a;
- while(i < n)
+ if ( alpha == 0.0 )
{
+ for ( i=0; i<cols ; i++ )
+ {
+ bptr = &b[i];
+ for(j=0; j<rows; j++)
+ {
+ bptr[j*ldb] = 0.0;
+ }
+ }
+ return(0);
+ }
- temp[0] = x[ix] ;
- temp[1] = x[ix+1] ;
- x[ix] = y[iy] ;
- x[ix+1] = y[iy+1] ;
- y[iy] = temp[0] ;
- y[iy+1] = temp[1] ;
-
- ix += inc_x2 ;
- iy += inc_y2 ;
- i++ ;
+ if ( alpha == 1.0 )
+ {
+ for ( i=0; i<cols ; i++ )
+ {
+ bptr = &b[i];
+ for(j=0; j<rows; j++)
+ {
+ bptr[j*ldb] = aptr[j];
+ }
+ aptr += lda;
+ }
+ return(0);
+ }
+ for ( i=0; i<cols ; i++ )
+ {
+ bptr = &b[i];
+ for(j=0; j<rows; j++)
+ {
+ bptr[j*ldb] = alpha * aptr[j];
+ }
+ aptr += lda;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/zswap.c b/kernel/arm/omatcopy_rn.c
similarity index 67%
copy from kernel/arm/zswap.c
copy to kernel/arm/omatcopy_rn.c
index 4e3e73d..57515e7 100644
--- a/kernel/arm/zswap.c
+++ b/kernel/arm/omatcopy_rn.c
@@ -25,46 +25,66 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
#include "common.h"
-#include <stdio.h>
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * No Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix=0,iy=0;
- FLOAT temp[2];
+ BLASLONG i,j;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- BLASLONG inc_x2 = 2 * inc_x;
- BLASLONG inc_y2 = 2 * inc_y;
+ aptr = a;
+ bptr = b;
- while(i < n)
+ if ( alpha == 0.0 )
{
+ for ( i=0; i<rows ; i++ )
+ {
+ for(j=0; j<cols; j++)
+ {
+ bptr[j] = 0.0;
+ }
+ bptr += ldb;
+ }
+ return(0);
+ }
- temp[0] = x[ix] ;
- temp[1] = x[ix+1] ;
- x[ix] = y[iy] ;
- x[ix+1] = y[iy+1] ;
- y[iy] = temp[0] ;
- y[iy+1] = temp[1] ;
-
- ix += inc_x2 ;
- iy += inc_y2 ;
- i++ ;
+ if ( alpha == 1.0 )
+ {
+ for ( i=0; i<rows ; i++ )
+ {
+ for(j=0; j<cols; j++)
+ {
+ bptr[j] = aptr[j];
+ }
+ aptr += lda;
+ bptr += ldb;
+ }
+ return(0);
+ }
+ for ( i=0; i<rows ; i++ )
+ {
+ for(j=0; j<cols; j++)
+ {
+ bptr[j] = alpha * aptr[j];
+ }
+ aptr += lda;
+ bptr += ldb;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/copy.c b/kernel/arm/omatcopy_rt.c
similarity index 77%
copy from kernel/arm/copy.c
copy to kernel/arm/omatcopy_rt.c
index f742a4a..9d58350 100644
--- a/kernel/arm/copy.c
+++ b/kernel/arm/omatcopy_rt.c
@@ -25,35 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
#include "common.h"
-int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-{
- BLASLONG i=0;
- BLASLONG ix=0,iy=0;
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * Trans
+ *
+******************************************************/
- if ( n < 0 ) return(0);
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+ BLASLONG i,j;
+ FLOAT *aptr,*bptr;
- while(i < n)
- {
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- y[iy] = x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
+ aptr = a;
+ for ( i=0; i<rows ; i++ )
+ {
+ bptr = &b[i];
+ for(j=0; j<cols; j++)
+ {
+ bptr[j*ldb] = alpha * aptr[j];
+ }
+ aptr += lda;
}
+
return(0);
}
-
+
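
The four new omatcopy_* kernels added above (omatcopy_cn.c, omatcopy_ct.c, omatcopy_rn.c, omatcopy_rt.c) all perform the same scaled copy B := alpha * op(A); they differ only in which index is stepped by the leading dimension. A minimal standalone sketch of the two column-major variants in plain double precision (a reference illustration outside the OpenBLAS build, with hypothetical _ref names) is:

    #include <stddef.h>

    /* ColMajor, NoTrans (omatcopy_cn): b[i*ldb + j] = alpha * a[i*lda + j] */
    static void omatcopy_cn_ref(size_t rows, size_t cols, double alpha,
                                const double *a, size_t lda,
                                double *b, size_t ldb)
    {
        for (size_t i = 0; i < cols; i++)        /* walk columns             */
            for (size_t j = 0; j < rows; j++)    /* walk rows within column  */
                b[i * ldb + j] = alpha * a[i * lda + j];
    }

    /* ColMajor, Trans (omatcopy_ct): b[j*ldb + i] = alpha * a[i*lda + j] */
    static void omatcopy_ct_ref(size_t rows, size_t cols, double alpha,
                                const double *a, size_t lda,
                                double *b, size_t ldb)
    {
        for (size_t i = 0; i < cols; i++)
            for (size_t j = 0; j < rows; j++)
                b[j * ldb + i] = alpha * a[i * lda + j];
    }

The row-major variants are the same loops with the rows/cols roles of the outer and inner index swapped; the actual kernels in the patch additionally special-case alpha == 0.0 and alpha == 1.0, as shown in the diff.
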
diff --git a/kernel/arm/rot.c b/kernel/arm/rot.c
index aa60b44..18b4ca2 100644
--- a/kernel/arm/rot.c
+++ b/kernel/arm/rot.c
@@ -58,5 +58,5 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return(0);
}
-
+
diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S
index 663ecdf..d053423 100644
--- a/kernel/arm/rot_vfp.S
+++ b/kernel/arm/rot_vfp.S
@@ -235,7 +235,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
-
+
fldmiad X, { d4 - d5 }
fldmiad Y, { d6 - d7 }
vmul.f64 d2 , d0, d4
@@ -250,7 +250,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
-
+
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
@@ -268,7 +268,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
-
+
fldmiad X, { d4 - d5 }
fldmiad Y, { d6 - d7 }
vmul.f64 d2 , d0, d4
@@ -283,7 +283,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
-
+
.endm
@@ -303,7 +303,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacd d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
-
+
.endm
@@ -352,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
-
+
fldmias X, { s4 - s5 }
fldmias Y, { s6 - s7 }
vmul.f32 s2 , s0, s4
@@ -367,7 +367,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
-
+
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
@@ -385,7 +385,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
-
+
fldmias X, { s4 - s5 }
fldmias Y, { s6 - s7 }
vmul.f32 s2 , s0, s4
@@ -400,7 +400,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
-
+
.endm
@@ -420,7 +420,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fnmacs s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
-
+
.endm
diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c
index ff78e67..4593e22 100644
--- a/kernel/arm/scal.c
+++ b/kernel/arm/scal.c
@@ -55,5 +55,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return;
}
-
+
diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S
index a04b724..a8939c3 100644
--- a/kernel/arm/scal_vfp.S
+++ b/kernel/arm/scal_vfp.S
@@ -65,11 +65,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ X, #X_PRE ]
fldmiad X, { d4 - d7 }
- vmul.f64 d4, d4, d0
- vmul.f64 d5, d5, d0
- vmul.f64 d6, d6, d0
+ vmul.f64 d4, d4, d0
+ vmul.f64 d5, d5, d0
+ vmul.f64 d6, d6, d0
fstmiad X!, { d4 - d5 }
- vmul.f64 d7, d7, d0
+ vmul.f64 d7, d7, d0
fstmiad X!, { d6 - d7 }
.endm
@@ -78,7 +78,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1
fldmiad X, { d4 }
- vmul.f64 d4, d4, d0
+ vmul.f64 d4, d4, d0
fstmiad X!, { d4 }
.endm
@@ -86,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1
fldmiad X, { d4 }
- vmul.f64 d4, d4, d0
+ vmul.f64 d4, d4, d0
fstmiad X, { d4 }
add X, X, INC_X
@@ -97,11 +97,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4
fldmias X, { s4 - s7 }
- vmul.f32 s4, s4, s0
- vmul.f32 s5, s5, s0
- vmul.f32 s6, s6, s0
+ vmul.f32 s4, s4, s0
+ vmul.f32 s5, s5, s0
+ vmul.f32 s6, s6, s0
fstmias X!, { s4 - s5 }
- vmul.f32 s7, s7, s0
+ vmul.f32 s7, s7, s0
fstmias X!, { s6 - s7 }
.endm
@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1
fldmias X, { s4 }
- vmul.f32 s4, s4, s0
+ vmul.f32 s4, s4, s0
fstmias X!, { s4 }
.endm
@@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1
fldmias X, { s4 }
- vmul.f32 s4, s4, s0
+ vmul.f32 s4, s4, s0
fstmias X, { s4 }
add X, X, INC_X
@@ -137,14 +137,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ X, #X_PRE ]
fldmiad X, { d4 - d5 }
- vmul.f64 d2, d0, d4
+ vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 }
- vmul.f64 d2, d0, d4
+ vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
@@ -153,14 +153,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ X, #X_PRE ]
fldmiad X, { d4 - d5 }
- vmul.f64 d2, d0, d4
+ vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 }
- vmul.f64 d2, d0, d4
+ vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
@@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1
fldmiad X, { d4 - d5 }
- vmul.f64 d2, d0, d4
+ vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
@@ -183,7 +183,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1
fldmiad X, { d4 - d5 }
- vmul.f64 d2, d0, d4
+ vmul.f64 d2, d0, d4
fnmacd d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
@@ -200,28 +200,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ X, #X_PRE ]
fldmias X, { s4 - s5 }
- vmul.f32 s2, s0, s4
+ vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
- vmul.f32 s2, s0, s4
+ vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
- vmul.f32 s2, s0, s4
+ vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
- vmul.f32 s2, s0, s4
+ vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
@@ -233,7 +233,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F1
fldmias X, { s4 - s5 }
- vmul.f32 s2, s0, s4
+ vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
@@ -244,7 +244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_S1
fldmias X, { s4 - s5 }
- vmul.f32 s2, s0, s4
+ vmul.f32 s2, s0, s4
fnmacs s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
diff --git a/kernel/arm/scopy_vfp.S b/kernel/arm/scopy_vfp.S
index e6ceaf2..0fd815d 100644
--- a/kernel/arm/scopy_vfp.S
+++ b/kernel/arm/scopy_vfp.S
@@ -136,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+
cmp N, #0
ble scopy_kernel_L999
diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S
index 2d19092..a6fcf2a 100644
--- a/kernel/arm/sdot_vfp.S
+++ b/kernel/arm/sdot_vfp.S
@@ -239,7 +239,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+
#if defined(DSDOT)
vsub.f64 d0 , d0 , d0
diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S
index 0e2061d..4dfb733 100644
--- a/kernel/arm/sgemm_kernel_4x2_vfp.S
+++ b/kernel/arm/sgemm_kernel_4x2_vfp.S
@@ -126,7 +126,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s5 , [CO1, #4 ]
flds s6 , [CO1, #8 ]
flds s7 , [CO1, #12 ]
-
+
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
fmacs s6 , s0 , s10
@@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s4 , [CO1]
flds s5 , [CO1, #4 ]
-
+
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
@@ -250,7 +250,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA
flds s4 , [CO1]
-
+
fmacs s4 , s0 , s8
fsts s4 , [CO1]
@@ -307,7 +307,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s5 , [CO1, #4 ]
flds s6 , [CO1, #8 ]
flds s7 , [CO1, #12 ]
-
+
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
fmacs s6 , s0 , s10
@@ -352,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s4 , [CO1]
flds s5 , [CO1, #4 ]
-
+
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
@@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA
flds s4 , [CO1]
-
+
fmacs s4 , s0 , s8
fsts s4 , [CO1]
@@ -480,10 +480,10 @@ sgemm_kernel_L2_M4_22:
subs L, L, #1
bgt sgemm_kernel_L2_M4_22
-
+
sgemm_kernel_L2_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L2_M4_100
@@ -493,7 +493,7 @@ sgemm_kernel_L2_M4_42:
subs L, L, #1
bgt sgemm_kernel_L2_M4_42
-
+
sgemm_kernel_L2_M4_100:
SAVE4x2
@@ -535,10 +535,10 @@ sgemm_kernel_L2_M2_22:
subs L, L, #1
bgt sgemm_kernel_L2_M2_22
-
+
sgemm_kernel_L2_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L2_M2_100
@@ -548,7 +548,7 @@ sgemm_kernel_L2_M2_42:
subs L, L, #1
bgt sgemm_kernel_L2_M2_42
-
+
sgemm_kernel_L2_M2_100:
SAVE2x2
@@ -582,10 +582,10 @@ sgemm_kernel_L2_M1_22:
subs L, L, #1
bgt sgemm_kernel_L2_M1_22
-
+
sgemm_kernel_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L2_M1_100
@@ -595,7 +595,7 @@ sgemm_kernel_L2_M1_42:
subs L, L, #1
bgt sgemm_kernel_L2_M1_42
-
+
sgemm_kernel_L2_M1_100:
SAVE1x2
@@ -620,7 +620,7 @@ sgemm_kernel_L1_BEGIN:
tst J , #1
ble sgemm_kernel_L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -658,10 +658,10 @@ sgemm_kernel_L1_M4_22:
subs L, L, #1
bgt sgemm_kernel_L1_M4_22
-
+
sgemm_kernel_L1_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L1_M4_100
@@ -671,7 +671,7 @@ sgemm_kernel_L1_M4_42:
subs L, L, #1
bgt sgemm_kernel_L1_M4_42
-
+
sgemm_kernel_L1_M4_100:
SAVE4x1
@@ -713,10 +713,10 @@ sgemm_kernel_L1_M2_22:
subs L, L, #1
bgt sgemm_kernel_L1_M2_22
-
+
sgemm_kernel_L1_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L1_M2_100
@@ -726,7 +726,7 @@ sgemm_kernel_L1_M2_42:
subs L, L, #1
bgt sgemm_kernel_L1_M2_42
-
+
sgemm_kernel_L1_M2_100:
SAVE2x1
@@ -761,10 +761,10 @@ sgemm_kernel_L1_M1_22:
subs L, L, #1
bgt sgemm_kernel_L1_M1_22
-
+
sgemm_kernel_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L1_M1_100
@@ -774,7 +774,7 @@ sgemm_kernel_L1_M1_42:
subs L, L, #1
bgt sgemm_kernel_L1_M1_42
-
+
sgemm_kernel_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S
index 38dc4d3..078f14a 100644
--- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S
@@ -289,7 +289,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add r4 , CO2, r3
fldmias CO1, { s8 - s11 }
-
+
fmacs s8 , s0 , s16
flds s12, [CO2]
fmacs s9 , s0 , s17
@@ -311,7 +311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ CO1 , #C_PRE ]
fldmias r4, { s8 - s11 }
-
+
fmacs s8 , s0 , s24
fsts s12, [CO2]
fmacs s9 , s0 , s25
@@ -398,7 +398,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s8 , [CO1]
flds s9 , [CO1, #4 ]
-
+
fmacs s8 , s0 , s16
fmacs s9 , s0 , s17
@@ -416,7 +416,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s8 , [r4 ]
flds s9 , [r4 , #4 ]
-
+
fmacs s8 , s0 , s24
fmacs s9 , s0 , s25
@@ -555,7 +555,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s9 , [CO1, #4 ]
flds s10, [CO1, #8 ]
flds s11, [CO1, #12 ]
-
+
fmacs s8 , s0 , s16
fmacs s9 , s0 , s17
fmacs s10, s0 , s18
@@ -627,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s8 , [CO1]
flds s9 , [CO1, #4 ]
-
+
fmacs s8 , s0 , s16
fmacs s9 , s0 , s17
@@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s9 , [CO1, #4 ]
flds s10, [CO1, #8 ]
flds s11, [CO1, #12 ]
-
+
fmacs s8 , s0 , s16
fmacs s9 , s0 , s17
fmacs s10, s0 , s18
@@ -784,7 +784,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s8 , [CO1]
flds s9 , [CO1, #4 ]
-
+
fmacs s8 , s0 , s16
fmacs s9 , s0 , s17
@@ -868,7 +868,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble sgemm_kernel_L2_BEGIN
sgemm_kernel_L4_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #2 // LDC * 4
@@ -947,7 +947,7 @@ sgemm_kernel_L4_M4_46:
subs L, L, #1
bne sgemm_kernel_L4_M4_46
-
+
sgemm_kernel_L4_M4_100:
SAVE4x4
@@ -989,10 +989,10 @@ sgemm_kernel_L4_M2_22:
subs L, L, #1
bgt sgemm_kernel_L4_M2_22
-
+
sgemm_kernel_L4_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L4_M2_100
@@ -1002,7 +1002,7 @@ sgemm_kernel_L4_M2_42:
subs L, L, #1
bgt sgemm_kernel_L4_M2_42
-
+
sgemm_kernel_L4_M2_100:
SAVE2x4
@@ -1036,10 +1036,10 @@ sgemm_kernel_L4_M1_22:
subs L, L, #1
bgt sgemm_kernel_L4_M1_22
-
+
sgemm_kernel_L4_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L4_M1_100
@@ -1049,7 +1049,7 @@ sgemm_kernel_L4_M1_42:
subs L, L, #1
bgt sgemm_kernel_L4_M1_42
-
+
sgemm_kernel_L4_M1_100:
SAVE1x4
@@ -1062,7 +1062,7 @@ sgemm_kernel_L4_END:
lsl r4, r4, #4 // k * 4 * 4
add r3, r3, r4 // B = B + K * 4 * 4
mov BC, r3
-
+
subs J , #1 // j--
bgt sgemm_kernel_L4_BEGIN
@@ -1078,7 +1078,7 @@ sgemm_kernel_L2_BEGIN:
tst J , #2
ble sgemm_kernel_L1_BEGIN
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -1120,10 +1120,10 @@ sgemm_kernel_L2_M4_22:
subs L, L, #1
bgt sgemm_kernel_L2_M4_22
-
+
sgemm_kernel_L2_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L2_M4_100
@@ -1133,7 +1133,7 @@ sgemm_kernel_L2_M4_42:
subs L, L, #1
bgt sgemm_kernel_L2_M4_42
-
+
sgemm_kernel_L2_M4_100:
SAVE4x2
@@ -1175,10 +1175,10 @@ sgemm_kernel_L2_M2_22:
subs L, L, #1
bgt sgemm_kernel_L2_M2_22
-
+
sgemm_kernel_L2_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L2_M2_100
@@ -1188,7 +1188,7 @@ sgemm_kernel_L2_M2_42:
subs L, L, #1
bgt sgemm_kernel_L2_M2_42
-
+
sgemm_kernel_L2_M2_100:
SAVE2x2
@@ -1222,10 +1222,10 @@ sgemm_kernel_L2_M1_22:
subs L, L, #1
bgt sgemm_kernel_L2_M1_22
-
+
sgemm_kernel_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L2_M1_100
@@ -1235,7 +1235,7 @@ sgemm_kernel_L2_M1_42:
subs L, L, #1
bgt sgemm_kernel_L2_M1_42
-
+
sgemm_kernel_L2_M1_100:
SAVE1x2
@@ -1257,7 +1257,7 @@ sgemm_kernel_L1_BEGIN:
tst J , #1
ble sgemm_kernel_L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1298,10 +1298,10 @@ sgemm_kernel_L1_M4_22:
subs L, L, #1
bgt sgemm_kernel_L1_M4_22
-
+
sgemm_kernel_L1_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L1_M4_100
@@ -1311,7 +1311,7 @@ sgemm_kernel_L1_M4_42:
subs L, L, #1
bgt sgemm_kernel_L1_M4_42
-
+
sgemm_kernel_L1_M4_100:
SAVE4x1
@@ -1353,10 +1353,10 @@ sgemm_kernel_L1_M2_22:
subs L, L, #1
bgt sgemm_kernel_L1_M2_22
-
+
sgemm_kernel_L1_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L1_M2_100
@@ -1366,7 +1366,7 @@ sgemm_kernel_L1_M2_42:
subs L, L, #1
bgt sgemm_kernel_L1_M2_42
-
+
sgemm_kernel_L1_M2_100:
SAVE2x1
@@ -1400,10 +1400,10 @@ sgemm_kernel_L1_M1_22:
subs L, L, #1
bgt sgemm_kernel_L1_M1_22
-
+
sgemm_kernel_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble sgemm_kernel_L1_M1_100
@@ -1413,7 +1413,7 @@ sgemm_kernel_L1_M1_42:
subs L, L, #1
bgt sgemm_kernel_L1_M1_42
-
+
sgemm_kernel_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S
index 0546f1d..ff4ff08 100644
--- a/kernel/arm/sgemm_ncopy_2_vfp.S
+++ b/kernel/arm/sgemm_ncopy_2_vfp.S
@@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl LDA, OLD_LDA, #2 // lda = lda * 4
- ldr BO, B
+ ldr BO, B
/*********************************************************************************************/
@@ -152,8 +152,8 @@ sgemm_ncopy_L2_M2_20:
subs I , I , #1
bne sgemm_ncopy_L2_M2_20
-
-
+
+
sgemm_ncopy_L2_M2_40:
ands I, M , #1
@@ -165,7 +165,7 @@ sgemm_ncopy_L2_M2_60:
subs I , I , #1
bne sgemm_ncopy_L2_M2_60
-
+
sgemm_ncopy_L2_M2_END:
@@ -194,8 +194,8 @@ sgemm_ncopy_L1_M2_20:
subs I , I , #1
bne sgemm_ncopy_L1_M2_20
-
-
+
+
sgemm_ncopy_L1_M2_40:
ands I, M , #1
@@ -207,7 +207,7 @@ sgemm_ncopy_L1_M2_60:
subs I , I , #1
bne sgemm_ncopy_L1_M2_60
-
+
sgemm_ncopy_L1_M2_END:
diff --git a/kernel/arm/sgemm_ncopy_4_vfp.S b/kernel/arm/sgemm_ncopy_4_vfp.S
index 2d8fa2e..ab01313 100644
--- a/kernel/arm/sgemm_ncopy_4_vfp.S
+++ b/kernel/arm/sgemm_ncopy_4_vfp.S
@@ -197,7 +197,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
- ldr BO, B
+ ldr BO, B
sgemm_ncopy_L4_BEGIN:
@@ -231,8 +231,8 @@ sgemm_ncopy_L4_M4_20:
subs I , I , #1
bne sgemm_ncopy_L4_M4_20
-
-
+
+
sgemm_ncopy_L4_M4_40:
ands I, M , #3
@@ -244,7 +244,7 @@ sgemm_ncopy_L4_M4_60:
subs I , I , #1
bne sgemm_ncopy_L4_M4_60
-
+
sgemm_ncopy_L4_M4_END:
@@ -279,8 +279,8 @@ sgemm_ncopy_L2_M4_20:
subs I , I , #1
bne sgemm_ncopy_L2_M4_20
-
-
+
+
sgemm_ncopy_L2_M4_40:
ands I, M , #3
@@ -292,7 +292,7 @@ sgemm_ncopy_L2_M4_60:
subs I , I , #1
bne sgemm_ncopy_L2_M4_60
-
+
sgemm_ncopy_L2_M4_END:
@@ -320,8 +320,8 @@ sgemm_ncopy_L1_M4_20:
subs I , I , #1
bne sgemm_ncopy_L1_M4_20
-
-
+
+
sgemm_ncopy_L1_M4_40:
ands I, M , #3
@@ -333,7 +333,7 @@ sgemm_ncopy_L1_M4_60:
subs I , I , #1
bne sgemm_ncopy_L1_M4_60
-
+
sgemm_ncopy_L1_M4_END:
diff --git a/kernel/arm/sgemm_tcopy_4_vfp.S b/kernel/arm/sgemm_tcopy_4_vfp.S
index b0a3278..9bb0e46 100644
--- a/kernel/arm/sgemm_tcopy_4_vfp.S
+++ b/kernel/arm/sgemm_tcopy_4_vfp.S
@@ -288,20 +288,20 @@ sgemm_tcopy_L4_M4_20:
subs I , I , #1
ble sgemm_tcopy_L4_M4_40
-
+
COPY4x4_2
subs I , I , #1
bne sgemm_tcopy_L4_M4_20
-
-
+
+
sgemm_tcopy_L4_M4_40:
tst N , #2
ble sgemm_tcopy_L4_M4_60
COPY2x4
-
+
sgemm_tcopy_L4_M4_60:
@@ -309,7 +309,7 @@ sgemm_tcopy_L4_M4_60:
ble sgemm_tcopy_L4_M4_END
COPY1x4
-
+
sgemm_tcopy_L4_M4_END:
@@ -348,8 +348,8 @@ sgemm_tcopy_L2_M4_20:
subs I , I , #1
bne sgemm_tcopy_L2_M4_20
-
-
+
+
sgemm_tcopy_L2_M4_40:
tst N , #2
@@ -395,8 +395,8 @@ sgemm_tcopy_L1_M4_20:
subs I , I , #1
bne sgemm_tcopy_L1_M4_20
-
-
+
+
sgemm_tcopy_L1_M4_40:
tst N , #2
diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S
index ab5ff7f..e7511ff 100644
--- a/kernel/arm/strmm_kernel_4x2_vfp.S
+++ b/kernel/arm/strmm_kernel_4x2_vfp.S
@@ -189,7 +189,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA
-
+
fmuls s4 , s0 , s8
fmuls s5 , s0 , s9
@@ -239,7 +239,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA
-
+
fmuls s4 , s0 , s8
fsts s4 , [CO1]
@@ -500,10 +500,10 @@ _L2_M4_22:
subs L, L, #1
bgt _L2_M4_22
-
+
_L2_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M4_100
@@ -513,7 +513,7 @@ _L2_M4_42:
subs L, L, #1
bgt _L2_M4_42
-
+
_L2_M4_100:
SAVE4x2
@@ -605,10 +605,10 @@ _L2_M2_22:
subs L, L, #1
bgt _L2_M2_22
-
+
_L2_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M2_100
@@ -618,7 +618,7 @@ _L2_M2_42:
subs L, L, #1
bgt _L2_M2_42
-
+
_L2_M2_100:
SAVE2x2
@@ -702,10 +702,10 @@ _L2_M1_22:
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
@@ -715,7 +715,7 @@ _L2_M1_42:
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
@@ -766,7 +766,7 @@ _L1_BEGIN:
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -843,10 +843,10 @@ _L1_M4_22:
subs L, L, #1
bgt _L1_M4_22
-
+
_L1_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M4_100
@@ -856,7 +856,7 @@ _L1_M4_42:
subs L, L, #1
bgt _L1_M4_42
-
+
_L1_M4_100:
SAVE4x1
@@ -948,10 +948,10 @@ _L1_M2_22:
subs L, L, #1
bgt _L1_M2_22
-
+
_L1_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M2_100
@@ -961,7 +961,7 @@ _L1_M2_42:
subs L, L, #1
bgt _L1_M2_42
-
+
_L1_M2_100:
SAVE2x1
@@ -1045,10 +1045,10 @@ _L1_M1_22:
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
@@ -1058,7 +1058,7 @@ _L1_M1_42:
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S
index 3a0c8af..f6342a0 100644
--- a/kernel/arm/strmm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S
@@ -276,7 +276,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA
add r4 , CO2, r3
-
+
fmuls s8 , s0 , s16
fmuls s9 , s0 , s17
fmuls s10, s0 , s18
@@ -291,7 +291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmuls s15, s0 , s23
fsts s11, [CO1, #12 ]
-
+
fmuls s8 , s0 , s24
fsts s12, [CO2]
fmuls s9 , s0 , s25
@@ -368,7 +368,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add r4 , CO2, r3
flds s0, ALPHA
-
+
fmuls s8 , s0 , s16
fmuls s9 , s0 , s17
@@ -381,7 +381,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fsts s12, [CO2]
fsts s13, [CO2, #4 ]
-
+
fmuls s8 , s0 , s24
fmuls s9 , s0 , s25
@@ -804,7 +804,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble _L2_BEGIN
_L4_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #2 // LDC * 4
@@ -907,7 +907,7 @@ _L4_M4_22:
b _L4_M4_44
-
+
_L4_M4_30:
tst L, #3
ble _L4_M4_40
@@ -972,7 +972,7 @@ _L4_M4_46:
subs L, L, #1
bne _L4_M4_46
-
+
_L4_M4_100:
SAVE4x4
@@ -1065,10 +1065,10 @@ _L4_M2_22:
subs L, L, #1
bgt _L4_M2_22
-
+
_L4_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L4_M2_100
@@ -1078,7 +1078,7 @@ _L4_M2_42:
subs L, L, #1
bgt _L4_M2_42
-
+
_L4_M2_100:
SAVE2x4
@@ -1162,10 +1162,10 @@ _L4_M1_22:
subs L, L, #1
bgt _L4_M1_22
-
+
_L4_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L4_M1_100
@@ -1175,7 +1175,7 @@ _L4_M1_42:
subs L, L, #1
bgt _L4_M1_42
-
+
_L4_M1_100:
SAVE1x4
@@ -1206,7 +1206,7 @@ _L4_END:
lsl r4, r4, #4 // k * 4 * 4
add r3, r3, r4 // B = B + K * 4 * 4
mov BC, r3
-
+
#if !defined(LEFT)
ldr r3 , KK
add r3 , r3 , #4 // number of values in BO
@@ -1228,7 +1228,7 @@ _L2_BEGIN:
tst J , #2
ble _L1_BEGIN
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -1306,10 +1306,10 @@ _L2_M4_22:
subs L, L, #1
bgt _L2_M4_22
-
+
_L2_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M4_100
@@ -1319,7 +1319,7 @@ _L2_M4_42:
subs L, L, #1
bgt _L2_M4_42
-
+
_L2_M4_100:
SAVE4x2
@@ -1411,10 +1411,10 @@ _L2_M2_22:
subs L, L, #1
bgt _L2_M2_22
-
+
_L2_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M2_100
@@ -1424,7 +1424,7 @@ _L2_M2_42:
subs L, L, #1
bgt _L2_M2_42
-
+
_L2_M2_100:
SAVE2x2
@@ -1508,10 +1508,10 @@ _L2_M1_22:
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
@@ -1521,7 +1521,7 @@ _L2_M1_42:
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
@@ -1568,7 +1568,7 @@ _L1_BEGIN:
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1645,10 +1645,10 @@ _L1_M4_22:
subs L, L, #1
bgt _L1_M4_22
-
+
_L1_M4_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M4_100
@@ -1658,7 +1658,7 @@ _L1_M4_42:
subs L, L, #1
bgt _L1_M4_42
-
+
_L1_M4_100:
SAVE4x1
@@ -1751,10 +1751,10 @@ _L1_M2_22:
subs L, L, #1
bgt _L1_M2_22
-
+
_L1_M2_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M2_100
@@ -1764,7 +1764,7 @@ _L1_M2_42:
subs L, L, #1
bgt _L1_M2_42
-
+
_L1_M2_100:
SAVE2x1
@@ -1848,10 +1848,10 @@ _L1_M1_22:
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
@@ -1861,7 +1861,7 @@ _L1_M1_42:
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/swap.c b/kernel/arm/swap.c
index 1ca9e76..eac621f 100644
--- a/kernel/arm/swap.c
+++ b/kernel/arm/swap.c
@@ -58,5 +58,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
return(0);
}
-
+
diff --git a/kernel/arm/zamax.c b/kernel/arm/zamax.c
index 8c2a5c3..162f829 100644
--- a/kernel/arm/zamax.c
+++ b/kernel/arm/zamax.c
@@ -66,7 +66,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( CABS1(x,ix) > CABS1(maxf,0) )
+ if( CABS1(x,ix) > CABS1(maxf,0) )
{
max = i;
maxf[0] = ABS(x[ix]);
@@ -77,5 +77,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(CABS1(maxf,0));
}
-
+
diff --git a/kernel/arm/zamin.c b/kernel/arm/zamin.c
index 6956ced..9e26a66 100644
--- a/kernel/arm/zamin.c
+++ b/kernel/arm/zamin.c
@@ -66,7 +66,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
while(i < n)
{
- if( CABS1(x,ix) < CABS1(minf,0) )
+ if( CABS1(x,ix) < CABS1(minf,0) )
{
min = i;
minf[0] = ABS(x[ix]);
@@ -77,5 +77,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(CABS1(minf,0));
}
-
+
diff --git a/kernel/arm/zasum.c b/kernel/arm/zasum.c
index 13acfc0..0c5d69e 100644
--- a/kernel/arm/zasum.c
+++ b/kernel/arm/zasum.c
@@ -67,5 +67,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return(sumf);
}
-
+
diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpby.c
similarity index 63%
copy from kernel/arm/zaxpy.c
copy to kernel/arm/zaxpby.c
index 28a4380..2e0c294 100644
--- a/kernel/arm/zaxpy.c
+++ b/kernel/arm/zaxpby.c
@@ -25,25 +25,20 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/15 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
+/***************************************************************************
+* 2014/06/07 Saar
*
-**************************************************************************************/
-
+***************************************************************************/
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix,iy;
+ FLOAT temp;
if ( n < 0 ) return(0);
- if ( da_r == 0.0 && da_i == 0.0 ) return(0);
ix = 0;
iy = 0;
@@ -51,22 +46,72 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;
- while(i < n)
+ if ( beta_r == 0.0 && beta_i == 0.0)
+ {
+ if ( alpha_r == 0.0 && alpha_i == 0.0 )
+ {
+
+ while(i < n)
+ {
+ y[iy] = 0.0 ;
+ y[iy+1] = 0.0 ;
+ iy += inc_y2 ;
+ i++ ;
+ }
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ;
+ y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ;
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+ }
+
+
+ }
+
+ }
+ else
{
-#if !defined(CONJ)
- y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
- y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
-#else
- y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
- y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
-#endif
- ix += inc_x2 ;
- iy += inc_y2 ;
- i++ ;
+ if ( alpha_r == 0.0 && alpha_i == 0.0 )
+ {
+
+ while(i < n)
+ {
+ temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ;
+ y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ;
+ y[iy] = temp;
+ iy += inc_y2 ;
+ i++ ;
+ }
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ;
+ y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ;
+ y[iy] = temp;
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+ }
+
+
+ }
+
+
}
return(0);
}
-
+
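
The hunk above adds kernel/arm/zaxpby.c, a double-complex AXPBY kernel
(y := alpha*x + beta*y) with dedicated branches for alpha == 0 and
beta == 0 so that neither x nor the old y is read unnecessarily. The
following standalone sketch collapses those branches into one loop to
show the arithmetic; the name zaxpby_ref and the test values are
illustrative only, this is not the build source.

#include <stdio.h>

static void zaxpby_ref(long n, double alpha_r, double alpha_i,
                       const double *x, long inc_x,
                       double beta_r, double beta_i,
                       double *y, long inc_y)
{
    long ix = 0, iy = 0;
    for (long i = 0; i < n; i++) {
        /* complex fused update on interleaved (re, im) pairs */
        double tr = alpha_r * x[ix]   - alpha_i * x[ix+1]
                  + beta_r  * y[iy]   - beta_i  * y[iy+1];
        double ti = alpha_r * x[ix+1] + alpha_i * x[ix]
                  + beta_r  * y[iy+1] + beta_i  * y[iy];
        y[iy]   = tr;
        y[iy+1] = ti;
        ix += 2 * inc_x;
        iy += 2 * inc_y;
    }
}

int main(void)
{
    double x[4] = {1, 2, 3, 4};   /* (1+2i), (3+4i) */
    double y[4] = {5, 6, 7, 8};   /* (5+6i), (7+8i) */
    zaxpby_ref(2, 2.0, 0.0, x, 1, 1.0, 0.0, y, 1);   /* y = 2*x + y */
    printf("%g%+gi  %g%+gi\n", y[0], y[1], y[2], y[3]);   /* 7+10i  13+16i */
    return 0;
}
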
diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpy.c
index 28a4380..929ee8b 100644
--- a/kernel/arm/zaxpy.c
+++ b/kernel/arm/zaxpy.c
@@ -68,5 +68,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return(0);
}
-
+
diff --git a/kernel/arm/zcopy.c b/kernel/arm/zcopy.c
index 6547112..f720d6e 100644
--- a/kernel/arm/zcopy.c
+++ b/kernel/arm/zcopy.c
@@ -59,5 +59,5 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
return(0);
}
-
+
diff --git a/kernel/arm/zcopy_vfp.S b/kernel/arm/zcopy_vfp.S
index 06f8924..48aee4c 100644
--- a/kernel/arm/zcopy_vfp.S
+++ b/kernel/arm/zcopy_vfp.S
@@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+
cmp N, #0
ble zcopy_kernel_L999
diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c
index 096ced9..4694875 100644
--- a/kernel/arm/zdot.c
+++ b/kernel/arm/zdot.c
@@ -43,7 +43,7 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
BLASLONG ix=0,iy=0;
FLOAT dot[2];
FLOAT _Complex result;
-
+
dot[0]=0.0;
dot[1]=0.0;
@@ -74,5 +74,5 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
return(result);
}
-
+
diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S
index 1a78b5a..622169b 100644
--- a/kernel/arm/zdot_vfp.S
+++ b/kernel/arm/zdot_vfp.S
@@ -189,7 +189,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+
vsub.f64 d0 , d0 , d0
vsub.f64 d1 , d1 , d1
vsub.f64 d2 , d2 , d2
@@ -271,11 +271,11 @@ zdot_kernel_L999:
vldm r3, { d8 - d15} // restore floating point registers
#if !defined(CONJ)
- vsub.f64 d0 , d0, d2
- vadd.f64 d1 , d1, d3
+ vsub.f64 d0 , d0, d2
+ vadd.f64 d1 , d1, d3
#else
- vadd.f64 d0 , d0, d2
- vsub.f64 d1 , d1, d3
+ vadd.f64 d0 , d0, d2
+ vsub.f64 d1 , d1, d3
#endif
sub sp, fp, #24
diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S
index 8a54018..f4134ea 100644
--- a/kernel/arm/zgemm_kernel_2x2_vfp.S
+++ b/kernel/arm/zgemm_kernel_2x2_vfp.S
@@ -81,7 +81,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define B_PRE 96
#define C_PRE 64
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacd
#define KMAC_I fmacd
@@ -881,7 +881,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble zgemm_kernel_L1_BEGIN
zgemm_kernel_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -950,7 +950,7 @@ zgemm_kernel_L2_M2_22:
b zgemm_kernel_L2_M2_44
-
+
zgemm_kernel_L2_M2_30:
tst L, #3
ble zgemm_kernel_L2_M2_40
@@ -1015,7 +1015,7 @@ zgemm_kernel_L2_M2_46:
subs L, L, #1
bne zgemm_kernel_L2_M2_46
-
+
zgemm_kernel_L2_M2_100:
SAVE2x2
@@ -1054,10 +1054,10 @@ zgemm_kernel_L2_M1_22:
subs L, L, #1
bgt zgemm_kernel_L2_M1_22
-
+
zgemm_kernel_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble zgemm_kernel_L2_M1_100
@@ -1067,7 +1067,7 @@ zgemm_kernel_L2_M1_42:
subs L, L, #1
bgt zgemm_kernel_L2_M1_42
-
+
zgemm_kernel_L2_M1_100:
SAVE1x2
@@ -1080,7 +1080,7 @@ zgemm_kernel_L2_END:
lsl r4, r4, #5 // k * 2 * 8 * 2
add r3, r3, r4 // B = B + K * 4 * 8
mov BC, r3
-
+
subs J , #1 // j--
bgt zgemm_kernel_L2_BEGIN
@@ -1094,7 +1094,7 @@ zgemm_kernel_L1_BEGIN:
tst J , #1
ble zgemm_kernel_L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1158,7 +1158,7 @@ zgemm_kernel_L1_M2_22:
b zgemm_kernel_L1_M2_44
-
+
zgemm_kernel_L1_M2_30:
tst L, #3
ble zgemm_kernel_L1_M2_40
@@ -1223,7 +1223,7 @@ zgemm_kernel_L1_M2_46:
subs L, L, #1
bne zgemm_kernel_L1_M2_46
-
+
zgemm_kernel_L1_M2_100:
SAVE2x1
@@ -1262,10 +1262,10 @@ zgemm_kernel_L1_M1_22:
subs L, L, #1
bgt zgemm_kernel_L1_M1_22
-
+
zgemm_kernel_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble zgemm_kernel_L1_M1_100
@@ -1275,7 +1275,7 @@ zgemm_kernel_L1_M1_42:
subs L, L, #1
bgt zgemm_kernel_L1_M1_42
-
+
zgemm_kernel_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S
index 2d35028..29c3f45 100644
--- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define B_PRE 96
#define C_PRE 64
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FADD_R fsubd
#define FADD_I faddd
@@ -927,7 +927,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble zgemm_kernel_L1_BEGIN
zgemm_kernel_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -996,7 +996,7 @@ zgemm_kernel_L2_M2_22:
b zgemm_kernel_L2_M2_44
-
+
zgemm_kernel_L2_M2_30:
tst L, #3
ble zgemm_kernel_L2_M2_40
@@ -1061,7 +1061,7 @@ zgemm_kernel_L2_M2_46:
subs L, L, #1
bne zgemm_kernel_L2_M2_46
-
+
zgemm_kernel_L2_M2_100:
SAVE2x2
@@ -1100,10 +1100,10 @@ zgemm_kernel_L2_M1_22:
subs L, L, #1
bgt zgemm_kernel_L2_M1_22
-
+
zgemm_kernel_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble zgemm_kernel_L2_M1_100
@@ -1113,7 +1113,7 @@ zgemm_kernel_L2_M1_42:
subs L, L, #1
bgt zgemm_kernel_L2_M1_42
-
+
zgemm_kernel_L2_M1_100:
SAVE1x2
@@ -1126,7 +1126,7 @@ zgemm_kernel_L2_END:
lsl r4, r4, #5 // k * 2 * 8 * 2
add r3, r3, r4 // B = B + K * 4 * 8
mov BC, r3
-
+
subs J , #1 // j--
bgt zgemm_kernel_L2_BEGIN
@@ -1140,7 +1140,7 @@ zgemm_kernel_L1_BEGIN:
tst J , #1
ble zgemm_kernel_L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1204,7 +1204,7 @@ zgemm_kernel_L1_M2_22:
b zgemm_kernel_L1_M2_44
-
+
zgemm_kernel_L1_M2_30:
tst L, #3
ble zgemm_kernel_L1_M2_40
@@ -1269,7 +1269,7 @@ zgemm_kernel_L1_M2_46:
subs L, L, #1
bne zgemm_kernel_L1_M2_46
-
+
zgemm_kernel_L1_M2_100:
SAVE2x1
@@ -1308,10 +1308,10 @@ zgemm_kernel_L1_M1_22:
subs L, L, #1
bgt zgemm_kernel_L1_M1_22
-
+
zgemm_kernel_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble zgemm_kernel_L1_M1_100
@@ -1321,7 +1321,7 @@ zgemm_kernel_L1_M1_42:
subs L, L, #1
bgt zgemm_kernel_L1_M1_42
-
+
zgemm_kernel_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/zgemm_ncopy_2_vfp.S b/kernel/arm/zgemm_ncopy_2_vfp.S
index 5ff8ee2..b3fa225 100644
--- a/kernel/arm/zgemm_ncopy_2_vfp.S
+++ b/kernel/arm/zgemm_ncopy_2_vfp.S
@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
- ldr BO, B
+ ldr BO, B
/*********************************************************************************************/
@@ -177,8 +177,8 @@ zgemm_ncopy_L2_M2_20:
subs I , I , #1
bne zgemm_ncopy_L2_M2_20
-
-
+
+
zgemm_ncopy_L2_M2_40:
ands I, M , #1
@@ -190,7 +190,7 @@ zgemm_ncopy_L2_M2_60:
subs I , I , #1
bne zgemm_ncopy_L2_M2_60
-
+
zgemm_ncopy_L2_M2_END:
@@ -221,8 +221,8 @@ zgemm_ncopy_L1_M2_20:
subs I , I , #1
bne zgemm_ncopy_L1_M2_20
-
-
+
+
zgemm_ncopy_L1_M2_40:
ands I, M , #1
@@ -234,7 +234,7 @@ zgemm_ncopy_L1_M2_60:
subs I , I , #1
bne zgemm_ncopy_L1_M2_60
-
+
zgemm_ncopy_L1_M2_END:
diff --git a/kernel/arm/zgemv_n.c b/kernel/arm/zgemv_n.c
index dc2ffa0..b9b03f7 100644
--- a/kernel/arm/zgemv_n.c
+++ b/kernel/arm/zgemv_n.c
@@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return(0);
}
-
+
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
@@ -153,5 +153,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return(0);
}
-
+
diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S
index 3b51d55..d4cab09 100644
--- a/kernel/arm/zgemv_n_vfp.S
+++ b/kernel/arm/zgemv_n_vfp.S
@@ -553,7 +553,7 @@ zgemvn_kernel_F1X1:
ldr AO1, A
add r3, AO1, #16
str r3, A
-
+
ldr XO , X
INIT_F1
@@ -653,7 +653,7 @@ zgemvn_kernel_S1X1:
ldr AO1, A
add r3, AO1, #16
str r3, A
-
+
ldr XO , X
INIT_S1
diff --git a/kernel/arm/zgemv_t.c b/kernel/arm/zgemv_t.c
index 6161dba..1239cf3 100644
--- a/kernel/arm/zgemv_t.c
+++ b/kernel/arm/zgemv_t.c
@@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
a_ptr += lda2;
iy += 2;
- }
+ }
return(0);
diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c
index d68e302..c590095 100644
--- a/kernel/arm/znrm2.c
+++ b/kernel/arm/znrm2.c
@@ -64,7 +64,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
n *= inc_x2;
while(i < n)
{
-
+
if ( x[i] != 0.0 )
{
temp = ABS( x[i] );
@@ -76,10 +76,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
else
{
ssq += ( temp / scale ) * ( temp / scale );
- }
+ }
}
-
+
if ( x[i+1] != 0.0 )
{
temp = ABS( x[i+1] );
@@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
else
{
ssq += ( temp / scale ) * ( temp / scale );
- }
+ }
}
@@ -102,5 +102,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return(scale);
}
-
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/zomatcopy_cn.c
similarity index 71%
copy from kernel/arm/axpy.c
copy to kernel/arm/zomatcopy_cn.c
index dceddf7..f5a7a62 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/zomatcopy_cn.c
@@ -25,40 +25,46 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * No Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix,iy;
+ BLASLONG i,j,ia;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- ix = 0;
- iy = 0;
+ aptr = a;
+ bptr = b;
- while(i < n)
- {
+ lda *= 2;
+ ldb *= 2;
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
+ for ( i=0; i<cols ; i++ )
+ {
+ ia = 0;
+ for(j=0; j<rows; j++)
+ {
+ bptr[ia] = alpha_r * aptr[ia] - alpha_i * aptr[ia+1];
+ bptr[ia+1] = alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+ ia+=2;
+ }
+ aptr += lda;
+ bptr += ldb;
}
+
return(0);
}
-
+
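
The file added above, kernel/arm/zomatcopy_cn.c, is the column-major,
non-transposed case of the new double-complex out-of-place matrix copy
kernels: it writes B := alpha * A column by column. The _cnc and _ctc
variants that follow differ only in conjugating A before the scaling.
A hedged reference sketch (zomatcopy_cn_ref and the 2x2 test case are
illustrative, not upstream code):

#include <stdio.h>

static void zomatcopy_cn_ref(long rows, long cols,
                             double alpha_r, double alpha_i,
                             const double *a, long lda,
                             double *b, long ldb)
{
    for (long j = 0; j < cols; j++)          /* walk columns (ColMajor) */
        for (long i = 0; i < rows; i++) {
            double ar = a[2*(j*lda + i)];
            double ai = a[2*(j*lda + i) + 1];
            b[2*(j*ldb + i)]     = alpha_r * ar - alpha_i * ai;
            b[2*(j*ldb + i) + 1] = alpha_r * ai + alpha_i * ar;
        }
}

int main(void)
{
    /* scale the 2x2 complex identity by alpha = i */
    double a[8] = {1,0, 0,0,  0,0, 1,0};
    double b[8];
    zomatcopy_cn_ref(2, 2, 0.0, 1.0, a, 2, b, 2);
    for (int k = 0; k < 8; k += 2)
        printf("(%g,%g) ", b[k], b[k+1]);   /* (0,1) (0,0) (0,0) (0,1) */
    printf("\n");
    return 0;
}
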
diff --git a/kernel/arm/axpy.c b/kernel/arm/zomatcopy_cnc.c
similarity index 71%
copy from kernel/arm/axpy.c
copy to kernel/arm/zomatcopy_cnc.c
index dceddf7..210c3f7 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/zomatcopy_cnc.c
@@ -25,40 +25,45 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * No Trans, conjugate
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix,iy;
+ BLASLONG i,j,ia;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- ix = 0;
- iy = 0;
+ aptr = a;
+ bptr = b;
+ lda *= 2;
+ ldb *= 2;
- while(i < n)
+ for ( i=0; i<cols ; i++ )
{
+ ia = 0;
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
-
+ for(j=0; j<rows; j++)
+ {
+ bptr[ia] = alpha_r * aptr[ia] + alpha_i * aptr[ia+1];
+ bptr[ia+1] = - alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+ ia += 2;
+ }
+ aptr += lda;
+ bptr += ldb;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/zomatcopy_ct.c
similarity index 71%
copy from kernel/arm/axpy.c
copy to kernel/arm/zomatcopy_ct.c
index dceddf7..38bc9b9 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/zomatcopy_ct.c
@@ -25,40 +25,47 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix,iy;
+ BLASLONG i,j,ia,ib;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- ix = 0;
- iy = 0;
+ aptr = a;
- while(i < n)
+ lda *= 2;
+ ldb *= 2;
+ ib = 0;
+ for ( i=0; i<cols ; i++ )
{
+ bptr = &b[ib];
+ ia = 0;
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
-
+ for(j=0; j<rows; j++)
+ {
+ bptr[0] = alpha_r * aptr[ia] - alpha_i * aptr[ia+1];
+ bptr[1] = alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+ ia += 2;
+ bptr += ldb;
+ }
+ aptr += lda;
+ ib += 2;
}
+
return(0);
}
-
+
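
kernel/arm/zomatcopy_ct.c above is the transposed column-major case:
column j of A becomes row j of B, so the inner loop steps the B pointer
by the full leading dimension instead of by one element. The index
mapping is easier to see in a flat sketch (zomatcopy_ct_ref is an
illustrative name, not the kernel's CNAME):

#include <stdio.h>

static void zomatcopy_ct_ref(long rows, long cols,
                             double alpha_r, double alpha_i,
                             const double *a, long lda,
                             double *b, long ldb)
{
    for (long j = 0; j < cols; j++)          /* column j of A ...      */
        for (long i = 0; i < rows; i++) {    /* ... becomes row j of B */
            double ar = a[2*(j*lda + i)];
            double ai = a[2*(j*lda + i) + 1];
            b[2*(i*ldb + j)]     = alpha_r * ar - alpha_i * ai;
            b[2*(i*ldb + j) + 1] = alpha_r * ai + alpha_i * ar;
        }
}

int main(void)
{
    /* 2x3 complex A (column-major, lda = 2); B = A^T needs ldb >= 3 */
    double a[12] = {1,0, 2,0,  3,0, 4,0,  5,0, 6,0};
    double b[12];
    zomatcopy_ct_ref(2, 3, 1.0, 0.0, a, 2, b, 3);
    for (int k = 0; k < 12; k += 2)
        printf("(%g,%g) ", b[k], b[k+1]);   /* (1,0) (3,0) (5,0) (2,0) (4,0) (6,0) */
    printf("\n");
    return 0;
}
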
diff --git a/kernel/arm/axpy.c b/kernel/arm/zomatcopy_ctc.c
similarity index 70%
copy from kernel/arm/axpy.c
copy to kernel/arm/zomatcopy_ctc.c
index dceddf7..34e7e91 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/zomatcopy_ctc.c
@@ -25,40 +25,47 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order ColMajor
+ * Trans, conjugate
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix,iy;
+ BLASLONG i,j,ia,ib;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- ix = 0;
- iy = 0;
+ aptr = a;
- while(i < n)
+ lda *= 2;
+ ldb *= 2;
+ ib = 0;
+ for ( i=0; i<cols ; i++ )
{
+ bptr = &b[ib];
+ ia = 0;
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
-
+ for(j=0; j<rows; j++)
+ {
+ bptr[0] = alpha_r * aptr[ia] + alpha_i * aptr[ia+1];
+ bptr[1] = - alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+ ia += 2;
+ bptr += ldb;
+ }
+ aptr += lda;
+ ib += 2;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/zomatcopy_rn.c
similarity index 71%
copy from kernel/arm/axpy.c
copy to kernel/arm/zomatcopy_rn.c
index dceddf7..ded381e 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/zomatcopy_rn.c
@@ -25,40 +25,46 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * No Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix,iy;
+ BLASLONG i,j,ia;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- ix = 0;
- iy = 0;
+ aptr = a;
+ bptr = b;
- while(i < n)
- {
+ lda *=2;
+ ldb *=2;
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
+ for ( i=0; i<rows ; i++ )
+ {
+ ia = 0;
+ for(j=0; j<cols; j++)
+ {
+ bptr[ia] = alpha_r * aptr[ia] - alpha_i * aptr[ia+1];
+ bptr[ia+1] = alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+ ia += 2;
+ }
+ aptr += lda;
+ bptr += ldb;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/zomatcopy_rnc.c
similarity index 71%
copy from kernel/arm/axpy.c
copy to kernel/arm/zomatcopy_rnc.c
index dceddf7..fc27f17 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/zomatcopy_rnc.c
@@ -25,40 +25,45 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
-{
- BLASLONG i=0;
- BLASLONG ix,iy;
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * No Trans , conjugate
+ *
+******************************************************/
- if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+ BLASLONG i,j,ia;
+ FLOAT *aptr,*bptr;
- ix = 0;
- iy = 0;
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- while(i < n)
- {
+ aptr = a;
+ bptr = b;
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
+ lda *=2;
+ ldb *=2;
+ for ( i=0; i<rows ; i++ )
+ {
+ ia = 0;
+ for(j=0; j<cols; j++)
+ {
+ bptr[ia] = alpha_r * aptr[ia] + alpha_i * aptr[ia+1];
+ bptr[ia+1] = - alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+ ia += 2;
+ }
+ aptr += lda;
+ bptr += ldb;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/zomatcopy_rt.c
similarity index 71%
copy from kernel/arm/axpy.c
copy to kernel/arm/zomatcopy_rt.c
index dceddf7..d34db24 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/zomatcopy_rt.c
@@ -25,40 +25,48 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * Trans
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix,iy;
+ BLASLONG i,j,ia,ib;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- ix = 0;
- iy = 0;
+ aptr = a;
- while(i < n)
- {
+ lda *= 2;
+ ldb *= 2;
+ ib = 0;
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
+ for ( i=0; i<rows ; i++ )
+ {
+ bptr = &b[ib];
+ ia = 0;
+ for(j=0; j<cols; j++)
+ {
+ bptr[0] = alpha_r * aptr[ia] - alpha_i * aptr[ia+1];
+ bptr[1] = alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+ ia += 2;
+ bptr += ldb;
+ }
+ aptr += lda;
+ ib += 2;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/zomatcopy_rtc.c
similarity index 70%
copy from kernel/arm/axpy.c
copy to kernel/arm/zomatcopy_rtc.c
index dceddf7..a80ee6d 100644
--- a/kernel/arm/axpy.c
+++ b/kernel/arm/zomatcopy_rtc.c
@@ -25,40 +25,48 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
-
-
#include "common.h"
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+/*****************************************************
+ * 2014/06/09 Saar
+ *
+ * Order rowMajor
+ * Trans, conjugate
+ *
+******************************************************/
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
- BLASLONG i=0;
- BLASLONG ix,iy;
+ BLASLONG i,j,ia,ib;
+ FLOAT *aptr,*bptr;
- if ( n < 0 ) return(0);
- if ( da == 0.0 ) return(0);
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
- ix = 0;
- iy = 0;
+ aptr = a;
- while(i < n)
- {
+ lda *= 2;
+ ldb *= 2;
+ ib = 0;
- y[iy] += da * x[ix] ;
- ix += inc_x ;
- iy += inc_y ;
- i++ ;
+ for ( i=0; i<rows ; i++ )
+ {
+ bptr = &b[ib];
+ ia = 0;
+ for(j=0; j<cols; j++)
+ {
+ bptr[0] = alpha_r * aptr[ia] + alpha_i * aptr[ia+1];
+ bptr[1] = - alpha_r * aptr[ia+1] + alpha_i * aptr[ia];
+ ia += 2;
+ bptr += ldb;
+ }
+ aptr += lda;
+ ib += 2;
}
+
return(0);
}
-
+
diff --git a/kernel/arm/zrot.c b/kernel/arm/zrot.c
index 4a2f37f..356a4df 100644
--- a/kernel/arm/zrot.c
+++ b/kernel/arm/zrot.c
@@ -64,5 +64,5 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return(0);
}
-
+
diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c
index 569e250..f543edc 100644
--- a/kernel/arm/zscal.c
+++ b/kernel/arm/zscal.c
@@ -80,5 +80,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
return(0);
}
-
+
diff --git a/kernel/arm/zswap.c b/kernel/arm/zswap.c
index 4e3e73d..fcfb385 100644
--- a/kernel/arm/zswap.c
+++ b/kernel/arm/zswap.c
@@ -66,5 +66,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
return(0);
}
-
+
diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S
index 59039c3..109ee07 100644
--- a/kernel/arm/ztrmm_kernel_2x2_vfp.S
+++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S
@@ -90,7 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacd
#define KMAC_I fmacd
@@ -905,7 +905,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble _L1_BEGIN
_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -1010,7 +1010,7 @@ _L2_M2_22:
b _L2_M2_44
-
+
_L2_M2_30:
tst L, #3
ble _L2_M2_40
@@ -1075,7 +1075,7 @@ _L2_M2_46:
subs L, L, #1
bne _L2_M2_46
-
+
_L2_M2_100:
SAVE2x2
@@ -1164,10 +1164,10 @@ _L2_M1_22:
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
@@ -1177,7 +1177,7 @@ _L2_M1_42:
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
@@ -1208,7 +1208,7 @@ _L2_END:
lsl r4, r4, #5 // k * 2 * 8 * 2
add r3, r3, r4 // B = B + K * 4 * 8
mov BC, r3
-
+
#if !defined(LEFT)
ldr r3 , KK
add r3 , r3 , #2 // number of values in BO
@@ -1229,7 +1229,7 @@ _L1_BEGIN:
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1328,7 +1328,7 @@ _L1_M2_22:
b _L1_M2_44
-
+
_L1_M2_30:
tst L, #3
ble _L1_M2_40
@@ -1393,7 +1393,7 @@ _L1_M2_46:
subs L, L, #1
bne _L1_M2_46
-
+
_L1_M2_100:
SAVE2x1
@@ -1481,10 +1481,10 @@ _L1_M1_22:
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
@@ -1494,7 +1494,7 @@ _L1_M1_42:
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S
index 917ce61..761dbcc 100644
--- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S
@@ -84,7 +84,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define B_PRE 96
#define C_PRE 64
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FADD_R fsubd
#define FADD_I faddd
@@ -906,7 +906,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble _L1_BEGIN
_L2_BEGIN:
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
@@ -1011,7 +1011,7 @@ _L2_M2_22:
b _L2_M2_44
-
+
_L2_M2_30:
tst L, #3
ble _L2_M2_40
@@ -1076,7 +1076,7 @@ _L2_M2_46:
subs L, L, #1
bne _L2_M2_46
-
+
_L2_M2_100:
SAVE2x2
@@ -1165,10 +1165,10 @@ _L2_M1_22:
subs L, L, #1
bgt _L2_M1_22
-
+
_L2_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L2_M1_100
@@ -1178,7 +1178,7 @@ _L2_M1_42:
subs L, L, #1
bgt _L2_M1_42
-
+
_L2_M1_100:
SAVE1x2
@@ -1209,7 +1209,7 @@ _L2_END:
lsl r4, r4, #5 // k * 2 * 8 * 2
add r3, r3, r4 // B = B + K * 4 * 8
mov BC, r3
-
+
#if !defined(LEFT)
ldr r3 , KK
add r3 , r3 , #2 // number of values in BO
@@ -1230,7 +1230,7 @@ _L1_BEGIN:
tst J , #1
ble _L999
-
+
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
@@ -1329,7 +1329,7 @@ _L1_M2_22:
b _L1_M2_44
-
+
_L1_M2_30:
tst L, #3
ble _L1_M2_40
@@ -1394,7 +1394,7 @@ _L1_M2_46:
subs L, L, #1
bne _L1_M2_46
-
+
_L1_M2_100:
SAVE2x1
@@ -1482,10 +1482,10 @@ _L1_M1_22:
subs L, L, #1
bgt _L1_M1_22
-
+
_L1_M1_40:
-
+
ands L , K1, #7 // L = L % 8
ble _L1_M1_100
@@ -1495,7 +1495,7 @@ _L1_M1_42:
subs L, L, #1
bgt _L1_M1_42
-
+
_L1_M1_100:
SAVE1x1
diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index ecf278c..27157da 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -85,13 +85,13 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
-SGEMMKERNEL = ../generic/gemmkernel_2x2.c
+SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
-DGEMMKERNEL = ../generic/gemmkernel_2x2.c
+DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
diff --git a/kernel/arm/dot.c b/kernel/generic/dot.c
similarity index 73%
copy from kernel/arm/dot.c
copy to kernel/generic/dot.c
index 30490e2..bc07bc7 100644
--- a/kernel/arm/dot.c
+++ b/kernel/generic/dot.c
@@ -1,5 +1,5 @@
/***************************************************************************
-Copyright (c) 2013, The OpenBLAS Project
+Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-/**************************************************************************************
-* 2013/09/14 Saar
-* BLASTEST float : OK
-* BLASTEST double : OK
-* CTEST : OK
-* TEST : OK
-*
-**************************************************************************************/
#include "common.h"
@@ -44,14 +36,62 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
+
+#if defined(DSDOT)
double dot = 0.0 ;
+#else
+ FLOAT dot = 0.0 ;
+#endif
if ( n < 0 ) return(dot);
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ int n1 = n & -4;
+
+ while(i < n1)
+ {
+
+#if defined(DSDOT)
+ dot += (double) y[i] * (double) x[i]
+ + (double) y[i+1] * (double) x[i+1]
+ + (double) y[i+2] * (double) x[i+2]
+ + (double) y[i+3] * (double) x[i+3] ;
+#else
+ dot += y[i] * x[i]
+ + y[i+1] * x[i+1]
+ + y[i+2] * x[i+2]
+ + y[i+3] * x[i+3] ;
+#endif
+ i+=4 ;
+
+ }
+
+ while(i < n)
+ {
+
+#if defined(DSDOT)
+ dot += (double) y[i] * (double) x[i] ;
+#else
+ dot += y[i] * x[i] ;
+#endif
+ i++ ;
+
+ }
+ return(dot);
+
+
+ }
+
while(i < n)
{
+#if defined(DSDOT)
+ dot += (double) y[iy] * (double) x[ix] ;
+#else
dot += y[iy] * x[ix] ;
+#endif
ix += inc_x ;
iy += inc_y ;
i++ ;
@@ -60,5 +100,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
return(dot);
}
-
+
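
The rework of the generic dot kernel above adds two things: a 4-way
unrolled fast path for unit strides, and double-precision accumulation
of the single-precision products when DSDOT is defined. The sketch
below reproduces that unit-stride path outside the build; the function
name and the tiny test are illustrative assumptions.

#include <stdio.h>

static double sdot_accum_double(long n, const float *x, const float *y)
{
    double dot = 0.0;
    long i = 0, n1 = n & -4;            /* largest multiple of 4 <= n */
    for (; i < n1; i += 4)              /* 4-way unrolled fast path */
        dot += (double)y[i]   * x[i]
             + (double)y[i+1] * x[i+1]
             + (double)y[i+2] * x[i+2]
             + (double)y[i+3] * x[i+3];
    for (; i < n; i++)                  /* remainder */
        dot += (double)y[i] * x[i];
    return dot;
}

int main(void)
{
    float x[10], y[10];
    for (int i = 0; i < 10; i++) { x[i] = 1.0f; y[i] = 0.1f; }
    /* accumulating in double keeps the result close to 1.0 even
       though 0.1f is not exactly representable */
    printf("%.9f\n", sdot_accum_double(10, x, y));
    return 0;
}
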
diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c
index 525ff94..c4e4f7a 100644
--- a/kernel/generic/gemm_beta.c
+++ b/kernel/generic/gemm_beta.c
@@ -38,7 +38,7 @@
#include "common.h"
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
FLOAT *c, BLASLONG ldc){
@@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
do {
c_offset1 = c_offset;
c_offset += ldc;
-
+
i = (m >> 3);
if (i > 0){
do {
diff --git a/kernel/generic/gemm_ncopy_1.c b/kernel/generic/gemm_ncopy_1.c
index e990de7..ac99037 100644
--- a/kernel/generic/gemm_ncopy_1.c
+++ b/kernel/generic/gemm_ncopy_1.c
@@ -55,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset += lda;
i = (m >> 3);
-
+
if (i > 0){
do {
*(b_offset + 0) = *(a_offset1 + 0);
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
}
i = (m & 7);
-
+
if (i > 0){
do {
*(b_offset + 0) = *(a_offset1 + 0);
diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c
index 4a9269e..9bd40f1 100644
--- a/kernel/generic/gemm_ncopy_16.c
+++ b/kernel/generic/gemm_ncopy_16.c
@@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset = a;
boffset = b;
-
+
j = (n >> 4);
if (j > 0){
do{
@@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset15 = aoffset14 + lda;
aoffset16 = aoffset15 + lda;
aoffset += 16 * lda;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
ctemp07 = *(aoffset4 + 0);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp10 = *(aoffset5 + 1);
ctemp11 = *(aoffset6 + 0);
ctemp12 = *(aoffset6 + 1);
-
+
ctemp13 = *(aoffset7 + 0);
ctemp14 = *(aoffset7 + 1);
ctemp15 = *(aoffset8 + 0);
@@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp18 = *(aoffset9 + 1);
ctemp19 = *(aoffset10 + 0);
ctemp20 = *(aoffset10 + 1);
-
+
ctemp21 = *(aoffset11 + 0);
ctemp22 = *(aoffset11 + 1);
ctemp23 = *(aoffset12 + 0);
@@ -119,7 +119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp26 = *(aoffset13 + 1);
ctemp27 = *(aoffset14 + 0);
ctemp28 = *(aoffset14 + 1);
-
+
ctemp29 = *(aoffset15 + 0);
ctemp30 = *(aoffset15 + 1);
ctemp31 = *(aoffset16 + 0);
@@ -133,7 +133,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp11;
*(boffset + 6) = ctemp13;
*(boffset + 7) = ctemp15;
-
+
*(boffset + 8) = ctemp17;
*(boffset + 9) = ctemp19;
*(boffset + 10) = ctemp21;
@@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp27;
*(boffset + 14) = ctemp29;
*(boffset + 15) = ctemp31;
-
+
*(boffset + 16) = ctemp02;
*(boffset + 17) = ctemp04;
*(boffset + 18) = ctemp06;
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp11 = *(aoffset6 + 0);
ctemp13 = *(aoffset7 + 0);
ctemp15 = *(aoffset8 + 0);
-
+
ctemp17 = *(aoffset9 + 0);
ctemp19 = *(aoffset10 + 0);
ctemp21 = *(aoffset11 + 0);
@@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp27 = *(aoffset14 + 0);
ctemp29 = *(aoffset15 + 0);
ctemp31 = *(aoffset16 + 0);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp03;
*(boffset + 2) = ctemp05;
@@ -211,7 +211,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp11;
*(boffset + 6) = ctemp13;
*(boffset + 7) = ctemp15;
-
+
*(boffset + 8) = ctemp17;
*(boffset + 9) = ctemp19;
*(boffset + 10) = ctemp21;
@@ -220,13 +220,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp27;
*(boffset + 14) = ctemp29;
*(boffset + 15) = ctemp31;
-
+
boffset += 16;
}
j--;
}while(j > 0);
} /* end of if(j > 0) */
-
+
if (n & 8){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
@@ -237,7 +237,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -245,22 +245,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
-
+
ctemp09 = *(aoffset5 + 0);
ctemp10 = *(aoffset5 + 1);
ctemp11 = *(aoffset6 + 0);
ctemp12 = *(aoffset6 + 1);
-
+
ctemp13 = *(aoffset7 + 0);
ctemp14 = *(aoffset7 + 1);
ctemp15 = *(aoffset8 + 0);
ctemp16 = *(aoffset8 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp03;
*(boffset + 2) = ctemp05;
@@ -269,7 +269,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp11;
*(boffset + 6) = ctemp13;
*(boffset + 7) = ctemp15;
-
+
*(boffset + 8) = ctemp02;
*(boffset + 9) = ctemp04;
*(boffset + 10) = ctemp06;
@@ -278,7 +278,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp12;
*(boffset + 14) = ctemp14;
*(boffset + 15) = ctemp16;
-
+
aoffset1 += 2;
aoffset2 += 2;
aoffset3 += 2;
@@ -287,13 +287,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset6 += 2;
aoffset7 += 2;
aoffset8 += 2;
-
+
boffset += 16;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp03 = *(aoffset2 + 0);
@@ -303,7 +303,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp11 = *(aoffset6 + 0);
ctemp13 = *(aoffset7 + 0);
ctemp15 = *(aoffset8 + 0);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp03;
*(boffset + 2) = ctemp05;
@@ -312,7 +312,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp11;
*(boffset + 6) = ctemp13;
*(boffset + 7) = ctemp15;
-
+
boffset += 8;
}
}
@@ -323,7 +323,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -331,12 +331,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp03;
*(boffset + 2) = ctemp05;
@@ -345,23 +345,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp04;
*(boffset + 6) = ctemp06;
*(boffset + 7) = ctemp08;
-
+
aoffset1 += 2;
aoffset2 += 2;
aoffset3 += 2;
aoffset4 += 2;
boffset += 8;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp03 = *(aoffset2 + 0);
ctemp05 = *(aoffset3 + 0);
ctemp07 = *(aoffset4 + 0);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp03;
*(boffset + 2) = ctemp05;
@@ -374,7 +374,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -382,7 +382,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp03;
*(boffset + 2) = ctemp02;
@@ -391,15 +391,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 += 2;
aoffset2 += 2;
boffset += 4;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp03 = *(aoffset2 + 0);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp03;
boffset += 2;
@@ -408,26 +408,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (n & 1){
aoffset1 = aoffset;
-
+
i = (m >> 1);
if (i > 0){
do{
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
aoffset1 += 2;
boffset += 2;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
-
+
*(boffset + 0) = ctemp01;
boffset += 1;
}
diff --git a/kernel/generic/gemm_ncopy_2.c b/kernel/generic/gemm_ncopy_2.c
index 0ec807c..b728c71 100644
--- a/kernel/generic/gemm_ncopy_2.c
+++ b/kernel/generic/gemm_ncopy_2.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset += 2 * lda;
i = (m >> 2);
-
+
if (i > 0){
do {
*(b_offset + 0) = *(a_offset1 + 0);
@@ -75,7 +75,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
}
i = (m & 3);
-
+
if (i > 0){
do {
*(b_offset + 0) = *(a_offset1 + 0);
@@ -108,9 +108,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
i --;
} while (i > 0);
}
-
+
i = (m & 7);
-
+
if (i > 0){
do {
*(b_offset + 0) = *(a_offset + 0);
diff --git a/kernel/generic/gemm_ncopy_4.c b/kernel/generic/gemm_ncopy_4.c
index 1ecb93c..1551b03 100644
--- a/kernel/generic/gemm_ncopy_4.c
+++ b/kernel/generic/gemm_ncopy_4.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset = a;
b_offset = b;
-
+
j = (n >> 2);
if (j > 0){
do{
@@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -68,47 +68,47 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
ctemp9 = *(a_offset3 + 0);
ctemp10 = *(a_offset3 + 1);
ctemp11 = *(a_offset3 + 2);
ctemp12 = *(a_offset3 + 3);
-
+
ctemp13 = *(a_offset4 + 0);
ctemp14 = *(a_offset4 + 1);
ctemp15 = *(a_offset4 + 2);
ctemp16 = *(a_offset4 + 3);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp5;
*(b_offset + 2) = ctemp9;
*(b_offset + 3) = ctemp13;
-
+
*(b_offset + 4) = ctemp2;
*(b_offset + 5) = ctemp6;
*(b_offset + 6) = ctemp10;
*(b_offset + 7) = ctemp14;
-
+
*(b_offset + 8) = ctemp3;
*(b_offset + 9) = ctemp7;
*(b_offset + 10) = ctemp11;
*(b_offset + 11) = ctemp15;
-
+
*(b_offset + 12) = ctemp4;
*(b_offset + 13) = ctemp8;
*(b_offset + 14) = ctemp12;
*(b_offset + 15) = ctemp16;
-
+
a_offset1 += 4;
a_offset2 += 4;
a_offset3 += 4;
a_offset4 += 4;
-
+
b_offset += 16;
i --;
}while(i > 0);
@@ -121,17 +121,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp5 = *(a_offset2 + 0);
ctemp9 = *(a_offset3 + 0);
ctemp13 = *(a_offset4 + 0);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp5;
*(b_offset + 2) = ctemp9;
*(b_offset + 3) = ctemp13;
-
+
a_offset1 ++;
a_offset2 ++;
a_offset3 ++;
a_offset4 ++;
-
+
b_offset += 4;
i --;
}while(i > 0);
@@ -139,12 +139,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
j--;
}while(j > 0);
} /* end of if(j > 0) */
-
+
if (n & 2){
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -152,38 +152,38 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp5;
*(b_offset + 2) = ctemp2;
*(b_offset + 3) = ctemp6;
-
+
*(b_offset + 4) = ctemp3;
*(b_offset + 5) = ctemp7;
*(b_offset + 6) = ctemp4;
*(b_offset + 7) = ctemp8;
-
+
a_offset1 += 4;
a_offset2 += 4;
b_offset += 8;
i --;
}while(i > 0);
}
-
+
i = (m & 3);
if (i > 0){
do{
ctemp1 = *(a_offset1 + 0);
ctemp5 = *(a_offset2 + 0);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp5;
-
+
a_offset1 ++;
a_offset2 ++;
b_offset += 2;
@@ -191,10 +191,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
}while(i > 0);
}
} /* end of if(j > 0) */
-
+
if (n & 1){
a_offset1 = a_offset;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -202,18 +202,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp2;
*(b_offset + 2) = ctemp3;
*(b_offset + 3) = ctemp4;
-
+
a_offset1 += 4;
b_offset += 4;
i --;
}while(i > 0);
}
-
+
i = (m & 3);
if (i > 0){
do{
diff --git a/kernel/generic/gemm_ncopy_6.c b/kernel/generic/gemm_ncopy_6.c
index 1ecb93c..1551b03 100644
--- a/kernel/generic/gemm_ncopy_6.c
+++ b/kernel/generic/gemm_ncopy_6.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset = a;
b_offset = b;
-
+
j = (n >> 2);
if (j > 0){
do{
@@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -68,47 +68,47 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
ctemp9 = *(a_offset3 + 0);
ctemp10 = *(a_offset3 + 1);
ctemp11 = *(a_offset3 + 2);
ctemp12 = *(a_offset3 + 3);
-
+
ctemp13 = *(a_offset4 + 0);
ctemp14 = *(a_offset4 + 1);
ctemp15 = *(a_offset4 + 2);
ctemp16 = *(a_offset4 + 3);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp5;
*(b_offset + 2) = ctemp9;
*(b_offset + 3) = ctemp13;
-
+
*(b_offset + 4) = ctemp2;
*(b_offset + 5) = ctemp6;
*(b_offset + 6) = ctemp10;
*(b_offset + 7) = ctemp14;
-
+
*(b_offset + 8) = ctemp3;
*(b_offset + 9) = ctemp7;
*(b_offset + 10) = ctemp11;
*(b_offset + 11) = ctemp15;
-
+
*(b_offset + 12) = ctemp4;
*(b_offset + 13) = ctemp8;
*(b_offset + 14) = ctemp12;
*(b_offset + 15) = ctemp16;
-
+
a_offset1 += 4;
a_offset2 += 4;
a_offset3 += 4;
a_offset4 += 4;
-
+
b_offset += 16;
i --;
}while(i > 0);
@@ -121,17 +121,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp5 = *(a_offset2 + 0);
ctemp9 = *(a_offset3 + 0);
ctemp13 = *(a_offset4 + 0);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp5;
*(b_offset + 2) = ctemp9;
*(b_offset + 3) = ctemp13;
-
+
a_offset1 ++;
a_offset2 ++;
a_offset3 ++;
a_offset4 ++;
-
+
b_offset += 4;
i --;
}while(i > 0);
@@ -139,12 +139,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
j--;
}while(j > 0);
} /* end of if(j > 0) */
-
+
if (n & 2){
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -152,38 +152,38 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp5;
*(b_offset + 2) = ctemp2;
*(b_offset + 3) = ctemp6;
-
+
*(b_offset + 4) = ctemp3;
*(b_offset + 5) = ctemp7;
*(b_offset + 6) = ctemp4;
*(b_offset + 7) = ctemp8;
-
+
a_offset1 += 4;
a_offset2 += 4;
b_offset += 8;
i --;
}while(i > 0);
}
-
+
i = (m & 3);
if (i > 0){
do{
ctemp1 = *(a_offset1 + 0);
ctemp5 = *(a_offset2 + 0);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp5;
-
+
a_offset1 ++;
a_offset2 ++;
b_offset += 2;
@@ -191,10 +191,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
}while(i > 0);
}
} /* end of if(j > 0) */
-
+
if (n & 1){
a_offset1 = a_offset;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -202,18 +202,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp2;
*(b_offset + 2) = ctemp3;
*(b_offset + 3) = ctemp4;
-
+
a_offset1 += 4;
b_offset += 4;
i --;
}while(i > 0);
}
-
+
i = (m & 3);
if (i > 0){
do{
diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c
index bdaaba1..a49a778 100644
--- a/kernel/generic/gemm_ncopy_8.c
+++ b/kernel/generic/gemm_ncopy_8.c
@@ -67,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset = a;
boffset = b;
-
+
j = (n >> 3);
if (j > 0){
do{
@@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
-
+
i = (m >> 3);
if (i > 0){
do{
@@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset2 + 5);
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
-
+
ctemp17 = *(aoffset3 + 0);
ctemp18 = *(aoffset3 + 1);
ctemp19 = *(aoffset3 + 2);
@@ -110,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp22 = *(aoffset3 + 5);
ctemp23 = *(aoffset3 + 6);
ctemp24 = *(aoffset3 + 7);
-
+
ctemp25 = *(aoffset4 + 0);
ctemp26 = *(aoffset4 + 1);
ctemp27 = *(aoffset4 + 2);
@@ -119,7 +119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp30 = *(aoffset4 + 5);
ctemp31 = *(aoffset4 + 6);
ctemp32 = *(aoffset4 + 7);
-
+
ctemp33 = *(aoffset5 + 0);
ctemp34 = *(aoffset5 + 1);
ctemp35 = *(aoffset5 + 2);
@@ -128,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp38 = *(aoffset5 + 5);
ctemp39 = *(aoffset5 + 6);
ctemp40 = *(aoffset5 + 7);
-
+
ctemp41 = *(aoffset6 + 0);
ctemp42 = *(aoffset6 + 1);
ctemp43 = *(aoffset6 + 2);
@@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp46 = *(aoffset6 + 5);
ctemp47 = *(aoffset6 + 6);
ctemp48 = *(aoffset6 + 7);
-
+
ctemp49 = *(aoffset7 + 0);
ctemp50 = *(aoffset7 + 1);
ctemp51 = *(aoffset7 + 2);
@@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp54 = *(aoffset7 + 5);
ctemp55 = *(aoffset7 + 6);
ctemp56 = *(aoffset7 + 7);
-
+
ctemp57 = *(aoffset8 + 0);
ctemp58 = *(aoffset8 + 1);
ctemp59 = *(aoffset8 + 2);
@@ -155,7 +155,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp62 = *(aoffset8 + 5);
ctemp63 = *(aoffset8 + 6);
ctemp64 = *(aoffset8 + 7);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp09;
*(boffset + 2) = ctemp17;
@@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp41;
*(boffset + 6) = ctemp49;
*(boffset + 7) = ctemp57;
-
+
*(boffset + 8) = ctemp02;
*(boffset + 9) = ctemp10;
*(boffset + 10) = ctemp18;
@@ -173,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp42;
*(boffset + 14) = ctemp50;
*(boffset + 15) = ctemp58;
-
+
*(boffset + 16) = ctemp03;
*(boffset + 17) = ctemp11;
*(boffset + 18) = ctemp19;
@@ -252,7 +252,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp41 = *(aoffset6 + 0);
ctemp49 = *(aoffset7 + 0);
ctemp57 = *(aoffset8 + 0);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp09;
*(boffset + 2) = ctemp17;
@@ -270,7 +270,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset6 ++;
aoffset7 ++;
aoffset8 ++;
-
+
boffset += 8;
i --;
}while(i > 0);
@@ -278,14 +278,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
j--;
}while(j > 0);
} /* end of if(j > 0) */
-
+
if (n & 4){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -293,42 +293,42 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
-
+
ctemp09 = *(aoffset3 + 0);
ctemp10 = *(aoffset3 + 1);
ctemp11 = *(aoffset3 + 2);
ctemp12 = *(aoffset3 + 3);
-
+
ctemp13 = *(aoffset4 + 0);
ctemp14 = *(aoffset4 + 1);
ctemp15 = *(aoffset4 + 2);
ctemp16 = *(aoffset4 + 3);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp05;
*(boffset + 2) = ctemp09;
*(boffset + 3) = ctemp13;
-
+
*(boffset + 4) = ctemp02;
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp10;
*(boffset + 7) = ctemp14;
-
+
*(boffset + 8) = ctemp03;
*(boffset + 9) = ctemp07;
*(boffset + 10) = ctemp11;
*(boffset + 11) = ctemp15;
-
+
*(boffset + 12) = ctemp04;
*(boffset + 13) = ctemp08;
*(boffset + 14) = ctemp12;
*(boffset + 15) = ctemp16;
-
+
aoffset1 += 4;
aoffset2 += 4;
aoffset3 += 4;
@@ -337,7 +337,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
i --;
}while(i > 0);
}
-
+
i = (m & 3);
if (i > 0){
do{
@@ -345,7 +345,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset2 + 0);
ctemp03 = *(aoffset3 + 0);
ctemp04 = *(aoffset4 + 0);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -366,7 +366,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -374,26 +374,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp03;
*(boffset + 2) = ctemp02;
*(boffset + 3) = ctemp04;
-
+
aoffset1 += 2;
aoffset2 += 2;
boffset += 4;
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset2 + 0);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
-
+
aoffset1 ++;
aoffset2 ++;
boffset += 2;
@@ -402,7 +402,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (n & 1){
aoffset1 = aoffset;
-
+
i = m;
if (i > 0){
do{
@@ -415,7 +415,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
i --;
}while(i > 0);
}
-
+
} /* end of if(j > 0) */
return 0;
diff --git a/kernel/generic/gemm_tcopy_1.c b/kernel/generic/gemm_tcopy_1.c
index c0c8bd0..d0018bf 100644
--- a/kernel/generic/gemm_tcopy_1.c
+++ b/kernel/generic/gemm_tcopy_1.c
@@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
b_offset1 = b_offset;
b_offset ++;
-
+
j = n;
if (j > 0) {
do {
diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c
index e573225..6528d94 100644
--- a/kernel/generic/gemm_tcopy_16.c
+++ b/kernel/generic/gemm_tcopy_16.c
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 16;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
*(boffset + 8) = ctemp09;
*(boffset + 9) = ctemp10;
*(boffset + 10) = ctemp11;
@@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp14;
*(boffset + 14) = ctemp15;
*(boffset + 15) = ctemp16;
-
+
*(boffset + 16) = ctemp17;
*(boffset + 17) = ctemp18;
*(boffset + 18) = ctemp19;
@@ -168,7 +168,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset1 + 13);
ctemp15 = *(aoffset1 + 14);
ctemp16 = *(aoffset1 + 15);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
*(boffset + 8) = ctemp09;
*(boffset + 9) = ctemp10;
*(boffset + 10) = ctemp11;
@@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 8;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -228,7 +228,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
*(boffset + 8) = ctemp09;
*(boffset + 9) = ctemp10;
*(boffset + 10) = ctemp11;
@@ -237,15 +237,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp14;
*(boffset + 14) = ctemp15;
*(boffset + 15) = ctemp16;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 16;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -255,7 +255,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
boffset += 8;
}
}
@@ -273,7 +273,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 4;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -295,15 +295,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 8;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
*(boffset + 3) = ctemp04;
-
+
boffset += 4;
}
}
@@ -323,7 +323,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 2;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -336,15 +336,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
*(boffset + 3) = ctemp04;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 4;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -358,7 +358,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (n & 1){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -371,11 +371,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 2;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
*(boffset + 0) = ctemp01;
diff --git a/kernel/generic/gemm_tcopy_2.c b/kernel/generic/gemm_tcopy_2.c
index 0aa9c2e..5695b13 100644
--- a/kernel/generic/gemm_tcopy_2.c
+++ b/kernel/generic/gemm_tcopy_2.c
@@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
b_offset1 = b_offset;
b_offset += 4;
-
+
j = (n >> 1);
if (j > 0){
do {
@@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
j--;
} while (j > 0);
}
-
+
if (n & 1){
*(b_offset2 + 0) = *(a_offset + 0);
}
diff --git a/kernel/generic/gemm_tcopy_4.c b/kernel/generic/gemm_tcopy_4.c
index bd32090..df4c221 100644
--- a/kernel/generic/gemm_tcopy_4.c
+++ b/kernel/generic/gemm_tcopy_4.c
@@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
b_offset1 = b_offset;
b_offset += 16;
@@ -75,17 +75,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
ctemp9 = *(a_offset3 + 0);
ctemp10 = *(a_offset3 + 1);
ctemp11 = *(a_offset3 + 2);
ctemp12 = *(a_offset3 + 3);
-
+
ctemp13 = *(a_offset4 + 0);
ctemp14 = *(a_offset4 + 1);
ctemp15 = *(a_offset4 + 2);
@@ -95,27 +95,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset2 += 4;
a_offset3 += 4;
a_offset4 += 4;
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
*(b_offset1 + 2) = ctemp3;
*(b_offset1 + 3) = ctemp4;
-
+
*(b_offset1 + 4) = ctemp5;
*(b_offset1 + 5) = ctemp6;
*(b_offset1 + 6) = ctemp7;
*(b_offset1 + 7) = ctemp8;
-
+
*(b_offset1 + 8) = ctemp9;
*(b_offset1 + 9) = ctemp10;
*(b_offset1 + 10) = ctemp11;
*(b_offset1 + 11) = ctemp12;
-
+
*(b_offset1 + 12) = ctemp13;
*(b_offset1 + 13) = ctemp14;
*(b_offset1 + 14) = ctemp15;
*(b_offset1 + 15) = ctemp16;
-
+
b_offset1 += m * 4;
i --;
}while(i > 0);
@@ -127,28 +127,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp3 = *(a_offset2 + 0);
ctemp4 = *(a_offset2 + 1);
-
+
ctemp5 = *(a_offset3 + 0);
ctemp6 = *(a_offset3 + 1);
-
+
ctemp7 = *(a_offset4 + 0);
ctemp8 = *(a_offset4 + 1);
-
+
a_offset1 += 2;
a_offset2 += 2;
a_offset3 += 2;
a_offset4 += 2;
-
+
*(b_offset2 + 0) = ctemp1;
*(b_offset2 + 1) = ctemp2;
*(b_offset2 + 2) = ctemp3;
*(b_offset2 + 3) = ctemp4;
-
+
*(b_offset2 + 4) = ctemp5;
*(b_offset2 + 5) = ctemp6;
*(b_offset2 + 6) = ctemp7;
*(b_offset2 + 7) = ctemp8;
-
+
b_offset2 += 8;
}
@@ -157,12 +157,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset2 + 0);
ctemp3 = *(a_offset3 + 0);
ctemp4 = *(a_offset4 + 0);
-
+
*(b_offset3 + 0) = ctemp1;
*(b_offset3 + 1) = ctemp2;
*(b_offset3 + 2) = ctemp3;
*(b_offset3 + 3) = ctemp4;
-
+
b_offset3 += 4;
}
@@ -174,10 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
b_offset1 = b_offset;
b_offset += 8;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -185,20 +185,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
a_offset1 += 4;
a_offset2 += 4;
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
*(b_offset1 + 2) = ctemp3;
*(b_offset1 + 3) = ctemp4;
-
+
*(b_offset1 + 4) = ctemp5;
*(b_offset1 + 5) = ctemp6;
*(b_offset1 + 6) = ctemp7;
@@ -212,25 +212,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (n & 2) {
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset1 + 1);
-
+
ctemp3 = *(a_offset2 + 0);
ctemp4 = *(a_offset2 + 1);
-
+
a_offset1 += 2;
a_offset2 += 2;
-
+
*(b_offset2 + 0) = ctemp1;
*(b_offset2 + 1) = ctemp2;
*(b_offset2 + 2) = ctemp3;
*(b_offset2 + 3) = ctemp4;
-
+
b_offset2 += 4;
}
-
+
if (n & 1) {
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset2 + 0);
-
+
*(b_offset3 + 0) = ctemp1;
*(b_offset3 + 1) = ctemp2;
b_offset3 += 2;
@@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m & 1){
a_offset1 = a_offset;
b_offset1 = b_offset;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -248,9 +248,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
a_offset1 += 4;
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
*(b_offset1 + 2) = ctemp3;
@@ -266,11 +266,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset1 + 1);
a_offset1 += 2;
-
+
*(b_offset2 + 0) = ctemp1;
*(b_offset2 + 1) = ctemp2;
}
-
+
if (n & 1) {
ctemp1 = *(a_offset1 + 0);
*(b_offset3 + 0) = ctemp1;
diff --git a/kernel/generic/gemm_tcopy_6.c b/kernel/generic/gemm_tcopy_6.c
index bd32090..df4c221 100644
--- a/kernel/generic/gemm_tcopy_6.c
+++ b/kernel/generic/gemm_tcopy_6.c
@@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
b_offset1 = b_offset;
b_offset += 16;
@@ -75,17 +75,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
ctemp9 = *(a_offset3 + 0);
ctemp10 = *(a_offset3 + 1);
ctemp11 = *(a_offset3 + 2);
ctemp12 = *(a_offset3 + 3);
-
+
ctemp13 = *(a_offset4 + 0);
ctemp14 = *(a_offset4 + 1);
ctemp15 = *(a_offset4 + 2);
@@ -95,27 +95,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset2 += 4;
a_offset3 += 4;
a_offset4 += 4;
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
*(b_offset1 + 2) = ctemp3;
*(b_offset1 + 3) = ctemp4;
-
+
*(b_offset1 + 4) = ctemp5;
*(b_offset1 + 5) = ctemp6;
*(b_offset1 + 6) = ctemp7;
*(b_offset1 + 7) = ctemp8;
-
+
*(b_offset1 + 8) = ctemp9;
*(b_offset1 + 9) = ctemp10;
*(b_offset1 + 10) = ctemp11;
*(b_offset1 + 11) = ctemp12;
-
+
*(b_offset1 + 12) = ctemp13;
*(b_offset1 + 13) = ctemp14;
*(b_offset1 + 14) = ctemp15;
*(b_offset1 + 15) = ctemp16;
-
+
b_offset1 += m * 4;
i --;
}while(i > 0);
@@ -127,28 +127,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp3 = *(a_offset2 + 0);
ctemp4 = *(a_offset2 + 1);
-
+
ctemp5 = *(a_offset3 + 0);
ctemp6 = *(a_offset3 + 1);
-
+
ctemp7 = *(a_offset4 + 0);
ctemp8 = *(a_offset4 + 1);
-
+
a_offset1 += 2;
a_offset2 += 2;
a_offset3 += 2;
a_offset4 += 2;
-
+
*(b_offset2 + 0) = ctemp1;
*(b_offset2 + 1) = ctemp2;
*(b_offset2 + 2) = ctemp3;
*(b_offset2 + 3) = ctemp4;
-
+
*(b_offset2 + 4) = ctemp5;
*(b_offset2 + 5) = ctemp6;
*(b_offset2 + 6) = ctemp7;
*(b_offset2 + 7) = ctemp8;
-
+
b_offset2 += 8;
}
@@ -157,12 +157,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset2 + 0);
ctemp3 = *(a_offset3 + 0);
ctemp4 = *(a_offset4 + 0);
-
+
*(b_offset3 + 0) = ctemp1;
*(b_offset3 + 1) = ctemp2;
*(b_offset3 + 2) = ctemp3;
*(b_offset3 + 3) = ctemp4;
-
+
b_offset3 += 4;
}
@@ -174,10 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
b_offset1 = b_offset;
b_offset += 8;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -185,20 +185,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
a_offset1 += 4;
a_offset2 += 4;
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
*(b_offset1 + 2) = ctemp3;
*(b_offset1 + 3) = ctemp4;
-
+
*(b_offset1 + 4) = ctemp5;
*(b_offset1 + 5) = ctemp6;
*(b_offset1 + 6) = ctemp7;
@@ -212,25 +212,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (n & 2) {
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset1 + 1);
-
+
ctemp3 = *(a_offset2 + 0);
ctemp4 = *(a_offset2 + 1);
-
+
a_offset1 += 2;
a_offset2 += 2;
-
+
*(b_offset2 + 0) = ctemp1;
*(b_offset2 + 1) = ctemp2;
*(b_offset2 + 2) = ctemp3;
*(b_offset2 + 3) = ctemp4;
-
+
b_offset2 += 4;
}
-
+
if (n & 1) {
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset2 + 0);
-
+
*(b_offset3 + 0) = ctemp1;
*(b_offset3 + 1) = ctemp2;
b_offset3 += 2;
@@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m & 1){
a_offset1 = a_offset;
b_offset1 = b_offset;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -248,9 +248,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
a_offset1 += 4;
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
*(b_offset1 + 2) = ctemp3;
@@ -266,11 +266,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset1 + 1);
a_offset1 += 2;
-
+
*(b_offset2 + 0) = ctemp1;
*(b_offset2 + 1) = ctemp2;
}
-
+
if (n & 1) {
ctemp1 = *(a_offset1 + 0);
*(b_offset3 + 0) = ctemp1;
diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c
index 8f6e33c..9770d11 100644
--- a/kernel/generic/gemm_tcopy_8.c
+++ b/kernel/generic/gemm_tcopy_8.c
@@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
aoffset1 += 8;
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
aoffset2 += 8;
-
+
ctemp17 = *(aoffset3 + 0);
ctemp18 = *(aoffset3 + 1);
ctemp19 = *(aoffset3 + 2);
@@ -125,7 +125,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp23 = *(aoffset3 + 6);
ctemp24 = *(aoffset3 + 7);
aoffset3 += 8;
-
+
ctemp25 = *(aoffset4 + 0);
ctemp26 = *(aoffset4 + 1);
ctemp27 = *(aoffset4 + 2);
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp39 = *(aoffset5 + 6);
ctemp40 = *(aoffset5 + 7);
aoffset5 += 8;
-
+
ctemp41 = *(aoffset6 + 0);
ctemp42 = *(aoffset6 + 1);
ctemp43 = *(aoffset6 + 2);
@@ -155,7 +155,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp47 = *(aoffset6 + 6);
ctemp48 = *(aoffset6 + 7);
aoffset6 += 8;
-
+
ctemp49 = *(aoffset7 + 0);
ctemp50 = *(aoffset7 + 1);
ctemp51 = *(aoffset7 + 2);
@@ -165,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp55 = *(aoffset7 + 6);
ctemp56 = *(aoffset7 + 7);
aoffset7 += 8;
-
+
ctemp57 = *(aoffset8 + 0);
ctemp58 = *(aoffset8 + 1);
ctemp59 = *(aoffset8 + 2);
@@ -175,7 +175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp63 = *(aoffset8 + 6);
ctemp64 = *(aoffset8 + 7);
aoffset8 += 8;
-
+
*(boffset1 + 0) = ctemp01;
*(boffset1 + 1) = ctemp02;
*(boffset1 + 2) = ctemp03;
@@ -184,7 +184,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = ctemp06;
*(boffset1 + 6) = ctemp07;
*(boffset1 + 7) = ctemp08;
-
+
*(boffset1 + 8) = ctemp09;
*(boffset1 + 9) = ctemp10;
*(boffset1 + 10) = ctemp11;
@@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 21) = ctemp22;
*(boffset1 + 22) = ctemp23;
*(boffset1 + 23) = ctemp24;
-
+
*(boffset1 + 24) = ctemp25;
*(boffset1 + 25) = ctemp26;
*(boffset1 + 26) = ctemp27;
@@ -220,7 +220,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 37) = ctemp38;
*(boffset1 + 38) = ctemp39;
*(boffset1 + 39) = ctemp40;
-
+
*(boffset1 + 40) = ctemp41;
*(boffset1 + 41) = ctemp42;
*(boffset1 + 42) = ctemp43;
@@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 53) = ctemp54;
*(boffset1 + 54) = ctemp55;
*(boffset1 + 55) = ctemp56;
-
+
*(boffset1 + 56) = ctemp57;
*(boffset1 + 57) = ctemp58;
*(boffset1 + 58) = ctemp59;
@@ -247,7 +247,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 61) = ctemp62;
*(boffset1 + 62) = ctemp63;
*(boffset1 + 63) = ctemp64;
-
+
boffset1 += m * 8;
i --;
}while(i > 0);
@@ -259,49 +259,49 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
aoffset1 += 4;
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
aoffset2 += 4;
-
+
ctemp09 = *(aoffset3 + 0);
ctemp10 = *(aoffset3 + 1);
ctemp11 = *(aoffset3 + 2);
ctemp12 = *(aoffset3 + 3);
aoffset3 += 4;
-
+
ctemp13 = *(aoffset4 + 0);
ctemp14 = *(aoffset4 + 1);
ctemp15 = *(aoffset4 + 2);
ctemp16 = *(aoffset4 + 3);
aoffset4 += 4;
-
+
ctemp17 = *(aoffset5 + 0);
ctemp18 = *(aoffset5 + 1);
ctemp19 = *(aoffset5 + 2);
ctemp20 = *(aoffset5 + 3);
aoffset5 += 4;
-
+
ctemp21 = *(aoffset6 + 0);
ctemp22 = *(aoffset6 + 1);
ctemp23 = *(aoffset6 + 2);
ctemp24 = *(aoffset6 + 3);
aoffset6 += 4;
-
+
ctemp25 = *(aoffset7 + 0);
ctemp26 = *(aoffset7 + 1);
ctemp27 = *(aoffset7 + 2);
ctemp28 = *(aoffset7 + 3);
aoffset7 += 4;
-
+
ctemp29 = *(aoffset8 + 0);
ctemp30 = *(aoffset8 + 1);
ctemp31 = *(aoffset8 + 2);
ctemp32 = *(aoffset8 + 3);
aoffset8 += 4;
-
+
*(boffset2 + 0) = ctemp01;
*(boffset2 + 1) = ctemp02;
*(boffset2 + 2) = ctemp03;
@@ -343,35 +343,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
aoffset1 += 2;
-
+
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
aoffset2 += 2;
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
aoffset3 += 2;
-
+
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
aoffset4 += 2;
-
+
ctemp09 = *(aoffset5 + 0);
ctemp10 = *(aoffset5 + 1);
aoffset5 += 2;
-
+
ctemp11 = *(aoffset6 + 0);
ctemp12 = *(aoffset6 + 1);
aoffset6 += 2;
-
+
ctemp13 = *(aoffset7 + 0);
ctemp14 = *(aoffset7 + 1);
aoffset7 += 2;
-
+
ctemp15 = *(aoffset8 + 0);
ctemp16 = *(aoffset8 + 1);
aoffset8 += 2;
-
+
*(boffset3 + 0) = ctemp01;
*(boffset3 + 1) = ctemp02;
*(boffset3 + 2) = ctemp03;
@@ -408,7 +408,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset7 ++;
ctemp08 = *(aoffset8 + 0);
aoffset8 ++;
-
+
*(boffset4 + 0) = ctemp01;
*(boffset4 + 1) = ctemp02;
*(boffset4 + 2) = ctemp03;
@@ -431,10 +431,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
-
+
boffset1 = boffset;
boffset += 32;
-
+
i = (n >> 3);
if (i > 0){
@@ -448,7 +448,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
aoffset1 += 8;
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -458,7 +458,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
aoffset2 += 8;
-
+
ctemp17 = *(aoffset3 + 0);
ctemp18 = *(aoffset3 + 1);
ctemp19 = *(aoffset3 + 2);
@@ -468,7 +468,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp23 = *(aoffset3 + 6);
ctemp24 = *(aoffset3 + 7);
aoffset3 += 8;
-
+
ctemp25 = *(aoffset4 + 0);
ctemp26 = *(aoffset4 + 1);
ctemp27 = *(aoffset4 + 2);
@@ -478,7 +478,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp31 = *(aoffset4 + 6);
ctemp32 = *(aoffset4 + 7);
aoffset4 += 8;
-
+
*(boffset1 + 0) = ctemp01;
*(boffset1 + 1) = ctemp02;
*(boffset1 + 2) = ctemp03;
@@ -487,7 +487,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = ctemp06;
*(boffset1 + 6) = ctemp07;
*(boffset1 + 7) = ctemp08;
-
+
*(boffset1 + 8) = ctemp09;
*(boffset1 + 9) = ctemp10;
*(boffset1 + 10) = ctemp11;
@@ -496,7 +496,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 13) = ctemp14;
*(boffset1 + 14) = ctemp15;
*(boffset1 + 15) = ctemp16;
-
+
*(boffset1 + 16) = ctemp17;
*(boffset1 + 17) = ctemp18;
*(boffset1 + 18) = ctemp19;
@@ -526,25 +526,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
aoffset1 += 4;
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
aoffset2 += 4;
-
+
ctemp09 = *(aoffset3 + 0);
ctemp10 = *(aoffset3 + 1);
ctemp11 = *(aoffset3 + 2);
ctemp12 = *(aoffset3 + 3);
aoffset3 += 4;
-
+
ctemp13 = *(aoffset4 + 0);
ctemp14 = *(aoffset4 + 1);
ctemp15 = *(aoffset4 + 2);
ctemp16 = *(aoffset4 + 3);
aoffset4 += 4;
-
+
*(boffset2 + 0) = ctemp01;
*(boffset2 + 1) = ctemp02;
*(boffset2 + 2) = ctemp03;
@@ -553,7 +553,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 5) = ctemp06;
*(boffset2 + 6) = ctemp07;
*(boffset2 + 7) = ctemp08;
-
+
*(boffset2 + 8) = ctemp09;
*(boffset2 + 9) = ctemp10;
*(boffset2 + 10) = ctemp11;
@@ -564,24 +564,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 15) = ctemp16;
boffset2 += 16;
}
-
+
if (n & 2){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
aoffset1 += 2;
-
+
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
aoffset2 += 2;
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
aoffset3 += 2;
-
+
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
aoffset4 += 2;
-
+
*(boffset3 + 0) = ctemp01;
*(boffset3 + 1) = ctemp02;
*(boffset3 + 2) = ctemp03;
@@ -592,7 +592,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset3 + 7) = ctemp08;
boffset3 += 8;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
aoffset1 ++;
@@ -602,7 +602,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset3 ++;
ctemp04 = *(aoffset4 + 0);
aoffset4 ++;
-
+
*(boffset4 + 0) = ctemp01;
*(boffset4 + 1) = ctemp02;
*(boffset4 + 2) = ctemp03;
@@ -610,15 +610,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
boffset4 += 4;
}
}
-
+
if (m & 2){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
-
+
boffset1 = boffset;
boffset += 16;
-
+
i = (n >> 3);
if (i > 0){
do{
@@ -631,7 +631,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
aoffset1 += 8;
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -641,7 +641,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
aoffset2 += 8;
-
+
*(boffset1 + 0) = ctemp01;
*(boffset1 + 1) = ctemp02;
*(boffset1 + 2) = ctemp03;
@@ -650,7 +650,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = ctemp06;
*(boffset1 + 6) = ctemp07;
*(boffset1 + 7) = ctemp08;
-
+
*(boffset1 + 8) = ctemp09;
*(boffset1 + 9) = ctemp10;
*(boffset1 + 10) = ctemp11;
@@ -659,25 +659,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 13) = ctemp14;
*(boffset1 + 14) = ctemp15;
*(boffset1 + 15) = ctemp16;
-
+
boffset1 += 8 * m;
i --;
}while(i > 0);
}
-
+
if (n & 4){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
aoffset1 += 4;
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
aoffset2 += 4;
-
+
*(boffset2 + 0) = ctemp01;
*(boffset2 + 1) = ctemp02;
*(boffset2 + 2) = ctemp03;
@@ -688,29 +688,29 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 7) = ctemp08;
boffset2 += 8;
}
-
+
if (n & 2){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
aoffset1 += 2;
-
+
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
aoffset2 += 2;
-
+
*(boffset3 + 0) = ctemp01;
*(boffset3 + 1) = ctemp02;
*(boffset3 + 2) = ctemp03;
*(boffset3 + 3) = ctemp04;
boffset3 += 4;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
aoffset1 ++;
ctemp02 = *(aoffset2 + 0);
aoffset2 ++;
-
+
*(boffset4 + 0) = ctemp01;
*(boffset4 + 1) = ctemp02;
boffset4 += 2;
@@ -720,10 +720,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m & 1){
aoffset1 = aoffset;
aoffset += lda;
-
+
boffset1 = boffset;
boffset += 8;
-
+
i = (n >> 3);
if (i > 0){
do{
@@ -736,7 +736,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
aoffset1 += 8;
-
+
*(boffset1 + 0) = ctemp01;
*(boffset1 + 1) = ctemp02;
*(boffset1 + 2) = ctemp03;
@@ -745,7 +745,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = ctemp06;
*(boffset1 + 6) = ctemp07;
*(boffset1 + 7) = ctemp08;
-
+
boffset1 += 8 * m;
i --;
}while(i > 0);
@@ -774,7 +774,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset3 + 1) = ctemp02;
boffset3 += 2;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
aoffset1 ++;
diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c
index 3645ef1..01f1c67 100644
--- a/kernel/generic/gemmkernel_2x2.c
+++ b/kernel/generic/gemmkernel_2x2.c
@@ -3,24 +3,24 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
- )
+ )
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
- for (j=0; j<bn/2; j+=1)
+ for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+ldc;
ptrba = ba;
- for (i=0; i<bm/2; i+=1)
+ for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
- for (k=0; k<bk/4; k+=1)
+ for (k=0; k<bk/4; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
@@ -57,7 +57,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
ptrba = ptrba+8;
ptrbb = ptrbb+8;
}
- for (k=0; k<(bk&3); k+=1)
+ for (k=0; k<(bk&3); k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
@@ -81,12 +81,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0 = C0+2;
C1 = C1+2;
}
- for (i=0; i<(bm&1); i+=1)
+ for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
- for (k=0; k<bk; k+=1)
+ for (k=0; k<bk; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[2*0+0];
@@ -108,16 +108,16 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
i = (ldc<<1);
C = C+i;
}
- for (j=0; j<(bn&1); j+=1)
+ for (j=0; j<(bn&1); j+=1)
{
C0 = C;
ptrba = ba;
- for (i=0; i<bm/2; i+=1)
+ for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
- for (k=0; k<bk; k+=1)
+ for (k=0; k<bk; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[0+0];
@@ -133,11 +133,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[1] = C0[1]+res1;
C0 = C0+2;
}
- for (i=0; i<(bm&1); i+=1)
+ for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
- for (k=0; k<bk; k+=1)
+ for (k=0; k<bk; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[0+0];
diff --git a/kernel/generic/ger.c b/kernel/generic/ger.c
index 2438786..f11738c 100644
--- a/kernel/generic/ger.c
+++ b/kernel/generic/ger.c
@@ -39,7 +39,7 @@
#include <stdio.h>
#include "common.h"
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
FLOAT *x, BLASLONG incx,
FLOAT *y, BLASLONG incy,
FLOAT *a, BLASLONG lda, FLOAT *buffer){
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
COPY_K(m, x, incx, X, 1);
}
- while (n > 0) {
+ while (n > 0) {
AXPYU_K(m, 0, 0, alpha * *y, X, 1, a, 1, NULL, 0);
a += lda;
y += incy;
diff --git a/kernel/generic/laswp_ncopy_1.c b/kernel/generic/laswp_ncopy_1.c
index 4394474..90fe173 100644
--- a/kernel/generic/laswp_ncopy_1.c
+++ b/kernel/generic/laswp_ncopy_1.c
@@ -53,36 +53,36 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
k1 --;
ipiv += k1;
-
+
if (n <= 0) return 0;
-
-
+
+
j = n;
do {
piv = ipiv;
-
+
a1 = a + k1 + 1;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -93,7 +93,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A2;
@@ -108,7 +108,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 0) = B1;
*(buffer + 1) = A2;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = A1;
@@ -120,24 +120,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b2 = A2;
}
}
-
+
buffer += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
a1 += 2;
-
+
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *a1;
B1 = *b1;
-
+
if (a1 == b1) {
*(buffer + 0) = A1;
} else {
@@ -150,5 +150,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
} while (j > 0);
return 0;
-}
+}
diff --git a/kernel/generic/laswp_ncopy_2.c b/kernel/generic/laswp_ncopy_2.c
index 806a1e1..a29562d 100644
--- a/kernel/generic/laswp_ncopy_2.c
+++ b/kernel/generic/laswp_ncopy_2.c
@@ -58,27 +58,27 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
ipiv += k1;
if (n <= 0) return 0;
-
+
j = (n >> 1);
if (j > 0) {
do {
piv = ipiv;
-
+
a1 = a + k1 + 1;
a3 = a1 + 1 * lda;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
@@ -91,16 +91,16 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
A2 = *a2;
A3 = *a3;
A4 = *a4;
-
+
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -112,11 +112,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 1) = A3;
*(buffer + 2) = B2;
*(buffer + 3) = B4;
-
+
*b2 = A2;
*b4 = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A2;
@@ -139,7 +139,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 3) = A4;
*b1 = A1;
*b3 = A3;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B3;
@@ -158,30 +158,30 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b4 = A4;
}
}
-
+
buffer += 4;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
a1 += 2;
a3 += 2;
-
+
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *a1;
B1 = *b1;
A3 = *a3;
B3 = *b3;
-
+
if (a1 == b1) {
*(buffer + 0) = A1;
*(buffer + 1) = A3;
@@ -193,37 +193,37 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
buffer += 2;
}
-
+
a += 2 * lda;
j --;
} while (j > 0);
}
-
+
if (n & 1) {
piv = ipiv;
a1 = a + k1 + 1;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -234,7 +234,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A2;
@@ -249,7 +249,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 0) = B1;
*(buffer + 1) = A2;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = A1;
@@ -261,20 +261,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b2 = A2;
}
}
-
+
buffer += 2;
b1 = a + ip1;
b2 = a + ip2;
-
+
a1 += 2;
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *a1;
B1 = *b1;
@@ -289,5 +289,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
return 0;
-}
+}
diff --git a/kernel/generic/laswp_ncopy_4.c b/kernel/generic/laswp_ncopy_4.c
index 0736f07..761d158 100644
--- a/kernel/generic/laswp_ncopy_4.c
+++ b/kernel/generic/laswp_ncopy_4.c
@@ -69,7 +69,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
piv = ipiv;
a1 = a + k1 + 1;
-
+
a3 = a1 + 1 * lda;
a5 = a1 + 2 * lda;
a7 = a1 + 3 * lda;
@@ -77,10 +77,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
@@ -89,7 +89,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
b8 = b2 + 3 * lda;
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
@@ -117,11 +117,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
B6 = *b6;
B7 = *b7;
B8 = *b8;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -149,7 +149,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b6 = A6;
*b8 = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A2;
@@ -188,7 +188,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b3 = A3;
*b5 = A5;
*b7 = A7;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B3;
@@ -221,19 +221,19 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b8 = A8;
}
}
-
+
buffer += 8;
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
b6 = b2 + 2 * lda;
b7 = b1 + 3 * lda;
b8 = b2 + 3 * lda;
-
+
a1 += 2;
a3 += 2;
a5 += 2;
@@ -242,9 +242,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *a1;
B1 = *b1;
@@ -274,7 +274,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
a += 4 * lda;
-
+
j --;
} while (j > 0);
}
@@ -284,35 +284,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
a1 = a + k1 + 1;
a3 = a1 + 1 * lda;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *a1;
A2 = *a2;
A3 = *a3;
A4 = *a4;
-
+
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -328,7 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b2 = A2;
*b4 = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A2;
@@ -351,7 +351,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 3) = A4;
*b1 = A1;
*b3 = A3;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B3;
@@ -370,24 +370,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b4 = A4;
}
}
-
+
buffer += 4;
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
a1 += 2;
a3 += 2;
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *a1;
B1 = *b1;
@@ -405,7 +405,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
buffer += 2;
}
-
+
a += 2 * lda;
}
@@ -413,27 +413,27 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
piv = ipiv;
a1 = a + k1 + 1;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *a1;
A2 = *a2;
B1 = *b1;
B2 = *b2;
-
+
ip1 = *(piv + 0);
ip2 = *(piv + 1);
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -444,7 +444,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A2;
@@ -459,7 +459,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 0) = B1;
*(buffer + 1) = A2;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = A1;
@@ -471,20 +471,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b2 = A2;
}
}
-
+
buffer += 2;
b1 = a + ip1;
b2 = a + ip2;
-
+
a1 += 2;
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *a1;
B1 = *b1;
@@ -499,5 +499,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
return 0;
-}
+}
diff --git a/kernel/generic/laswp_ncopy_8.c b/kernel/generic/laswp_ncopy_8.c
index e08c8ce..bb7408c 100644
--- a/kernel/generic/laswp_ncopy_8.c
+++ b/kernel/generic/laswp_ncopy_8.c
@@ -77,7 +77,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
do {
ip = *piv;
piv ++;
-
+
dx1 = a + i;
dy1 = a + ip;
dx2 = a + i + lda * 1;
@@ -123,7 +123,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
btemp7 = *dy7;
atemp8 = *dx8;
btemp8 = *dy8;
-
+
if (ip != i) {
*dy1 = atemp1;
*dy2 = atemp2;
@@ -151,12 +151,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 6) = atemp7;
*(buffer + 7) = atemp8;
}
-
+
buffer += 8;
i++;
} while (i <= k2);
-
+
a += 8 * lda;
j --;
} while (j > 0);
@@ -164,10 +164,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
if (n & 4) {
piv = ipiv;
-
+
ip = *piv;
piv ++;
-
+
dx1 = a + k1;
dy1 = a + ip;
dx2 = a + k1 + lda * 1;
@@ -178,7 +178,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
dy4 = a + ip + lda * 3;
i = k1;
-
+
do {
atemp1 = *dx1;
atemp2 = *dx2;
@@ -189,7 +189,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
btemp2 = *dy2;
btemp3 = *dy3;
btemp4 = *dy4;
-
+
if (ip != i) {
*dy1 = atemp1;
*dy2 = atemp2;
@@ -205,10 +205,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 2) = atemp3;
*(buffer + 3) = atemp4;
}
-
+
ip = *piv;
piv ++;
-
+
i++;
dx1 = a + i;
dy1 = a + ip;
@@ -222,18 +222,18 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
buffer += 4;
} while (i <= k2);
-
+
a += 4 * lda;
}
if (n & 2) {
piv = ipiv;
-
+
i = k1;
do {
ip = *piv;
piv ++;
-
+
dx1 = a + i;
dy1 = a + ip;
dx2 = a + i + lda;
@@ -243,7 +243,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
btemp1 = *dy1;
atemp2 = *dx2;
btemp2 = *dy2;
-
+
if (ip != i) {
*dy1 = atemp1;
*dy2 = atemp2;
@@ -253,44 +253,44 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 0) = atemp1;
*(buffer + 1) = atemp2;
}
-
+
buffer += 2;
i++;
} while (i <= k2);
-
+
a += 2 * lda;
}
if (n & 1) {
piv = ipiv;
-
+
i = k1;
do {
ip = *piv;
piv ++;
-
+
dx1 = a + i;
dy1 = a + ip;
atemp1 = *dx1;
btemp1 = *dy1;
-
+
if (ip != i) {
*dy1 = atemp1;
*buffer = btemp1;
} else {
*buffer = atemp1;
}
-
+
buffer ++;
i++;
} while (i <= k2);
-
+
a += lda;
}
return 0;
-}
+}
diff --git a/kernel/generic/neg_tcopy_1.c b/kernel/generic/neg_tcopy_1.c
index 3845f04..ff2339c 100644
--- a/kernel/generic/neg_tcopy_1.c
+++ b/kernel/generic/neg_tcopy_1.c
@@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
b_offset1 = b_offset;
b_offset ++;
-
+
j = n;
if (j > 0) {
do {
diff --git a/kernel/generic/neg_tcopy_16.c b/kernel/generic/neg_tcopy_16.c
index 2d47b27..a93372a 100644
--- a/kernel/generic/neg_tcopy_16.c
+++ b/kernel/generic/neg_tcopy_16.c
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 16;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
*(boffset + 8) = -ctemp09;
*(boffset + 9) = -ctemp10;
*(boffset + 10) = -ctemp11;
@@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = -ctemp14;
*(boffset + 14) = -ctemp15;
*(boffset + 15) = -ctemp16;
-
+
*(boffset + 16) = -ctemp17;
*(boffset + 17) = -ctemp18;
*(boffset + 18) = -ctemp19;
@@ -168,7 +168,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset1 + 13);
ctemp15 = *(aoffset1 + 14);
ctemp16 = *(aoffset1 + 15);
-
+
*(boffset + 0) = -ctemp01;
*(boffset + 1) = -ctemp02;
*(boffset + 2) = -ctemp03;
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
*(boffset + 8) = -ctemp09;
*(boffset + 9) = -ctemp10;
*(boffset + 10) = -ctemp11;
@@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 8;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -228,7 +228,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
*(boffset + 8) = -ctemp09;
*(boffset + 9) = -ctemp10;
*(boffset + 10) = -ctemp11;
@@ -237,15 +237,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = -ctemp14;
*(boffset + 14) = -ctemp15;
*(boffset + 15) = -ctemp16;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 16;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -255,7 +255,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
*(boffset + 0) = -ctemp01;
*(boffset + 1) = -ctemp02;
*(boffset + 2) = -ctemp03;
@@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
boffset += 8;
}
}
@@ -273,7 +273,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 4;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -295,15 +295,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 8;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 1) = -ctemp02;
*(boffset + 2) = -ctemp03;
*(boffset + 3) = -ctemp04;
-
+
boffset += 4;
}
}
@@ -323,7 +323,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 2;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -336,15 +336,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 1) = -ctemp02;
*(boffset + 2) = -ctemp03;
*(boffset + 3) = -ctemp04;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 4;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -358,7 +358,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (n & 1){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -371,11 +371,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 2;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
*(boffset + 0) = -ctemp01;
diff --git a/kernel/generic/neg_tcopy_2.c b/kernel/generic/neg_tcopy_2.c
index e4dfa0b..572f6eb 100644
--- a/kernel/generic/neg_tcopy_2.c
+++ b/kernel/generic/neg_tcopy_2.c
@@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
b_offset1 = b_offset;
b_offset += 4;
-
+
j = (n >> 1);
if (j > 0){
do {
@@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
j--;
} while (j > 0);
}
-
+
if (n & 1){
*(b_offset2 + 0) = -*(a_offset + 0);
}
diff --git a/kernel/generic/neg_tcopy_4.c b/kernel/generic/neg_tcopy_4.c
index 9fb1dc7..a080e0e 100644
--- a/kernel/generic/neg_tcopy_4.c
+++ b/kernel/generic/neg_tcopy_4.c
@@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
b_offset1 = b_offset;
b_offset += 16;
@@ -75,17 +75,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
ctemp9 = *(a_offset3 + 0);
ctemp10 = *(a_offset3 + 1);
ctemp11 = *(a_offset3 + 2);
ctemp12 = *(a_offset3 + 3);
-
+
ctemp13 = *(a_offset4 + 0);
ctemp14 = *(a_offset4 + 1);
ctemp15 = *(a_offset4 + 2);
@@ -95,27 +95,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset2 += 4;
a_offset3 += 4;
a_offset4 += 4;
-
+
*(b_offset1 + 0) = -ctemp1;
*(b_offset1 + 1) = -ctemp2;
*(b_offset1 + 2) = -ctemp3;
*(b_offset1 + 3) = -ctemp4;
-
+
*(b_offset1 + 4) = -ctemp5;
*(b_offset1 + 5) = -ctemp6;
*(b_offset1 + 6) = -ctemp7;
*(b_offset1 + 7) = -ctemp8;
-
+
*(b_offset1 + 8) = -ctemp9;
*(b_offset1 + 9) = -ctemp10;
*(b_offset1 + 10) = -ctemp11;
*(b_offset1 + 11) = -ctemp12;
-
+
*(b_offset1 + 12) = -ctemp13;
*(b_offset1 + 13) = -ctemp14;
*(b_offset1 + 14) = -ctemp15;
*(b_offset1 + 15) = -ctemp16;
-
+
b_offset1 += m * 4;
i --;
}while(i > 0);
@@ -127,28 +127,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp3 = *(a_offset2 + 0);
ctemp4 = *(a_offset2 + 1);
-
+
ctemp5 = *(a_offset3 + 0);
ctemp6 = *(a_offset3 + 1);
-
+
ctemp7 = *(a_offset4 + 0);
ctemp8 = *(a_offset4 + 1);
-
+
a_offset1 += 2;
a_offset2 += 2;
a_offset3 += 2;
a_offset4 += 2;
-
+
*(b_offset2 + 0) = -ctemp1;
*(b_offset2 + 1) = -ctemp2;
*(b_offset2 + 2) = -ctemp3;
*(b_offset2 + 3) = -ctemp4;
-
+
*(b_offset2 + 4) = -ctemp5;
*(b_offset2 + 5) = -ctemp6;
*(b_offset2 + 6) = -ctemp7;
*(b_offset2 + 7) = -ctemp8;
-
+
b_offset2 += 8;
}
@@ -157,12 +157,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset2 + 0);
ctemp3 = *(a_offset3 + 0);
ctemp4 = *(a_offset4 + 0);
-
+
*(b_offset3 + 0) = -ctemp1;
*(b_offset3 + 1) = -ctemp2;
*(b_offset3 + 2) = -ctemp3;
*(b_offset3 + 3) = -ctemp4;
-
+
b_offset3 += 4;
}
@@ -174,10 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
b_offset1 = b_offset;
b_offset += 8;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -185,20 +185,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset2 + 0);
ctemp6 = *(a_offset2 + 1);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
a_offset1 += 4;
a_offset2 += 4;
-
+
*(b_offset1 + 0) = -ctemp1;
*(b_offset1 + 1) = -ctemp2;
*(b_offset1 + 2) = -ctemp3;
*(b_offset1 + 3) = -ctemp4;
-
+
*(b_offset1 + 4) = -ctemp5;
*(b_offset1 + 5) = -ctemp6;
*(b_offset1 + 6) = -ctemp7;
@@ -212,25 +212,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (n & 2) {
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset1 + 1);
-
+
ctemp3 = *(a_offset2 + 0);
ctemp4 = *(a_offset2 + 1);
-
+
a_offset1 += 2;
a_offset2 += 2;
-
+
*(b_offset2 + 0) = -ctemp1;
*(b_offset2 + 1) = -ctemp2;
*(b_offset2 + 2) = -ctemp3;
*(b_offset2 + 3) = -ctemp4;
-
+
b_offset2 += 4;
}
-
+
if (n & 1) {
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset2 + 0);
-
+
*(b_offset3 + 0) = -ctemp1;
*(b_offset3 + 1) = -ctemp2;
b_offset3 += 2;
@@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m & 1){
a_offset1 = a_offset;
b_offset1 = b_offset;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -248,9 +248,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
a_offset1 += 4;
-
+
*(b_offset1 + 0) = -ctemp1;
*(b_offset1 + 1) = -ctemp2;
*(b_offset1 + 2) = -ctemp3;
@@ -266,11 +266,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset1 + 1);
a_offset1 += 2;
-
+
*(b_offset2 + 0) = -ctemp1;
*(b_offset2 + 1) = -ctemp2;
}
-
+
if (n & 1) {
ctemp1 = *(a_offset1 + 0);
*(b_offset3 + 0) = -ctemp1;
diff --git a/kernel/generic/neg_tcopy_8.c b/kernel/generic/neg_tcopy_8.c
index 97fec3b..a45ecc7 100644
--- a/kernel/generic/neg_tcopy_8.c
+++ b/kernel/generic/neg_tcopy_8.c
@@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
aoffset1 += 8;
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
aoffset2 += 8;
-
+
ctemp17 = *(aoffset3 + 0);
ctemp18 = *(aoffset3 + 1);
ctemp19 = *(aoffset3 + 2);
@@ -125,7 +125,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp23 = *(aoffset3 + 6);
ctemp24 = *(aoffset3 + 7);
aoffset3 += 8;
-
+
ctemp25 = *(aoffset4 + 0);
ctemp26 = *(aoffset4 + 1);
ctemp27 = *(aoffset4 + 2);
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp39 = *(aoffset5 + 6);
ctemp40 = *(aoffset5 + 7);
aoffset5 += 8;
-
+
ctemp41 = *(aoffset6 + 0);
ctemp42 = *(aoffset6 + 1);
ctemp43 = *(aoffset6 + 2);
@@ -155,7 +155,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp47 = *(aoffset6 + 6);
ctemp48 = *(aoffset6 + 7);
aoffset6 += 8;
-
+
ctemp49 = *(aoffset7 + 0);
ctemp50 = *(aoffset7 + 1);
ctemp51 = *(aoffset7 + 2);
@@ -165,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp55 = *(aoffset7 + 6);
ctemp56 = *(aoffset7 + 7);
aoffset7 += 8;
-
+
ctemp57 = *(aoffset8 + 0);
ctemp58 = *(aoffset8 + 1);
ctemp59 = *(aoffset8 + 2);
@@ -175,7 +175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp63 = *(aoffset8 + 6);
ctemp64 = *(aoffset8 + 7);
aoffset8 += 8;
-
+
*(boffset1 + 0) = -ctemp01;
*(boffset1 + 1) = -ctemp02;
*(boffset1 + 2) = -ctemp03;
@@ -184,7 +184,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = -ctemp06;
*(boffset1 + 6) = -ctemp07;
*(boffset1 + 7) = -ctemp08;
-
+
*(boffset1 + 8) = -ctemp09;
*(boffset1 + 9) = -ctemp10;
*(boffset1 + 10) = -ctemp11;
@@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 21) = -ctemp22;
*(boffset1 + 22) = -ctemp23;
*(boffset1 + 23) = -ctemp24;
-
+
*(boffset1 + 24) = -ctemp25;
*(boffset1 + 25) = -ctemp26;
*(boffset1 + 26) = -ctemp27;
@@ -220,7 +220,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 37) = -ctemp38;
*(boffset1 + 38) = -ctemp39;
*(boffset1 + 39) = -ctemp40;
-
+
*(boffset1 + 40) = -ctemp41;
*(boffset1 + 41) = -ctemp42;
*(boffset1 + 42) = -ctemp43;
@@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 53) = -ctemp54;
*(boffset1 + 54) = -ctemp55;
*(boffset1 + 55) = -ctemp56;
-
+
*(boffset1 + 56) = -ctemp57;
*(boffset1 + 57) = -ctemp58;
*(boffset1 + 58) = -ctemp59;
@@ -247,7 +247,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 61) = -ctemp62;
*(boffset1 + 62) = -ctemp63;
*(boffset1 + 63) = -ctemp64;
-
+
boffset1 += m * 8;
i --;
}while(i > 0);
@@ -259,49 +259,49 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
aoffset1 += 4;
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
aoffset2 += 4;
-
+
ctemp09 = *(aoffset3 + 0);
ctemp10 = *(aoffset3 + 1);
ctemp11 = *(aoffset3 + 2);
ctemp12 = *(aoffset3 + 3);
aoffset3 += 4;
-
+
ctemp13 = *(aoffset4 + 0);
ctemp14 = *(aoffset4 + 1);
ctemp15 = *(aoffset4 + 2);
ctemp16 = *(aoffset4 + 3);
aoffset4 += 4;
-
+
ctemp17 = *(aoffset5 + 0);
ctemp18 = *(aoffset5 + 1);
ctemp19 = *(aoffset5 + 2);
ctemp20 = *(aoffset5 + 3);
aoffset5 += 4;
-
+
ctemp21 = *(aoffset6 + 0);
ctemp22 = *(aoffset6 + 1);
ctemp23 = *(aoffset6 + 2);
ctemp24 = *(aoffset6 + 3);
aoffset6 += 4;
-
+
ctemp25 = *(aoffset7 + 0);
ctemp26 = *(aoffset7 + 1);
ctemp27 = *(aoffset7 + 2);
ctemp28 = *(aoffset7 + 3);
aoffset7 += 4;
-
+
ctemp29 = *(aoffset8 + 0);
ctemp30 = *(aoffset8 + 1);
ctemp31 = *(aoffset8 + 2);
ctemp32 = *(aoffset8 + 3);
aoffset8 += 4;
-
+
*(boffset2 + 0) = -ctemp01;
*(boffset2 + 1) = -ctemp02;
*(boffset2 + 2) = -ctemp03;
@@ -343,35 +343,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
aoffset1 += 2;
-
+
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
aoffset2 += 2;
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
aoffset3 += 2;
-
+
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
aoffset4 += 2;
-
+
ctemp09 = *(aoffset5 + 0);
ctemp10 = *(aoffset5 + 1);
aoffset5 += 2;
-
+
ctemp11 = *(aoffset6 + 0);
ctemp12 = *(aoffset6 + 1);
aoffset6 += 2;
-
+
ctemp13 = *(aoffset7 + 0);
ctemp14 = *(aoffset7 + 1);
aoffset7 += 2;
-
+
ctemp15 = *(aoffset8 + 0);
ctemp16 = *(aoffset8 + 1);
aoffset8 += 2;
-
+
*(boffset3 + 0) = -ctemp01;
*(boffset3 + 1) = -ctemp02;
*(boffset3 + 2) = -ctemp03;
@@ -408,7 +408,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset7 ++;
ctemp08 = *(aoffset8 + 0);
aoffset8 ++;
-
+
*(boffset4 + 0) = -ctemp01;
*(boffset4 + 1) = -ctemp02;
*(boffset4 + 2) = -ctemp03;
@@ -431,10 +431,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
-
+
boffset1 = boffset;
boffset += 32;
-
+
i = (n >> 3);
if (i > 0){
@@ -448,7 +448,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
aoffset1 += 8;
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -458,7 +458,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
aoffset2 += 8;
-
+
ctemp17 = *(aoffset3 + 0);
ctemp18 = *(aoffset3 + 1);
ctemp19 = *(aoffset3 + 2);
@@ -468,7 +468,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp23 = *(aoffset3 + 6);
ctemp24 = *(aoffset3 + 7);
aoffset3 += 8;
-
+
ctemp25 = *(aoffset4 + 0);
ctemp26 = *(aoffset4 + 1);
ctemp27 = *(aoffset4 + 2);
@@ -478,7 +478,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp31 = *(aoffset4 + 6);
ctemp32 = *(aoffset4 + 7);
aoffset4 += 8;
-
+
*(boffset1 + 0) = -ctemp01;
*(boffset1 + 1) = -ctemp02;
*(boffset1 + 2) = -ctemp03;
@@ -487,7 +487,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = -ctemp06;
*(boffset1 + 6) = -ctemp07;
*(boffset1 + 7) = -ctemp08;
-
+
*(boffset1 + 8) = -ctemp09;
*(boffset1 + 9) = -ctemp10;
*(boffset1 + 10) = -ctemp11;
@@ -496,7 +496,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 13) = -ctemp14;
*(boffset1 + 14) = -ctemp15;
*(boffset1 + 15) = -ctemp16;
-
+
*(boffset1 + 16) = -ctemp17;
*(boffset1 + 17) = -ctemp18;
*(boffset1 + 18) = -ctemp19;
@@ -526,25 +526,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
aoffset1 += 4;
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
aoffset2 += 4;
-
+
ctemp09 = *(aoffset3 + 0);
ctemp10 = *(aoffset3 + 1);
ctemp11 = *(aoffset3 + 2);
ctemp12 = *(aoffset3 + 3);
aoffset3 += 4;
-
+
ctemp13 = *(aoffset4 + 0);
ctemp14 = *(aoffset4 + 1);
ctemp15 = *(aoffset4 + 2);
ctemp16 = *(aoffset4 + 3);
aoffset4 += 4;
-
+
*(boffset2 + 0) = -ctemp01;
*(boffset2 + 1) = -ctemp02;
*(boffset2 + 2) = -ctemp03;
@@ -553,7 +553,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 5) = -ctemp06;
*(boffset2 + 6) = -ctemp07;
*(boffset2 + 7) = -ctemp08;
-
+
*(boffset2 + 8) = -ctemp09;
*(boffset2 + 9) = -ctemp10;
*(boffset2 + 10) = -ctemp11;
@@ -564,24 +564,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 15) = -ctemp16;
boffset2 += 16;
}
-
+
if (n & 2){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
aoffset1 += 2;
-
+
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
aoffset2 += 2;
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
aoffset3 += 2;
-
+
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
aoffset4 += 2;
-
+
*(boffset3 + 0) = -ctemp01;
*(boffset3 + 1) = -ctemp02;
*(boffset3 + 2) = -ctemp03;
@@ -592,7 +592,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset3 + 7) = -ctemp08;
boffset3 += 8;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
aoffset1 ++;
@@ -602,7 +602,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset3 ++;
ctemp04 = *(aoffset4 + 0);
aoffset4 ++;
-
+
*(boffset4 + 0) = -ctemp01;
*(boffset4 + 1) = -ctemp02;
*(boffset4 + 2) = -ctemp03;
@@ -610,15 +610,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
boffset4 += 4;
}
}
-
+
if (m & 2){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
-
+
boffset1 = boffset;
boffset += 16;
-
+
i = (n >> 3);
if (i > 0){
do{
@@ -631,7 +631,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
aoffset1 += 8;
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -641,7 +641,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
aoffset2 += 8;
-
+
*(boffset1 + 0) = -ctemp01;
*(boffset1 + 1) = -ctemp02;
*(boffset1 + 2) = -ctemp03;
@@ -650,7 +650,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = -ctemp06;
*(boffset1 + 6) = -ctemp07;
*(boffset1 + 7) = -ctemp08;
-
+
*(boffset1 + 8) = -ctemp09;
*(boffset1 + 9) = -ctemp10;
*(boffset1 + 10) = -ctemp11;
@@ -659,25 +659,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 13) = -ctemp14;
*(boffset1 + 14) = -ctemp15;
*(boffset1 + 15) = -ctemp16;
-
+
boffset1 += 8 * m;
i --;
}while(i > 0);
}
-
+
if (n & 4){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
aoffset1 += 4;
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
aoffset2 += 4;
-
+
*(boffset2 + 0) = -ctemp01;
*(boffset2 + 1) = -ctemp02;
*(boffset2 + 2) = -ctemp03;
@@ -688,29 +688,29 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 7) = -ctemp08;
boffset2 += 8;
}
-
+
if (n & 2){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
aoffset1 += 2;
-
+
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
aoffset2 += 2;
-
+
*(boffset3 + 0) = -ctemp01;
*(boffset3 + 1) = -ctemp02;
*(boffset3 + 2) = -ctemp03;
*(boffset3 + 3) = -ctemp04;
boffset3 += 4;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
aoffset1 ++;
ctemp02 = *(aoffset2 + 0);
aoffset2 ++;
-
+
*(boffset4 + 0) = -ctemp01;
*(boffset4 + 1) = -ctemp02;
boffset4 += 2;
@@ -720,10 +720,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m & 1){
aoffset1 = aoffset;
aoffset += lda;
-
+
boffset1 = boffset;
boffset += 8;
-
+
i = (n >> 3);
if (i > 0){
do{
@@ -736,7 +736,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
aoffset1 += 8;
-
+
*(boffset1 + 0) = -ctemp01;
*(boffset1 + 1) = -ctemp02;
*(boffset1 + 2) = -ctemp03;
@@ -745,7 +745,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = -ctemp06;
*(boffset1 + 6) = -ctemp07;
*(boffset1 + 7) = -ctemp08;
-
+
boffset1 += 8 * m;
i --;
}while(i > 0);
@@ -774,7 +774,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset3 + 1) = -ctemp02;
boffset3 += 2;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
aoffset1 ++;
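The neg_*copy hunks above strip trailing whitespace only; the kernel logic is unchanged. These kernels pack an m-by-n panel of A into a contiguous buffer while negating every element, unrolled 8 wide with 4/2/1 tails. A minimal sketch of the essential operation, under the assumption of double precision and plain column-ordered output; neg_copy_ref is a hypothetical name, and the real ncopy/tcopy kernels additionally reorder the output into the blocked layout the GEMM microkernel expects:

    /* Pack an m x n panel of column-major A (leading dimension lda) into a
       contiguous buffer b, negating every element.  Simplified reference;
       the patched kernels do the same work with 8-way unrolling and a
       block-friendly output ordering. */
    static void neg_copy_ref(long m, long n, const double *a, long lda, double *b)
    {
        for (long j = 0; j < n; j++)
            for (long i = 0; i < m; i++)
                b[i + j * m] = -a[i + j * lda];
    }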
diff --git a/kernel/generic/symm_lcopy_1.c b/kernel/generic/symm_lcopy_1.c
index 7b6cfea..6ec51b8 100644
--- a/kernel/generic/symm_lcopy_1.c
+++ b/kernel/generic/symm_lcopy_1.c
@@ -50,14 +50,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
b[ 0] = data01;
diff --git a/kernel/generic/symm_lcopy_16.c b/kernel/generic/symm_lcopy_16.c
index 2c8ad81..477546f 100644
--- a/kernel/generic/symm_lcopy_16.c
+++ b/kernel/generic/symm_lcopy_16.c
@@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda;
@@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (offset > -13) ao14 += lda; else ao14 ++;
if (offset > -14) ao15 += lda; else ao15 ++;
if (offset > -15) ao16 += lda; else ao16 ++;
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 8) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda;
@@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao6 + 0);
data07 = *(ao7 + 0);
data08 = *(ao8 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
if (offset > -2) ao3 += lda; else ao3 ++;
@@ -188,7 +188,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda;
@@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
if (offset > -2) ao3 += lda; else ao3 ++;
@@ -223,7 +223,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
@@ -232,7 +232,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
@@ -250,14 +250,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
b[ 0] = data01;
diff --git a/kernel/generic/symm_lcopy_2.c b/kernel/generic/symm_lcopy_2.c
index e7944c4..2337d5c 100644
--- a/kernel/generic/symm_lcopy_2.c
+++ b/kernel/generic/symm_lcopy_2.c
@@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
@@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
@@ -79,14 +79,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
b[ 0] = data01;
diff --git a/kernel/generic/symm_lcopy_4.c b/kernel/generic/symm_lcopy_4.c
index ac04943..ca730e1 100644
--- a/kernel/generic/symm_lcopy_4.c
+++ b/kernel/generic/symm_lcopy_4.c
@@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda;
@@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
if (offset > -2) ao3 += lda; else ao3 ++;
@@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
@@ -96,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
@@ -115,14 +115,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
b[ 0] = data01;
diff --git a/kernel/generic/symm_lcopy_6.c b/kernel/generic/symm_lcopy_6.c
index ac04943..ca730e1 100644
--- a/kernel/generic/symm_lcopy_6.c
+++ b/kernel/generic/symm_lcopy_6.c
@@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda;
@@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
if (offset > -2) ao3 += lda; else ao3 ++;
@@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
@@ -96,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
@@ -115,14 +115,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
b[ 0] = data01;
diff --git a/kernel/generic/symm_lcopy_8.c b/kernel/generic/symm_lcopy_8.c
index c315574..11dae9a 100644
--- a/kernel/generic/symm_lcopy_8.c
+++ b/kernel/generic/symm_lcopy_8.c
@@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda;
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao6 + 0);
data07 = *(ao7 + 0);
data08 = *(ao8 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
if (offset > -2) ao3 += lda; else ao3 ++;
@@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda;
@@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
if (offset > -2) ao3 += lda; else ao3 ++;
@@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda;
@@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
if (offset > -1) ao2 += lda; else ao2 ++;
@@ -164,14 +164,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 += lda; else ao1 ++;
b[ 0] = data01;
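The symm_lcopy_* hunks above are likewise whitespace-only. These kernels expand a symmetric matrix stored in its lower triangle into a full dense panel for the SYMM kernel: for the element in global row r and column c they read A(r, c) when r >= c and the mirrored A(c, r) otherwise, which is what the offset test and the "ao += lda versus ao++" switch implement. A simplified reference with plain column-ordered output, assuming double precision and a hypothetical name symm_lcopy_ref; the wider kernels interleave their output per unroll width, and the symm_ucopy_* files below apply the mirrored test for upper-triangle storage:

    /* Pack an m x n panel of a symmetric matrix whose lower triangle is
       stored column-major in a (leading dimension lda).  The panel starts at
       global row posY and global column posX; b is filled column by column. */
    static void symm_lcopy_ref(long m, long n, const double *a, long lda,
                               long posX, long posY, double *b)
    {
        for (long js = 0; js < n; js++) {
            long c = posX + js;                    /* global column index */
            for (long i = 0; i < m; i++) {
                long r = posY + i;                 /* global row index    */
                *b++ = (r >= c) ? a[r + c * lda]   /* stored lower entry  */
                                : a[c + r * lda];  /* mirrored entry      */
            }
        }
    }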
diff --git a/kernel/generic/symm_ucopy_1.c b/kernel/generic/symm_ucopy_1.c
index 4ab9bb4..d87500f 100644
--- a/kernel/generic/symm_ucopy_1.c
+++ b/kernel/generic/symm_ucopy_1.c
@@ -50,14 +50,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/symm_ucopy_16.c b/kernel/generic/symm_ucopy_16.c
index 094810b..9b671db 100644
--- a/kernel/generic/symm_ucopy_16.c
+++ b/kernel/generic/symm_ucopy_16.c
@@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda;
@@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao14 + 0);
data15 = *(ao15 + 0);
data16 = *(ao16 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
if (offset > -2) ao3 ++; else ao3 += lda;
@@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 8) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda;
@@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao6 + 0);
data07 = *(ao7 + 0);
data08 = *(ao8 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
if (offset > -2) ao3 ++; else ao3 += lda;
@@ -189,7 +189,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda;
@@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
if (offset > -2) ao3 ++; else ao3 += lda;
@@ -224,7 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
@@ -233,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
@@ -245,20 +245,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/symm_ucopy_2.c b/kernel/generic/symm_ucopy_2.c
index 6396b74..56df894 100644
--- a/kernel/generic/symm_ucopy_2.c
+++ b/kernel/generic/symm_ucopy_2.c
@@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
@@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
@@ -78,14 +78,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/symm_ucopy_4.c b/kernel/generic/symm_ucopy_4.c
index 9b9cff8..6dbb861 100644
--- a/kernel/generic/symm_ucopy_4.c
+++ b/kernel/generic/symm_ucopy_4.c
@@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda;
@@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
if (offset > -2) ao3 ++; else ao3 += lda;
@@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
@@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
@@ -107,20 +107,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/symm_ucopy_6.c b/kernel/generic/symm_ucopy_6.c
index 9b9cff8..6dbb861 100644
--- a/kernel/generic/symm_ucopy_6.c
+++ b/kernel/generic/symm_ucopy_6.c
@@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda;
@@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
if (offset > -2) ao3 ++; else ao3 += lda;
@@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
@@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
@@ -107,20 +107,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/symm_ucopy_8.c b/kernel/generic/symm_ucopy_8.c
index 411768b..3da9385 100644
--- a/kernel/generic/symm_ucopy_8.c
+++ b/kernel/generic/symm_ucopy_8.c
@@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda;
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao6 + 0);
data07 = *(ao7 + 0);
data08 = *(ao8 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
if (offset > -2) ao3 ++; else ao3 += lda;
@@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda;
@@ -116,7 +116,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao2 + 0);
data03 = *(ao3 + 0);
data04 = *(ao4 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
if (offset > -2) ao3 ++; else ao3 += lda;
@@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda;
@@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
if (offset > -1) ao2 ++; else ao2 += lda;
@@ -159,20 +159,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda;
i = m;
while (i > 0) {
data01 = *(ao1 + 0);
-
+
if (offset > 0) ao1 ++; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/symv_k.c b/kernel/generic/symv_k.c
index bd882fe..c5817e7 100644
--- a/kernel/generic/symv_k.c
+++ b/kernel/generic/symv_k.c
@@ -72,15 +72,15 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
for(is = 0; is < offset; is += SYMV_P){
min_i = MIN(offset - is, SYMV_P);
#endif
-
+
#ifndef LOWER
if (is >0){
- GEMV_T(is, min_i, 0, alpha,
+ GEMV_T(is, min_i, 0, alpha,
a + is * lda, lda,
X, 1,
Y + is, 1, gemvbuffer);
- GEMV_N(is, min_i, 0, alpha,
+ GEMV_N(is, min_i, 0, alpha,
a + is * lda, lda,
X + is, 1,
Y, 1, gemvbuffer);
@@ -92,20 +92,20 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
#else
SYMCOPY_U(min_i, a + is + is * lda, lda, symbuffer);
#endif
-
- GEMV_N(min_i, min_i, 0, alpha,
+
+ GEMV_N(min_i, min_i, 0, alpha,
symbuffer, min_i,
- X + is, 1,
+ X + is, 1,
Y + is, 1, gemvbuffer);
#ifdef LOWER
if (m - is > min_i){
- GEMV_T(m - is - min_i, min_i, 0, alpha,
+ GEMV_T(m - is - min_i, min_i, 0, alpha,
a + (is + min_i) + is * lda, lda,
X + (is + min_i), 1,
Y + is, 1, gemvbuffer);
-
- GEMV_N(m - is - min_i, min_i, 0, alpha,
+
+ GEMV_N(m - is - min_i, min_i, 0, alpha,
a + (is + min_i) + is * lda, lda,
X + is, 1,
Y + (is + min_i), 1, gemvbuffer);
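The symv_k.c hunk only reflows whitespace in the blocked SYMV driver: the matrix is walked in SYMV_P-wide diagonal blocks, each diagonal block is expanded with SYMCOPY_L or SYMCOPY_U and applied with GEMV_N, and each off-diagonal panel is applied twice, once with GEMV_T and once with GEMV_N, which is how the symmetry is exploited using only GEMV kernels. For reference, the unblocked computation this driver implements, restricted to the lower triangle and assuming unit strides and a y already scaled by beta, is the classic loop:

    /* y := y + alpha * A * x for symmetric A whose lower triangle is stored
       column-major with leading dimension lda (reference, unit strides). */
    static void symv_l_ref(long n, double alpha, const double *a, long lda,
                           const double *x, double *y)
    {
        for (long j = 0; j < n; j++) {
            double temp1 = alpha * x[j];
            double temp2 = 0.0;
            y[j] += temp1 * a[j + j * lda];          /* diagonal term             */
            for (long i = j + 1; i < n; i++) {
                y[i]  += temp1 * a[i + j * lda];     /* A(i,j) * x(j)             */
                temp2 += a[i + j * lda] * x[i];      /* A(j,i) * x(i) by symmetry */
            }
            y[j] += alpha * temp2;
        }
    }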
diff --git a/kernel/generic/trmm_lncopy_1.c b/kernel/generic/trmm_lncopy_1.c
index 66e407f..542c4c3 100644
--- a/kernel/generic/trmm_lncopy_1.c
+++ b/kernel/generic/trmm_lncopy_1.c
@@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
ao1 += 1;
b += 1;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 1;
diff --git a/kernel/generic/trmm_lncopy_16.c b/kernel/generic/trmm_lncopy_16.c
index a183402..0795a83 100644
--- a/kernel/generic/trmm_lncopy_16.c
+++ b/kernel/generic/trmm_lncopy_16.c
@@ -88,13 +88,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a15 = a + posX + (posY + 14) * lda;
a16 = a + posX + (posY + 15) * lda;
}
-
+
i = (m >> 4);
if (i > 0) {
do {
if (X > posY) {
for (ii = 0; ii < 16; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
b[ 8] = *(a09 + 0);
b[ 9] = *(a10 + 0);
b[ 10] = *(a11 + 0);
@@ -112,7 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a14 + 0);
b[ 14] = *(a15 + 0);
b[ 15] = *(a16 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
@@ -131,7 +131,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a16 ++;
b += 16;
}
- } else
+ } else
if (X < posY) {
a01 += 16 * lda;
a02 += 16 * lda;
@@ -171,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = ZERO;
b[ 14] = ZERO;
b[ 15] = ZERO;
-
+
b[ 16] = *(a01 + 1);
#ifdef UNIT
b[ 17] = ONE;
@@ -504,7 +504,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a15 += 16;
a16 += 16;
b += 256;
-
+
}
X += 16;
@@ -514,10 +514,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 15);
if (i) {
-
+
if (X > posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -526,7 +526,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
b[ 8] = *(a09 + 0);
b[ 9] = *(a10 + 0);
b[ 10] = *(a11 + 0);
@@ -535,7 +535,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a14 + 0);
b[ 14] = *(a15 + 0);
b[ 15] = *(a16 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
@@ -554,7 +554,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a16 ++;
b += 16;
}
- } else
+ } else
if (X < posY) {
a01 += i * lda;
a02 += i * lda;
@@ -968,7 +968,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {
if (X > posY) {
for (ii = 0; ii < 8; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -977,7 +977,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
@@ -988,7 +988,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a08 ++;
b += 8;
}
- } else
+ } else
if (X < posY) {
a01 += 8 * lda;
a02 += 8 * lda;
@@ -1012,7 +1012,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = *(a01 + 1);
#ifdef UNIT
b[ 9] = ONE;
@@ -1122,10 +1122,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X > posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -1134,7 +1134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
@@ -1145,7 +1145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a08 ++;
b += 8;
}
- } else
+ } else
if (X < posY) {
a01 += i * lda;
a02 += i * lda;
@@ -1293,19 +1293,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {
if (X > posY) {
for (ii = 0; ii < 4; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
b[ 3] = *(a04 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
a04 ++;
b += 4;
}
- } else
+ } else
if (X < posY) {
a01 += 4 * lda;
a02 += 4 * lda;
@@ -1321,7 +1321,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = ZERO;
b[ 2] = ZERO;
b[ 3] = ZERO;
-
+
b[ 4] = *(a01 + 1);
#ifdef UNIT
b[ 5] = ONE;
@@ -1363,22 +1363,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
b[ 3] = *(a04 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
a04 ++;
b += 4;
}
- } else
+ } else
if (X < posY) {
a01 += i * lda;
a02 += i * lda;
@@ -1447,7 +1447,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += 2;
a02 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
a01 += 2 * lda;
a02 += 2 * lda;
@@ -1478,7 +1478,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X > posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
@@ -1486,7 +1486,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 ++;
a02 ++;
b += 2;
- } else
+ } else
if (X < posY) {
a01 += lda;
a02 += lda;
@@ -1520,7 +1520,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = *(a01 + 0);
a01 += 1;
b += 1;
- } else
+ } else
if (X < posY) {
a01 += lda;
b += 1;
diff --git a/kernel/generic/trmm_lncopy_2.c b/kernel/generic/trmm_lncopy_2.c
index f7fefaa..ed28b66 100644
--- a/kernel/generic/trmm_lncopy_2.c
+++ b/kernel/generic/trmm_lncopy_2.c
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data03;
b[ 2] = data02;
@@ -78,7 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -113,31 +113,31 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X > posY) {
data01 = *(ao1 + 0);
data03 = *(ao2 + 0);
-
+
b[ 0] = data01;
b[ 1] = data03;
ao1 += 1;
ao2 += 1;
b += 2;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
data03 = *(ao2 + 0);
-
+
b[ 0] = ONE;
b[ 1] = data03;
#else
data01 = *(ao1 + 0);
data03 = *(ao2 + 0);
-
+
b[ 0] = data01;
b[ 1] = data03;
#endif
@@ -171,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
ao1 += 1;
b += 1;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 1;
diff --git a/kernel/generic/trmm_lncopy_4.c b/kernel/generic/trmm_lncopy_4.c
index 6cd1667..0dcfb96 100644
--- a/kernel/generic/trmm_lncopy_4.c
+++ b/kernel/generic/trmm_lncopy_4.c
@@ -74,22 +74,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
@@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data10;
b[ 7] = data14;
-
+
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
@@ -107,14 +107,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data08;
b[14] = data12;
b[15] = data16;
-
+
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;
- } else
+ } else
if (X < posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -127,10 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data12 = *(ao3 + 3);
b[ 0] = ONE;
@@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ONE;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data03;
b[ 9] = data07;
b[10] = ONE;
@@ -155,16 +155,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -173,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
@@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
if (m & 2) {
@@ -225,7 +225,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 2;
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
@@ -236,28 +236,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
ao1 += 1;
ao2 += 1;
ao3 += 1;
ao4 += 1;
b += 4;
}
-
- } else
+
+ } else
if (X < posY) {
if (m & 2) {
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 8;
}
-
+
if (m & 1) {
ao1 += lda;
b += 4;
}
-
+
} else {
#ifdef UNIT
data05 = *(ao2 + 0);
@@ -272,13 +272,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i >= 3) {
data15 = *(ao4 + 2);
}
-
+
b[ 0] = ONE;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ONE;
@@ -286,7 +286,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data14;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -310,13 +310,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data11 = *(ao3 + 2);
data15 = *(ao4 + 2);
}
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = data06;
@@ -324,7 +324,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data14;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -361,7 +361,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data02;
@@ -371,7 +371,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
ao1 += 2;
ao2 += 2;
-
+
b += 4;
}
@@ -407,7 +407,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
@@ -417,7 +417,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 1;
ao2 += 1;
b += 2;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 2;
diff --git a/kernel/generic/trmm_lncopy_6.c b/kernel/generic/trmm_lncopy_6.c
index 6cd1667..0dcfb96 100644
--- a/kernel/generic/trmm_lncopy_6.c
+++ b/kernel/generic/trmm_lncopy_6.c
@@ -74,22 +74,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
@@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data10;
b[ 7] = data14;
-
+
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
@@ -107,14 +107,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data08;
b[14] = data12;
b[15] = data16;
-
+
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;
- } else
+ } else
if (X < posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -127,10 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data12 = *(ao3 + 3);
b[ 0] = ONE;
@@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ONE;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data03;
b[ 9] = data07;
b[10] = ONE;
@@ -155,16 +155,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -173,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
@@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
if (m & 2) {
@@ -225,7 +225,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 2;
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
@@ -236,28 +236,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
ao1 += 1;
ao2 += 1;
ao3 += 1;
ao4 += 1;
b += 4;
}
-
- } else
+
+ } else
if (X < posY) {
if (m & 2) {
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 8;
}
-
+
if (m & 1) {
ao1 += lda;
b += 4;
}
-
+
} else {
#ifdef UNIT
data05 = *(ao2 + 0);
@@ -272,13 +272,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i >= 3) {
data15 = *(ao4 + 2);
}
-
+
b[ 0] = ONE;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ONE;
@@ -286,7 +286,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data14;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -310,13 +310,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data11 = *(ao3 + 2);
data15 = *(ao4 + 2);
}
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = data06;
@@ -324,7 +324,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data14;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -361,7 +361,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data02;
@@ -371,7 +371,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
ao1 += 2;
ao2 += 2;
-
+
b += 4;
}
@@ -407,7 +407,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao2 + 0);
@@ -417,7 +417,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 1;
ao2 += 1;
b += 2;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 2;
diff --git a/kernel/generic/trmm_lncopy_8.c b/kernel/generic/trmm_lncopy_8.c
index 4a1964b..8f5fbce 100644
--- a/kernel/generic/trmm_lncopy_8.c
+++ b/kernel/generic/trmm_lncopy_8.c
@@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
@@ -111,7 +111,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
@@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data38 = *(ao5 + 5);
data39 = *(ao5 + 6);
data40 = *(ao5 + 7);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
@@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data46 = *(ao6 + 5);
data47 = *(ao6 + 6);
data48 = *(ao6 + 7);
-
+
data49 = *(ao7 + 0);
data50 = *(ao7 + 1);
data51 = *(ao7 + 2);
@@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data54 = *(ao7 + 5);
data55 = *(ao7 + 6);
data56 = *(ao7 + 7);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
@@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 64;
- } else
+ } else
if (X < posY) {
ao1 += 8 * lda;
ao2 += 8 * lda;
@@ -250,7 +250,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8 * lda;
ao7 += 8 * lda;
ao8 += 8 * lda;
-
+
b += 64;
} else {
@@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
#ifndef UNIT
data10 = *(ao2 + 1);
#endif
@@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
#ifndef UNIT
data19 = *(ao3 + 2);
#endif
@@ -283,7 +283,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
#ifndef UNIT
data28 = *(ao4 + 3);
#endif
@@ -291,25 +291,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
#ifndef UNIT
data37 = *(ao5 + 4);
#endif
data38 = *(ao5 + 5);
data39 = *(ao5 + 6);
data40 = *(ao5 + 7);
-
+
#ifndef UNIT
data46 = *(ao6 + 5);
#endif
data47 = *(ao6 + 6);
data48 = *(ao6 + 7);
-
+
#ifndef UNIT
data55 = *(ao7 + 6);
#endif
data56 = *(ao7 + 7);
-
+
#ifndef UNIT
data64 = *(ao8 + 7);
#endif
@@ -326,7 +326,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data02;
#ifdef UNIT
b[ 9] = ONE;
@@ -352,7 +352,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = ZERO;
b[22] = ZERO;
b[23] = ZERO;
-
+
b[24] = data04;
b[25] = data12;
b[26] = data20;
@@ -378,7 +378,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[37] = ZERO;
b[38] = ZERO;
b[39] = ZERO;
-
+
b[40] = data06;
b[41] = data14;
b[42] = data22;
@@ -417,7 +417,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
b[63] = data64;
#endif
-
+
ao1 += 8;
ao2 += 8;
ao3 += 8;
@@ -426,7 +426,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8;
ao7 += 8;
ao8 += 8;
-
+
b += 64;
}
@@ -437,7 +437,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X > posY) {
if (m & 4) {
@@ -445,42 +445,42 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
data36 = *(ao5 + 3);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
data44 = *(ao6 + 3);
-
+
data49 = *(ao7 + 0);
data50 = *(ao7 + 1);
data51 = *(ao7 + 2);
data52 = *(ao7 + 3);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
data60 = *(ao8 + 3);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
@@ -489,7 +489,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data41;
b[ 6] = data49;
b[ 7] = data57;
-
+
b[ 8] = data02;
b[ 9] = data10;
b[10] = data18;
@@ -498,7 +498,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data42;
b[14] = data50;
b[15] = data58;
-
+
b[16] = data03;
b[17] = data11;
b[18] = data19;
@@ -507,7 +507,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data43;
b[22] = data51;
b[23] = data59;
-
+
b[24] = data04;
b[25] = data12;
b[26] = data20;
@@ -525,35 +525,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 4;
ao7 += 4;
ao8 += 4;
-
+
b += 32;
}
-
+
if (m & 2) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
-
+
data49 = *(ao7 + 0);
data50 = *(ao7 + 1);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
@@ -562,7 +562,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data41;
b[ 6] = data49;
b[ 7] = data57;
-
+
b[ 8] = data02;
b[ 9] = data10;
b[10] = data18;
@@ -571,7 +571,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data42;
b[14] = data50;
b[15] = data58;
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
@@ -580,10 +580,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 2;
ao7 += 2;
ao8 += 2;
-
+
b += 16;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data09 = *(ao2 + 0);
@@ -593,7 +593,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data41 = *(ao6 + 0);
data49 = *(ao7 + 0);
data57 = *(ao8 + 0);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
@@ -602,25 +602,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data41;
b[ 6] = data49;
b[ 7] = data57;
-
+
b += 8;
}
- } else
+ } else
if (X < posY) {
if (m & 4) {
ao1 += 4 * lda;
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 32;
}
-
+
if (m & 2) {
ao1 += 2 * lda;
b += 16;
}
-
+
if (m & 1) {
b += 8;
}
@@ -659,7 +659,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
}
-
+
if (i >= 4) {
#ifndef UNIT
data28 = *(ao4 + 3);
@@ -707,7 +707,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = ZERO;
b[ 7] = ZERO;
b += 8;
-
+
if(i >= 2) {
b[ 0] = data02;
#ifdef UNIT
@@ -723,7 +723,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = data03;
b[ 1] = data11;
@@ -739,8 +739,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
- if (i >= 4) {
+
+ if (i >= 4) {
b[ 0] = data04;
b[ 1] = data12;
b[ 2] = data20;
@@ -771,7 +771,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
+
if (i >= 6) {
b[ 0] = data06;
b[ 1] = data14;
@@ -835,37 +835,37 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
b[ 3] = data25;
-
+
b[ 4] = data02;
b[ 5] = data10;
b[ 6] = data18;
b[ 7] = data26;
-
+
b[ 8] = data03;
b[ 9] = data11;
b[10] = data19;
b[11] = data27;
-
+
b[12] = data04;
b[13] = data12;
b[14] = data20;
@@ -878,7 +878,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 16;
- } else
+ } else
if (X < posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -957,7 +957,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
ao3 += 4;
ao4 += 4;
-
+
b += 16;
}
@@ -968,60 +968,60 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
if (m & 2) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
b[ 3] = data25;
-
+
b[ 4] = data02;
b[ 5] = data10;
b[ 6] = data18;
b[ 7] = data26;
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
-
+
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data09 = *(ao2 + 0);
data17 = *(ao3 + 0);
data25 = *(ao4 + 0);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
b[ 3] = data25;
-
+
b += 4;
}
- } else
+ } else
if (X < posY) {
if (m & 2) {
ao1 += 2 * lda;
b += 8;
}
-
+
if (m & 1) {
b += 4;
}
@@ -1049,7 +1049,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
data20 = *(ao3 + 3);
}
-
+
#ifdef UNIT
b[ 0] = ONE;
#else
@@ -1059,7 +1059,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = ZERO;
b[ 3] = ZERO;
b += 4;
-
+
if(i >= 2) {
b[ 0] = data02;
#ifdef UNIT
@@ -1071,7 +1071,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = ZERO;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = data03;
b[ 1] = data11;
@@ -1109,7 +1109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data02;
@@ -1119,7 +1119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -1156,15 +1156,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X > posY) {
data01 = *(ao1 + 0);
data09 = *(ao2 + 0);
-
+
b[ 0] = data01;
b[ 1] = data09;
b += 2;
- } else
+ } else
if (X < posY) {
b += 2;
} else {
@@ -1201,7 +1201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
ao1 += 1;
b += 1;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 1;
diff --git a/kernel/generic/trmm_ltcopy_1.c b/kernel/generic/trmm_ltcopy_1.c
index ab5e9d8..d79f1a7 100644
--- a/kernel/generic/trmm_ltcopy_1.c
+++ b/kernel/generic/trmm_ltcopy_1.c
@@ -58,11 +58,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m;
if (i > 0) {
do {
-
+
if (X > posY) {
ao1 += 1;
b += 1;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
diff --git a/kernel/generic/trmm_ltcopy_16.c b/kernel/generic/trmm_ltcopy_16.c
index 0598de8..b8469d0 100644
--- a/kernel/generic/trmm_ltcopy_16.c
+++ b/kernel/generic/trmm_ltcopy_16.c
@@ -110,11 +110,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a15 += 16;
a16 += 16;
b += 256;
- } else
+ } else
if (X < posY) {
for (ii = 0; ii < 16; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -123,7 +123,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
a01 += lda;
b += 16;
}
@@ -152,7 +152,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a14 += 16 * lda;
a15 += 16 * lda;
a16 += 16 * lda;
-
+
} else {
#ifdef UNIT
b[ 0] = ONE;
@@ -174,7 +174,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
b[ 16] = ZERO;
#ifdef UNIT
b[ 17] = ONE;
@@ -506,7 +506,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a14 += 16;
a15 += 16;
a16 += 16;
-
+
b += 256;
}
@@ -535,11 +535,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a15 += i;
a16 += i;
b += 16 * i;
- } else
+ } else
if (X < posY) {
-
+
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -548,7 +548,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -557,7 +557,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
a01 += lda;
a02 += lda;
a03 += lda;
@@ -598,7 +598,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
b += 16;
-
+
if (i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -622,7 +622,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = *(a02 + 15);
b += 16;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -965,7 +965,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 = a + posX + (posY + 6) * lda;
a08 = a + posX + (posY + 7) * lda;
}
-
+
i = (m >> 3);
if (i > 0) {
do {
@@ -979,9 +979,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += 8;
a08 += 8;
b += 64;
- } else
+ } else
if (X < posY) {
-
+
for (ii = 0; ii < 8; ii++){
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -1042,7 +1042,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 21] = *(a03 + 5);
b[ 22] = *(a03 + 6);
b[ 23] = *(a03 + 7);
-
+
b[ 24] = ZERO;
b[ 25] = ZERO;
b[ 26] = ZERO;
@@ -1081,7 +1081,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
b[ 46] = *(a06 + 6);
b[ 47] = *(a06 + 7);
-
+
b[ 48] = ZERO;
b[ 49] = ZERO;
b[ 50] = ZERO;
@@ -1094,7 +1094,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 54] = *(a07 + 6);
#endif
b[ 55] = *(a07 + 7);
-
+
b[ 56] = ZERO;
b[ 57] = ZERO;
b[ 58] = ZERO;
@@ -1117,7 +1117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += 8;
a08 += 8;
b += 64;
-
+
}
X += 8;
@@ -1137,11 +1137,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += i;
a08 += i;
b += 8 * i;
- } else
+ } else
if (X < posY) {
-
+
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -1150,7 +1150,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
a01 += lda;
a02 += lda;
a03 += lda;
@@ -1175,7 +1175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
b += 8;
-
+
if (i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -1191,7 +1191,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = *(a02 + 7);
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -1290,7 +1290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 = a + posX + (posY + 2) * lda;
a04 = a + posX + (posY + 3) * lda;
}
-
+
i = (m >> 2);
if (i > 0) {
do {
@@ -1300,9 +1300,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 += 4;
a04 += 4;
b += 16;
- } else
+ } else
if (X < posY) {
-
+
for (ii = 0; ii < 4; ii++){
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -1343,7 +1343,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 10] = *(a03 + 2);
#endif
b[ 11] = *(a03 + 3);
-
+
b[ 12] = ZERO;
b[ 13] = ZERO;
b[ 14] = ZERO;
@@ -1359,12 +1359,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a04 += 4;
b += 16;
}
-
+
X += 4;
i --;
} while (i > 0);
}
-
+
i = (m & 3);
if (i > 0) {
if (X > posY) {
@@ -1373,11 +1373,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 += i;
a04 += i;
b += 4 * i;
- } else
+ } else
if (X < posY) {
-
+
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -1390,7 +1390,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 4;
}
} else {
-
+
#ifdef UNIT
b[ 0] = ONE;
#else
@@ -1400,7 +1400,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = *(a01 + 2);
b[ 3] = *(a01 + 3);
b += 4;
-
+
if (i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -1412,7 +1412,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = *(a02 + 3);
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -1439,7 +1439,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 = a + posX + (posY + 0) * lda;
a02 = a + posX + (posY + 1) * lda;
}
-
+
i = (m >> 1);
if (i > 0) {
do {
@@ -1447,7 +1447,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += 2;
a02 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -1475,18 +1475,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a02 += 2;
b += 4;
}
-
+
X += 2;
i --;
} while (i > 0);
}
-
+
if (m & 1) {
if (X > posY) {
a01 ++;
a02 ++;
b += 2;
- } else
+ } else
if (X < posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -1514,15 +1514,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} else {
a01 = a + posX + (posY + 0) * lda;
}
-
+
i = m;
if (i > 0) {
do {
-
+
if (X > posY) {
b ++;
a01 ++;
- } else
+ } else
if (X < posY) {
b[ 0] = *(a01 + 0);
a01 += lda;
diff --git a/kernel/generic/trmm_ltcopy_2.c b/kernel/generic/trmm_ltcopy_2.c
index 098e16f..e9ad45f 100644
--- a/kernel/generic/trmm_ltcopy_2.c
+++ b/kernel/generic/trmm_ltcopy_2.c
@@ -68,13 +68,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -114,16 +114,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X > posY) {
ao1 += 1;
ao2 += 1;
b += 2;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
@@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
#endif
@@ -164,11 +164,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m;
if (i > 0) {
do {
-
+
if (X > posY) {
ao1 += 1;
b += 1;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
diff --git a/kernel/generic/trmm_ltcopy_4.c b/kernel/generic/trmm_ltcopy_4.c
index 69a233b..66a7325 100644
--- a/kernel/generic/trmm_ltcopy_4.c
+++ b/kernel/generic/trmm_ltcopy_4.c
@@ -76,28 +76,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 4;
b += 16;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -128,12 +128,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data12 = *(ao3 + 3);
-
+
b[ 0] = ONE;
b[ 1] = data02;
b[ 2] = data03;
@@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ONE;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = ONE;
@@ -158,14 +158,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data16 = *(ao4 + 3);
b[ 0] = data01;
@@ -176,7 +176,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = data11;
@@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
if (m & 2) {
@@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 2;
b += 8;
}
-
+
if (m & 1) {
ao1 += 1;
ao2 += 1;
@@ -218,8 +218,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 1;
b += 4;
}
-
- } else
+
+ } else
if (X < posY) {
if (m & 2) {
data01 = *(ao1 + 0);
@@ -230,7 +230,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -239,28 +239,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
ao1 += lda;
b += 4;
}
-
+
} else {
#ifdef UNIT
@@ -276,13 +276,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i >= 3) {
data12 = *(ao3 + 3);
}
-
+
b[ 0] = ONE;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ONE;
@@ -290,7 +290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data08;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -314,13 +314,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
}
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = data06;
@@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data08;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -365,7 +365,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -410,17 +410,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X > posY) {
ao1 += 1;
ao2 += 1;
-
+
b += 2;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
diff --git a/kernel/generic/trmm_ltcopy_6.c b/kernel/generic/trmm_ltcopy_6.c
index 69a233b..66a7325 100644
--- a/kernel/generic/trmm_ltcopy_6.c
+++ b/kernel/generic/trmm_ltcopy_6.c
@@ -76,28 +76,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 4;
b += 16;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -128,12 +128,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data12 = *(ao3 + 3);
-
+
b[ 0] = ONE;
b[ 1] = data02;
b[ 2] = data03;
@@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ONE;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = ONE;
@@ -158,14 +158,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data16 = *(ao4 + 3);
b[ 0] = data01;
@@ -176,7 +176,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = data11;
@@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
if (m & 2) {
@@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 2;
b += 8;
}
-
+
if (m & 1) {
ao1 += 1;
ao2 += 1;
@@ -218,8 +218,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 1;
b += 4;
}
-
- } else
+
+ } else
if (X < posY) {
if (m & 2) {
data01 = *(ao1 + 0);
@@ -230,7 +230,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -239,28 +239,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
ao1 += lda;
b += 4;
}
-
+
} else {
#ifdef UNIT
@@ -276,13 +276,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i >= 3) {
data12 = *(ao3 + 3);
}
-
+
b[ 0] = ONE;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ONE;
@@ -290,7 +290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data08;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -314,13 +314,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
}
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = data06;
@@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data08;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -365,7 +365,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -410,17 +410,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X > posY) {
ao1 += 1;
ao2 += 1;
-
+
b += 2;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
diff --git a/kernel/generic/trmm_ltcopy_8.c b/kernel/generic/trmm_ltcopy_8.c
index 64954da..1012728 100644
--- a/kernel/generic/trmm_ltcopy_8.c
+++ b/kernel/generic/trmm_ltcopy_8.c
@@ -96,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 64;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -133,7 +133,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
@@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data38 = *(ao5 + 5);
data39 = *(ao5 + 6);
data40 = *(ao5 + 7);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
@@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data54 = *(ao7 + 5);
data55 = *(ao7 + 6);
data56 = *(ao7 + 7);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
@@ -169,7 +169,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data62 = *(ao8 + 5);
data63 = *(ao8 + 6);
data64 = *(ao8 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -178,7 +178,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -196,7 +196,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data23;
b[23] = data24;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -214,7 +214,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[37] = data38;
b[38] = data39;
b[39] = data40;
-
+
b[40] = data41;
b[41] = data42;
b[42] = data43;
@@ -241,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[61] = data62;
b[62] = data63;
b[63] = data64;
-
+
ao1 += 8 * lda;
ao2 += 8 * lda;
ao3 += 8 * lda;
@@ -250,7 +250,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8 * lda;
ao7 += 8 * lda;
ao8 += 8 * lda;
-
+
b += 64;
} else {
@@ -265,7 +265,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
#ifndef UNIT
data10 = *(ao2 + 1);
#endif
@@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
#ifndef UNIT
data28 = *(ao4 + 3);
#endif
@@ -292,14 +292,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
#ifndef UNIT
data37 = *(ao5 + 4);
#endif
data38 = *(ao5 + 5);
data39 = *(ao5 + 6);
data40 = *(ao5 + 7);
-
+
#ifndef UNIT
data46 = *(ao6 + 5);
#endif
@@ -310,7 +310,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data55 = *(ao7 + 6);
#endif
data56 = *(ao7 + 7);
-
+
#ifndef UNIT
data64 = *(ao8 + 7);
#endif
@@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = ZERO;
#ifdef UNIT
b[ 9] = ONE;
@@ -354,7 +354,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data23;
b[23] = data24;
-
+
b[24] = ZERO;
b[25] = ZERO;
b[26] = ZERO;
@@ -380,7 +380,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[37] = data38;
b[38] = data39;
b[39] = data40;
-
+
b[40] = ZERO;
b[41] = ZERO;
b[42] = ZERO;
@@ -419,7 +419,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
b[63] = data64;
#endif
-
+
ao1 += 8;
ao2 += 8;
ao3 += 8;
@@ -428,7 +428,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8;
ao7 += 8;
ao8 += 8;
-
+
b += 64;
}
@@ -439,7 +439,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X > posY) {
if (m & 4) {
@@ -451,10 +451,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 4;
ao7 += 4;
ao8 += 4;
-
+
b += 32;
}
-
+
if (m & 2) {
ao1 += 2;
ao2 += 2;
@@ -464,14 +464,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 2;
ao7 += 2;
ao8 += 2;
-
+
b += 16;
}
-
+
if (m & 1) {
b += 8;
}
- } else
+ } else
if (X < posY) {
if (m & 4) {
data01 = *(ao1 + 0);
@@ -482,7 +482,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -491,7 +491,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
@@ -500,7 +500,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -509,7 +509,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -527,7 +527,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
b[16] = data17;
b[17] = data18;
b[18] = data19;
@@ -536,7 +536,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data23;
b[23] = data24;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -545,15 +545,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[29] = data30;
b[30] = data31;
b[31] = data32;
-
+
ao1 += 4 * lda;
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 32;
}
-
+
if (m & 2) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -563,7 +563,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -572,7 +572,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -581,7 +581,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -590,11 +590,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
ao1 += 2 * lda;
b += 16;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -613,7 +613,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b += 8;
}
} else {
@@ -650,7 +650,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
}
-
+
if (i >= 4) {
#ifndef UNIT
data28 = *(ao4 + 3);
@@ -698,7 +698,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = data07;
b[ 7] = data08;
b += 8;
-
+
if(i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -714,7 +714,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data16;
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -730,8 +730,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data24;
b += 8;
}
-
- if (i >= 4) {
+
+ if (i >= 4) {
b[ 0] = ZERO;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -762,7 +762,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data40;
b += 8;
}
-
+
if (i >= 6) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -829,7 +829,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 16;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -855,7 +855,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -949,7 +949,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
ao3 += 4;
ao4 += 4;
-
+
b += 16;
}
@@ -960,7 +960,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
if (m & 2) {
@@ -968,14 +968,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
ao3 += 2;
ao4 += 2;
-
+
b += 8;
}
-
+
if (m & 1) {
b += 4;
}
- } else
+ } else
if (X < posY) {
if (m & 2) {
data01 = *(ao1 + 0);
@@ -987,7 +987,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -996,11 +996,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data10;
b[ 6] = data11;
b[ 7] = data12;
-
+
ao1 += 2 * lda;
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -1011,7 +1011,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
b += 4;
}
} else {
@@ -1038,7 +1038,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
data20 = *(ao3 + 3);
}
-
+
#ifdef UNIT
b[ 0] = ONE;
#else
@@ -1048,7 +1048,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = data03;
b[ 3] = data04;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -1060,7 +1060,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data12;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -1097,7 +1097,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -1106,7 +1106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -1147,10 +1147,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X > posY) {
b += 2;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -1190,11 +1190,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X > posY) {
ao1 += 1;
b += 1;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
ao1 += lda;
-
+
b[ 0] = data01;
b += 1;
diff --git a/kernel/generic/trmm_uncopy_1.c b/kernel/generic/trmm_uncopy_1.c
index 6e75c2f..f77c310 100644
--- a/kernel/generic/trmm_uncopy_1.c
+++ b/kernel/generic/trmm_uncopy_1.c
@@ -48,17 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (n > 0) {
X = posX;
-
+
if (posX <= posY) {
ao1 = a + posX + (posY + 0) * lda;
} else {
ao1 = a + posY + (posX + 0) * lda;
}
-
+
i = m;
if (m > 0) {
do {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
@@ -78,7 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 1;
ao1 += lda;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/trmm_uncopy_16.c b/kernel/generic/trmm_uncopy_16.c
index 6325a26..19b2fdd 100644
--- a/kernel/generic/trmm_uncopy_16.c
+++ b/kernel/generic/trmm_uncopy_16.c
@@ -88,13 +88,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a15 = a + posY + (posX + 14) * lda;
a16 = a + posY + (posX + 15) * lda;
}
-
+
i = (m >> 4);
if (i > 0) {
do {
if (X < posY) {
for (ii = 0; ii < 16; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
b[ 8] = *(a09 + 0);
b[ 9] = *(a10 + 0);
b[ 10] = *(a11 + 0);
@@ -112,7 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a14 + 0);
b[ 14] = *(a15 + 0);
b[ 15] = *(a16 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
@@ -131,7 +131,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a16 ++;
b += 16;
}
- } else
+ } else
if (X > posY) {
a01 += 16 * lda;
a02 += 16 * lda;
@@ -171,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a14 + 0);
b[ 14] = *(a15 + 0);
b[ 15] = *(a16 + 0);
-
+
b[ 16] = ZERO;
#ifdef UNIT
b[ 17] = ONE;
@@ -503,7 +503,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a14 += 16 * lda;
a15 += 16 * lda;
a16 += 16 * lda;
-
+
b += 256;
}
@@ -514,10 +514,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 15);
if (i) {
-
+
if (X < posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -526,7 +526,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
b[ 8] = *(a09 + 0);
b[ 9] = *(a10 + 0);
b[ 10] = *(a11 + 0);
@@ -535,7 +535,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a14 + 0);
b[ 14] = *(a15 + 0);
b[ 15] = *(a16 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
@@ -554,7 +554,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a16 ++;
b += 16;
}
- } else
+ } else
if (X > posY) {
a01 += i * lda;
a02 += i * lda;
@@ -595,7 +595,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 14] = *(a15 + 0);
b[ 15] = *(a16 + 0);
b += 16;
-
+
if (i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -739,7 +739,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 15] = *(a16 + 6);
b += 16;
}
-
+
if (i >= 8) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -968,7 +968,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {
if (X < posY) {
for (ii = 0; ii < 8; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -977,7 +977,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
@@ -988,7 +988,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a08 ++;
b += 8;
}
- } else
+ } else
if (X > posY) {
a01 += 8 * lda;
a02 += 8 * lda;
@@ -1012,7 +1012,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
b[ 8] = ZERO;
#ifdef UNIT
b[ 9] = ONE;
@@ -1122,10 +1122,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X < posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -1134,7 +1134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a06 + 0);
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
@@ -1145,7 +1145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a08 ++;
b += 8;
}
- } else
+ } else
if (X > posY) {
a01 += i * lda;
a02 += i * lda;
@@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = *(a07 + 0);
b[ 7] = *(a08 + 0);
b += 8;
-
+
if (i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -1292,7 +1292,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {
if (X < posY) {
for (ii = 0; ii < 4; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
@@ -1304,7 +1304,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a04 ++;
b += 4;
}
- } else
+ } else
if (X > posY) {
a01 += 4 * lda;
a02 += 4 * lda;
@@ -1320,7 +1320,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
b[ 3] = *(a04 + 0);
-
+
b[ 4] = ZERO;
#ifdef UNIT
b[ 5] = ONE;
@@ -1362,22 +1362,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
b[ 2] = *(a03 + 0);
b[ 3] = *(a04 + 0);
-
+
a01 ++;
a02 ++;
a03 ++;
a04 ++;
b += 4;
}
- } else
+ } else
if (X > posY) {
a01 += i * lda;
a02 += i * lda;
@@ -1394,7 +1394,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = *(a03 + 0);
b[ 3] = *(a04 + 0);
b += 4;
-
+
if (i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -1443,11 +1443,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = *(a02 + 0);
b[ 2] = *(a01 + 1);
b[ 3] = *(a02 + 1);
-
+
a01 += 2;
a02 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
a01 += 2 * lda;
a02 += 2 * lda;
@@ -1459,7 +1459,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = *(a01 + 0);
#endif
b[ 1] = *(a02 + 0);
-
+
b[ 2] = ZERO;
#ifdef UNIT
b[ 3] = ONE;
@@ -1478,15 +1478,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X < posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a02 + 0);
-
+
a01 ++;
a02 ++;
b += 2;
- } else
+ } else
if (X > posY) {
a01 += lda;
a02 += lda;
@@ -1520,7 +1520,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = *(a01 + 0);
a01 += 1;
b += 1;
- } else
+ } else
if (X > posY) {
a01 += lda;
b += 1;
diff --git a/kernel/generic/trmm_uncopy_2.c b/kernel/generic/trmm_uncopy_2.c
index 1b6d235..61303a2 100644
--- a/kernel/generic/trmm_uncopy_2.c
+++ b/kernel/generic/trmm_uncopy_2.c
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data03;
b[ 2] = data02;
@@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -114,18 +114,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
data03 = *(ao2 + 0);
-
+
b[ 0] = data01;
b[ 1] = data03;
-
+
ao1 += 1;
ao2 += 1;
b += 2;
- } else
+ } else
if (X > posY) {
ao1 += lda;
b += 2;
@@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data01 = *(ao1 + 0);
data03 = *(ao2 + 0);
-
+
b[ 0] = data01;
b[ 1] = data03;
#endif
@@ -154,17 +154,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1){
X = posX;
-
+
if (posX <= posY) {
ao1 = a + posX + (posY + 0) * lda;
} else {
ao1 = a + posY + (posX + 0) * lda;
}
-
+
i = m;
if (m > 0) {
do {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
@@ -184,7 +184,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 1;
ao1 += lda;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/trmm_uncopy_4.c b/kernel/generic/trmm_uncopy_4.c
index 4ff6948..0218a0e 100644
--- a/kernel/generic/trmm_uncopy_4.c
+++ b/kernel/generic/trmm_uncopy_4.c
@@ -74,22 +74,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
@@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data10;
b[ 7] = data14;
-
+
b[ 8] = data03;
b[ 9] = data07;
b[10] = data11;
@@ -107,13 +107,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data08;
b[14] = data12;
b[15] = data16;
-
+
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;
- } else
+ } else
if (X > posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -124,14 +124,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} else {
#ifdef UNIT
data05 = *(ao2 + 0);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
-
+
b[ 0] = ONE;
b[ 1] = data05;
b[ 2] = data09;
@@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ONE;
b[ 6] = data10;
b[ 7] = data14;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = ONE;
@@ -153,19 +153,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = ONE;
#else
data01 = *(ao1 + 0);
-
+
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
@@ -175,7 +175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data10;
b[ 7] = data14;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = data11;
@@ -190,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
ao3 += 4;
ao4 += 4;
-
+
b += 16;
}
@@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
if (m & 2) {
@@ -222,14 +222,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data04;
b[ 6] = data06;
b[ 7] = data08;
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data03 = *(ao2 + 0);
@@ -247,20 +247,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 1;
b += 4;
}
-
- } else
+
+ } else
if (X > posY) {
if (m & 2) {
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
}
-
+
if (m & 1) {
ao1 += lda;
b += 4;
}
-
+
} else {
#ifdef UNIT
data05 = *(ao2 + 0);
@@ -275,13 +275,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i >= 3) {
data15 = *(ao4 + 2);
}
-
+
b[ 0] = ONE;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ONE;
@@ -289,7 +289,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data14;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -313,13 +313,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data11 = *(ao3 + 2);
data15 = *(ao4 + 2);
}
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data09;
b[ 3] = data13;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = data06;
@@ -327,7 +327,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data14;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -363,17 +363,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data05;
b[ 2] = data02;
b[ 3] = data06;
-
+
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -400,7 +400,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 4;
}
@@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
data05 = *(ao2 + 0);
@@ -421,7 +421,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 1;
ao2 += 1;
b += 2;
- } else
+ } else
if (X > posY) {
ao1 += lda;
ao2 += lda;
@@ -478,7 +478,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += lda;
b += 1;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/trmm_uncopy_6.c b/kernel/generic/trmm_uncopy_6.c
index 70945a2..4878f3f 100644
--- a/kernel/generic/trmm_uncopy_6.c
+++ b/kernel/generic/trmm_uncopy_6.c
@@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao5 += 6;
ao6 += 6;
b += 36;
- } else
+ } else
if (X > posY) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -287,7 +287,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[33] = ZERO;
b[34] = ZERO;
b[35] = ONE;
-#else
+#else
b[ 0] = data01;
b[ 1] = data07;
b[ 2] = data13;
@@ -390,7 +390,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao3 += 4;
ao4 += 4;
b += 16;
- } else
+ } else
if (X > posY) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -544,7 +544,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 4;
}
- } else
+ } else
if (X > posY) {
if (m & 2) {
ao1 += 2 * lda;
@@ -669,7 +669,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -717,7 +717,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 1;
ao2 += 1;
b += 2;
- } else
+ } else
if (X > posY) {
ao1 += lda;
ao2 += lda;
diff --git a/kernel/generic/trmm_uncopy_8.c b/kernel/generic/trmm_uncopy_8.c
index 4e23ffc..ecfefd0 100644
--- a/kernel/generic/trmm_uncopy_8.c
+++ b/kernel/generic/trmm_uncopy_8.c
@@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao7 = a + posY + (posX + 6) * lda;
ao8 = a + posY + (posX + 7) * lda;
}
-
+
i = (m >> 3);
if (i > 0) {
do {
@@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -111,7 +111,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
@@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data38 = *(ao5 + 5);
data39 = *(ao5 + 6);
data40 = *(ao5 + 7);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
@@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data54 = *(ao7 + 5);
data55 = *(ao7 + 6);
data56 = *(ao7 + 7);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
@@ -156,7 +156,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data62 = *(ao8 + 5);
data63 = *(ao8 + 6);
data64 = *(ao8 + 7);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
@@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 64;
- } else
+ } else
if (X > posY) {
ao1 += 8 * lda;
ao2 += 8 * lda;
@@ -250,7 +250,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8 * lda;
ao7 += 8 * lda;
ao8 += 8 * lda;
-
+
b += 64;
} else {
@@ -258,12 +258,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifndef UNIT
data01 = *(ao1 + 0);
#endif
-
+
data09 = *(ao2 + 0);
#ifndef UNIT
data10 = *(ao2 + 1);
#endif
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
#ifndef UNIT
@@ -276,7 +276,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifndef UNIT
data28 = *(ao4 + 3);
#endif
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
@@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifndef UNIT
data37 = *(ao5 + 4);
#endif
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
@@ -293,7 +293,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifndef UNIT
data46 = *(ao6 + 5);
#endif
-
+
data49 = *(ao7 + 0);
data50 = *(ao7 + 1);
data51 = *(ao7 + 2);
@@ -303,7 +303,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifndef UNIT
data55 = *(ao7 + 6);
#endif
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
@@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifndef UNIT
data64 = *(ao8 + 7);
#endif
-
+
#ifdef UNIT
b[ 0] = ONE;
@@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data41;
b[ 6] = data49;
b[ 7] = data57;
-
+
b[ 8] = ZERO;
#ifdef UNIT
b[ 9] = ONE;
@@ -354,7 +354,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data43;
b[22] = data51;
b[23] = data59;
-
+
b[24] = ZERO;
b[25] = ZERO;
b[26] = ZERO;
@@ -380,7 +380,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[37] = data45;
b[38] = data53;
b[39] = data61;
-
+
b[40] = ZERO;
b[41] = ZERO;
b[42] = ZERO;
@@ -419,7 +419,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
b[63] = data64;
#endif
-
+
ao1 += 8 * lda;
ao2 += 8 * lda;
ao3 += 8 * lda;
@@ -428,7 +428,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8 * lda;
ao7 += 8 * lda;
ao8 += 8 * lda;
-
+
b += 64;
}
@@ -439,7 +439,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X < posY) {
if (m & 4) {
@@ -447,42 +447,42 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
data36 = *(ao5 + 3);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
data44 = *(ao6 + 3);
-
+
data49 = *(ao7 + 0);
data50 = *(ao7 + 1);
data51 = *(ao7 + 2);
data52 = *(ao7 + 3);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
data60 = *(ao8 + 3);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
@@ -491,7 +491,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data41;
b[ 6] = data49;
b[ 7] = data57;
-
+
b[ 8] = data02;
b[ 9] = data10;
b[10] = data18;
@@ -500,7 +500,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data42;
b[14] = data50;
b[15] = data58;
-
+
b[16] = data03;
b[17] = data11;
b[18] = data19;
@@ -509,7 +509,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data43;
b[22] = data51;
b[23] = data59;
-
+
b[24] = data04;
b[25] = data12;
b[26] = data20;
@@ -527,35 +527,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 4;
ao7 += 4;
ao8 += 4;
-
+
b += 32;
}
-
+
if (m & 2) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
-
+
data49 = *(ao7 + 0);
data50 = *(ao7 + 1);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
@@ -564,7 +564,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data41;
b[ 6] = data49;
b[ 7] = data57;
-
+
b[ 8] = data02;
b[ 9] = data10;
b[10] = data18;
@@ -573,7 +573,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data42;
b[14] = data50;
b[15] = data58;
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
@@ -582,10 +582,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 2;
ao7 += 2;
ao8 += 2;
-
+
b += 16;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data09 = *(ao2 + 0);
@@ -595,7 +595,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data41 = *(ao6 + 0);
data49 = *(ao7 + 0);
data57 = *(ao8 + 0);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
@@ -604,25 +604,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data41;
b[ 6] = data49;
b[ 7] = data57;
-
+
b += 8;
}
- } else
+ } else
if (X > posY) {
if (m & 4) {
ao1 += 4 * lda;
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 32;
}
-
+
if (m & 2) {
ao1 += 2 * lda;
b += 16;
}
-
+
if (m & 1) {
b += 8;
}
@@ -661,7 +661,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data51 = *(ao7 + 2);
data59 = *(ao8 + 2);
}
-
+
if (i >= 4) {
#ifndef UNIT
data28 = *(ao4 + 3);
@@ -709,7 +709,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = data49;
b[ 7] = data57;
b += 8;
-
+
if(i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -725,7 +725,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data58;
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -741,8 +741,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data59;
b += 8;
}
-
- if (i >= 4) {
+
+ if (i >= 4) {
b[ 0] = ZERO;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -773,7 +773,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data61;
b += 8;
}
-
+
if (i >= 6) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -837,37 +837,37 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
b[ 3] = data25;
-
+
b[ 4] = data02;
b[ 5] = data10;
b[ 6] = data18;
b[ 7] = data26;
-
+
b[ 8] = data03;
b[ 9] = data11;
b[10] = data19;
b[11] = data27;
-
+
b[12] = data04;
b[13] = data12;
b[14] = data20;
@@ -880,7 +880,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 16;
- } else
+ } else
if (X > posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -906,7 +906,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data25;
b[ 4] = ZERO;
- b[ 5] = ONE;
+ b[ 5] = ONE;
b[ 6] = data18;
b[ 7] = data26;
@@ -958,7 +958,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 16;
}
@@ -969,60 +969,60 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
if (m & 2) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
b[ 3] = data25;
-
+
b[ 4] = data02;
b[ 5] = data10;
b[ 6] = data18;
b[ 7] = data26;
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
-
+
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data09 = *(ao2 + 0);
data17 = *(ao3 + 0);
data25 = *(ao4 + 0);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data17;
b[ 3] = data25;
-
+
b += 4;
}
- } else
+ } else
if (X > posY) {
if (m & 2) {
ao1 += 2 * lda;
b += 8;
}
-
+
if (m & 1) {
b += 4;
}
@@ -1049,7 +1049,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
data27 = *(ao4 + 2);
}
-
+
#ifdef UNIT
b[ 0] = ONE;
#else
@@ -1059,7 +1059,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = data17;
b[ 3] = data25;
b += 4;
-
+
if(i >= 2) {
b[ 0] = ZERO;
#ifdef UNIT
@@ -1071,7 +1071,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = data26;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -1109,7 +1109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data09;
b[ 2] = data02;
@@ -1119,7 +1119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -1156,15 +1156,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
data09 = *(ao2 + 0);
-
+
b[ 0] = data01;
b[ 1] = data09;
b += 2;
- } else
+ } else
if (X > posY) {
b += 2;
} else {
@@ -1201,7 +1201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
ao1 += 1;
b += 1;
- } else
+ } else
if (X > posY) {
ao1 += lda;
b += 1;
diff --git a/kernel/generic/trmm_utcopy_1.c b/kernel/generic/trmm_utcopy_1.c
index 92f2da3..86665e8 100644
--- a/kernel/generic/trmm_utcopy_1.c
+++ b/kernel/generic/trmm_utcopy_1.c
@@ -48,13 +48,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (n > 0) {
X = posX;
-
+
if (posX <= posY) {
ao1 = a + posX + (posY + 0) * lda;
} else {
ao1 = a + posY + (posX + 0) * lda;
}
-
+
i = m;
if (m > 0) {
do {
@@ -77,7 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 1;
ao1 += lda;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/trmm_utcopy_16.c b/kernel/generic/trmm_utcopy_16.c
index a964cd3..b83989f 100644
--- a/kernel/generic/trmm_utcopy_16.c
+++ b/kernel/generic/trmm_utcopy_16.c
@@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a15 = a + posY + (posX + 14) * lda;
a16 = a + posY + (posX + 15) * lda;
}
-
+
i = (m >> 4);
if (i > 0) {
do {
@@ -110,11 +110,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a15 += 16;
a16 += 16;
b += 256;
- } else
+ } else
if (X > posY) {
for (ii = 0; ii < 16; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -123,7 +123,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
a01 += lda;
b += 16;
}
@@ -174,7 +174,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = ZERO;
b[ 14] = ZERO;
b[ 15] = ZERO;
-
+
b[ 16] = *(a02 + 0);
#ifdef UNIT
b[ 17] = ONE;
@@ -506,7 +506,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a14 += 16 * lda;
a15 += 16 * lda;
a16 += 16 * lda;
-
+
b += 256;
}
@@ -535,11 +535,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a15 += i;
a16 += i;
b += 16 * i;
- } else
+ } else
if (X > posY) {
-
+
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -548,7 +548,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -557,7 +557,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
a01 += lda;
a02 += lda;
a03 += lda;
@@ -576,7 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a16 += lda;
b += 16;
}
-
+
} else {
#ifdef UNIT
b[ 0] = ONE;
@@ -598,7 +598,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = ZERO;
b[14] = ZERO;
b[15] = ZERO;
-
+
if (i >= 2) {
b[ 0] = *(a02 + 0);
#ifdef UNIT
@@ -942,7 +942,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
js --;
} while (js > 0);
} /* End of main loop */
-
+
if (n & 8){
X = posX;
@@ -966,7 +966,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 = a + posY + (posX + 6) * lda;
a08 = a + posY + (posX + 7) * lda;
}
-
+
i = (m >> 3);
if (i > 0) {
do {
@@ -980,11 +980,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += 8;
a08 += 8;
b += 64;
- } else
+ } else
if (X > posY) {
-
+
for (ii = 0; ii < 8; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -993,7 +993,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
a01 += lda;
b += 8;
}
@@ -1018,7 +1018,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = *(a02 + 0);
#ifdef UNIT
b[ 9] = ONE;
@@ -1121,7 +1121,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 64;
}
-
+
X += 8;
i --;
} while (i > 0);
@@ -1139,10 +1139,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += i;
a08 += i;
b += 8 * i;
- } else
+ } else
if (X > posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -1151,11 +1151,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
a01 += lda;
b += 8;
}
-
+
a02 += i * lda;
a03 += i * lda;
a04 += i * lda;
@@ -1177,7 +1177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = ZERO;
b[ 7] = ZERO;
b += 8;
-
+
if (i >= 2) {
b[ 0] = *(a02 + 0);
#ifdef UNIT
@@ -1292,7 +1292,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 = a + posY + (posX + 2) * lda;
a04 = a + posY + (posX + 3) * lda;
}
-
+
i = (m >> 2);
if (i > 0) {
do {
@@ -1302,11 +1302,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 += 4;
a04 += 4;
b += 16;
- } else
+ } else
if (X > posY) {
-
+
for (ii = 0; ii < 4; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -1328,7 +1328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = ZERO;
b[ 2] = ZERO;
b[ 3] = ZERO;
-
+
b[ 4] = *(a02 + 0);
#ifdef UNIT
b[ 5] = ONE;
@@ -1346,7 +1346,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 10] = *(a03 + 2);
#endif
b[ 11] = ZERO;
-
+
b[ 12] = *(a04 + 0);
b[ 13] = *(a04 + 1);
b[ 14] = *(a04 + 2);
@@ -1362,7 +1362,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a04 += 4 * lda;
b += 16;
}
-
+
X += 4;
i --;
} while (i > 0);
@@ -1376,10 +1376,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 += i;
a04 += i;
b += 4 * i;
- } else
+ } else
if (X > posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -1391,7 +1391,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 += lda;
a04 += lda;
} else {
-
+
#ifdef UNIT
b[ 0] = ONE;
#else
@@ -1401,7 +1401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = ZERO;
b[ 3] = ZERO;
b += 4;
-
+
if (i >= 2) {
b[ 0] = *(a02 + 0);
#ifdef UNIT
@@ -1440,7 +1440,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 = a + posY + (posX + 0) * lda;
a02 = a + posY + (posX + 1) * lda;
}
-
+
i = (m >> 1);
if (i > 0) {
do {
@@ -1448,7 +1448,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += 2;
a02 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -1465,7 +1465,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = *(a01 + 0);
#endif
b[ 1] = ZERO;
-
+
b[ 2] = *(a02 + 0);
#ifdef UNIT
b[ 3] = ONE;
@@ -1477,7 +1477,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a02 += 2 * lda;
b += 4;
}
-
+
X += 2;
i --;
} while (i > 0);
@@ -1488,7 +1488,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 ++;
a02 ++;
b += 2;
- } else
+ } else
if (X > posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -1507,7 +1507,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
posY += 2;
}
-
+
if (n & 1){
X = posX;
@@ -1517,14 +1517,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} else {
a01 = a + posY + (posX + 0) * lda;
}
-
+
i = m;
if (i > 0) {
do {
if (X < posY) {
a01 += 1;
b ++;
- } else
+ } else
if (X > posY) {
b[ 0] = *(a01 + 0);
a01 += lda;
@@ -1538,7 +1538,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += lda;
b ++;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/trmm_utcopy_2.c b/kernel/generic/trmm_utcopy_2.c
index 620b06a..ae4a19e 100644
--- a/kernel/generic/trmm_utcopy_2.c
+++ b/kernel/generic/trmm_utcopy_2.c
@@ -69,13 +69,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -86,9 +86,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 4;
} else {
-#ifdef UNIT
+#ifdef UNIT
data03 = *(ao2 + 0);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data03;
@@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data01 = *(ao1 + 0);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = data03;
@@ -115,27 +115,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X < posY) {
ao1 += 1;
ao2 += 1;
b += 2;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
b += 2;
} else {
-#ifdef UNIT
+#ifdef UNIT
b[ 0] = ONE;
b[ 1] = ZERO;
#else
data01 = *(ao1 + 0);
-
+
b[ 0] = data01;
b[ 1] = ZERO;
#endif
@@ -151,13 +151,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1){
X = posX;
-
+
if (posX <= posY) {
ao1 = a + posX + (posY + 0) * lda;
} else {
ao1 = a + posY + (posX + 0) * lda;
}
-
+
i = m;
if (m > 0) {
do {
@@ -180,7 +180,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 1;
ao1 += lda;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/trmm_utcopy_4.c b/kernel/generic/trmm_utcopy_4.c
index 7d4dba3..441f733 100644
--- a/kernel/generic/trmm_utcopy_4.c
+++ b/kernel/generic/trmm_utcopy_4.c
@@ -75,28 +75,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao3 += 4;
ao4 += 4;
b += 16;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -122,14 +122,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 16;
} else {
-#ifdef UNIT
+#ifdef UNIT
data05 = *(ao2 + 0);
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ONE;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = ONE;
@@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 16;
}
@@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
if (m & 2) {
@@ -207,7 +207,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 2;
b += 8;
}
-
+
if (m & 1) {
ao1 += 1;
ao2 += 1;
@@ -215,8 +215,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 1;
b += 4;
}
-
- } else
+
+ } else
if (X > posY) {
if (m & 2) {
data01 = *(ao1 + 0);
@@ -227,7 +227,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -236,30 +236,30 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
ao1 += lda;
b += 4;
}
-
+
} else {
-#ifdef UNIT
+#ifdef UNIT
if (i >= 2) {
data05 = *(ao2 + 0);
}
@@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = ZERO;
b[ 3] = ZERO;
b += 4;
-
+
if(i >= 2) {
b[ 0] = data05;
b[ 1] = ONE;
@@ -282,7 +282,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = ZERO;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = data09;
b[ 1] = data10;
@@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = ZERO;
b[ 3] = ZERO;
b += 4;
-
+
if(i >= 2) {
b[ 0] = data05;
b[ 1] = data06;
@@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = ZERO;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = data09;
b[ 1] = data10;
@@ -353,7 +353,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -399,18 +399,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X < posY) {
ao1 += 2;
b += 2;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
-
+
ao1 += lda;
b += 2;
} else {
diff --git a/kernel/generic/trmm_utcopy_6.c b/kernel/generic/trmm_utcopy_6.c
index 7d4dba3..441f733 100644
--- a/kernel/generic/trmm_utcopy_6.c
+++ b/kernel/generic/trmm_utcopy_6.c
@@ -75,28 +75,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao3 += 4;
ao4 += 4;
b += 16;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data05 = *(ao2 + 0);
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data11 = *(ao3 + 2);
data12 = *(ao3 + 3);
-
+
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -122,14 +122,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 16;
} else {
-#ifdef UNIT
+#ifdef UNIT
data05 = *(ao2 + 0);
data09 = *(ao3 + 0);
data10 = *(ao3 + 1);
data13 = *(ao4 + 0);
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ONE;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = ONE;
@@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao4 + 1);
data15 = *(ao4 + 2);
data16 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 16;
}
@@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
if (m & 2) {
@@ -207,7 +207,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 2;
b += 8;
}
-
+
if (m & 1) {
ao1 += 1;
ao2 += 1;
@@ -215,8 +215,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 1;
b += 4;
}
-
- } else
+
+ } else
if (X > posY) {
if (m & 2) {
data01 = *(ao1 + 0);
@@ -227,7 +227,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -236,30 +236,30 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
ao1 += lda;
b += 4;
}
-
+
} else {
-#ifdef UNIT
+#ifdef UNIT
if (i >= 2) {
data05 = *(ao2 + 0);
}
@@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = ZERO;
b[ 3] = ZERO;
b += 4;
-
+
if(i >= 2) {
b[ 0] = data05;
b[ 1] = ONE;
@@ -282,7 +282,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = ZERO;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = data09;
b[ 1] = data10;
@@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = ZERO;
b[ 3] = ZERO;
b += 4;
-
+
if(i >= 2) {
b[ 0] = data05;
b[ 1] = data06;
@@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = ZERO;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = data09;
b[ 1] = data10;
@@ -353,7 +353,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -399,18 +399,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X < posY) {
ao1 += 2;
b += 2;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
-
+
ao1 += lda;
b += 2;
} else {
diff --git a/kernel/generic/trmm_utcopy_8.c b/kernel/generic/trmm_utcopy_8.c
index 6dbf8bd..65fee35 100644
--- a/kernel/generic/trmm_utcopy_8.c
+++ b/kernel/generic/trmm_utcopy_8.c
@@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao7 = a + posY + (posX + 6) * lda;
ao8 = a + posY + (posX + 7) * lda;
}
-
+
i = (m >> 3);
if (i > 0) {
do {
@@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao8 += 8;
b += 64;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -114,7 +114,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
@@ -123,7 +123,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
@@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data38 = *(ao5 + 5);
data39 = *(ao5 + 6);
data40 = *(ao5 + 7);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
@@ -159,7 +159,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data54 = *(ao7 + 5);
data55 = *(ao7 + 6);
data56 = *(ao7 + 7);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
@@ -168,7 +168,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data62 = *(ao8 + 5);
data63 = *(ao8 + 6);
data64 = *(ao8 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -195,7 +195,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data23;
b[23] = data24;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -213,7 +213,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[37] = data38;
b[38] = data39;
b[39] = data40;
-
+
b[40] = data41;
b[41] = data42;
b[42] = data43;
@@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[61] = data62;
b[62] = data63;
b[63] = data64;
-
+
ao1 += 8 * lda;
ao2 += 8 * lda;
ao3 += 8 * lda;
@@ -249,25 +249,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8 * lda;
ao7 += 8 * lda;
ao8 += 8 * lda;
-
+
b += 64;
} else {
#ifdef UNIT
data09 = *(ao2 + 0);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
data36 = *(ao5 + 3);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
@@ -280,7 +280,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data52 = *(ao7 + 3);
data53 = *(ao7 + 4);
data54 = *(ao7 + 5);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
@@ -297,7 +297,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data09;
b[ 9] = ONE;
b[10] = ZERO;
@@ -315,7 +315,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = ZERO;
b[22] = ZERO;
b[23] = ZERO;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -333,7 +333,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[37] = ZERO;
b[38] = ZERO;
b[39] = ZERO;
-
+
b[40] = data41;
b[41] = data42;
b[42] = data43;
@@ -365,22 +365,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
-
+
data33 = *(ao5 + 0);
data34 = *(ao5 + 1);
data35 = *(ao5 + 2);
data36 = *(ao5 + 3);
data37 = *(ao5 + 4);
-
+
data41 = *(ao6 + 0);
data42 = *(ao6 + 1);
data43 = *(ao6 + 2);
@@ -395,7 +395,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data53 = *(ao7 + 4);
data54 = *(ao7 + 5);
data55 = *(ao7 + 6);
-
+
data57 = *(ao8 + 0);
data58 = *(ao8 + 1);
data59 = *(ao8 + 2);
@@ -413,7 +413,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = ZERO;
@@ -431,7 +431,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = ZERO;
b[22] = ZERO;
b[23] = ZERO;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -449,7 +449,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[37] = ZERO;
b[38] = ZERO;
b[39] = ZERO;
-
+
b[40] = data41;
b[41] = data42;
b[42] = data43;
@@ -486,7 +486,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8 * lda;
ao7 += 8 * lda;
ao8 += 8 * lda;
-
+
b += 64;
}
@@ -497,7 +497,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X < posY) {
if (m & 4) {
@@ -509,10 +509,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 4;
ao7 += 4;
ao8 += 4;
-
+
b += 32;
}
-
+
if (m & 2) {
ao1 += 2;
ao2 += 2;
@@ -522,14 +522,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 2;
ao7 += 2;
ao8 += 2;
-
+
b += 16;
}
-
+
if (m & 1) {
b += 8;
}
- } else
+ } else
if (X > posY) {
if (m & 4) {
data01 = *(ao1 + 0);
@@ -540,7 +540,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -549,7 +549,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
@@ -558,7 +558,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -567,7 +567,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -576,7 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -585,7 +585,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
b[16] = data17;
b[17] = data18;
b[18] = data19;
@@ -594,7 +594,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data23;
b[23] = data24;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -603,15 +603,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[29] = data30;
b[30] = data31;
b[31] = data32;
-
+
ao1 += 4 * lda;
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 32;
}
-
+
if (m & 2) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -621,7 +621,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -630,7 +630,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -639,7 +639,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -648,11 +648,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
ao1 += 2 * lda;
b += 16;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -671,7 +671,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b += 8;
}
} else {
@@ -709,7 +709,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data51 = *(ao7 + 2);
data59 = *(ao8 + 2);
}
-
+
if (i >= 4) {
#ifndef UNIT
data28 = *(ao4 + 3);
@@ -757,7 +757,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = ZERO;
b[ 7] = ZERO;
b += 8;
-
+
if(i >= 2) {
b[ 0] = data09;
#ifdef UNIT
@@ -773,7 +773,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = data17;
b[ 1] = data18;
@@ -789,8 +789,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
- if (i >= 4) {
+
+ if (i >= 4) {
b[ 0] = data25;
b[ 1] = data26;
b[ 2] = data27;
@@ -821,7 +821,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
+
if (i >= 6) {
b[ 0] = data41;
b[ 1] = data42;
@@ -888,7 +888,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 16;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -914,7 +914,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -1007,7 +1007,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 16;
}
@@ -1018,7 +1018,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
if (m & 2) {
@@ -1026,14 +1026,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
ao3 += 2;
ao4 += 2;
-
+
b += 8;
}
-
+
if (m & 1) {
b += 4;
}
- } else
+ } else
if (X > posY) {
if (m & 2) {
data01 = *(ao1 + 0);
@@ -1045,7 +1045,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -1054,11 +1054,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data10;
b[ 6] = data11;
b[ 7] = data12;
-
+
ao1 += 2 * lda;
b += 8;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -1069,7 +1069,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
b += 4;
}
} else {
@@ -1095,7 +1095,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
data27 = *(ao4 + 2);
}
-
+
#ifndef UNIT
b[ 0] = ONE;
#else
@@ -1105,7 +1105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = ZERO;
b[ 3] = ZERO;
b += 4;
-
+
if(i >= 2) {
b[ 0] = data09;
#ifndef UNIT
@@ -1117,10 +1117,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 3] = ZERO;
b += 4;
}
-
+
if (i >= 3) {
b[ 0] = data17;
-
+
b[ 1] = data18;
#ifndef UNIT
b[ 2] = ONE;
@@ -1155,7 +1155,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -1164,7 +1164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -1204,10 +1204,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X < posY) {
b += 2;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -1247,11 +1247,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
ao1 += 1;
b += 1;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
ao1 += lda;
-
+
b[ 0] = data01;
b += 1;
diff --git a/kernel/generic/trmmkernel_16x2.c b/kernel/generic/trmmkernel_16x2.c
index 437fa09..078a91d 100644
--- a/kernel/generic/trmmkernel_16x2.c
+++ b/kernel/generic/trmmkernel_16x2.c
@@ -1,6 +1,6 @@
#include "common.h"
-int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
{
BLASLONG i,j,k;
@@ -51,12 +51,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
BLASLONG off, temp;
#if !defined(LEFT)
- off = -offset;
+ off = -offset;
#endif
- for (j=0; j<bn/2; j+=1)
+ for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+ldc;
@@ -69,7 +69,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
ptrba = ba;
- for (i=0; i<bm/16; i+=1)
+ for (i=0; i<bm/16; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -120,13 +120,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+16; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -274,11 +274,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 16; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*16;
@@ -296,7 +296,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
- if ( bm & 8)
+ if ( bm & 8)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -328,13 +328,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+8; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -412,11 +412,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C1[7] = res1_7;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 8; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*8;
@@ -454,13 +454,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -506,11 +506,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C1[3] = res1_3;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4;
@@ -545,13 +545,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -581,11 +581,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C1[1] = res1_1;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2;
@@ -618,13 +618,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -646,11 +646,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C1[0] = res1_0;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1;
@@ -683,7 +683,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
- for (j=0; j<(bn&1); j+=1)
+ for (j=0; j<(bn&1); j+=1)
{
C0 = C;
@@ -694,7 +694,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
ptrba = ba;
- for (i=0; i<bm/16; i+=1)
+ for (i=0; i<bm/16; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -727,13 +727,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+16; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -827,11 +827,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[15] = res0_15;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 16; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*16;
@@ -870,13 +870,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+8; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -926,11 +926,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[6] = res0_6;
C0[7] = res0_7;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 8; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*8;
@@ -962,13 +962,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -999,11 +999,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[3] = res0_3;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4;
@@ -1035,13 +1035,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -1062,11 +1062,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[1] = res0_1;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2;
@@ -1096,13 +1096,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -1118,11 +1118,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[0] = res0_0;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1;
diff --git a/kernel/generic/trmmkernel_2x2.c b/kernel/generic/trmmkernel_2x2.c
index 5b16806..40fbeea 100644
--- a/kernel/generic/trmmkernel_2x2.c
+++ b/kernel/generic/trmmkernel_2x2.c
@@ -3,16 +3,16 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
- )
+ )
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
BLASLONG off, temp;
#if defined(TRMMKERNEL) && !defined(LEFT)
- off = -offset;
+ off = -offset;
#endif
- for (j=0; j<bn/2; j+=1)
+ for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+ldc;
@@ -20,7 +20,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
off = offset;
#endif
ptrba = ba;
- for (i=0; i<bm/2; i+=1)
+ for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -36,12 +36,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || \
(!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+2;
#else
temp = off+2;
#endif
- for (k=0; k<temp/4; k+=1)
+ for (k=0; k<temp/4; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
@@ -78,7 +78,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
ptrba = ptrba+8;
ptrbb = ptrbb+8;
}
- for (k=0; k<(temp&3); k+=1)
+ for (k=0; k<(temp&3); k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
@@ -100,11 +100,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res3 = res3*alpha;
C1[1] = res3;
#if ( defined(LEFT) && defined(TRANSA)) || \
- (!defined(LEFT) && !defined(TRANSA))
+ (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
-#else
+#else
temp -= 2;
#endif
ptrba += temp*2;
@@ -116,11 +116,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0 = C0+2;
C1 = C1+2;
}
- for (i=0; i<(bm&1); i+=1)
+ for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
-#else
+#else
ptrba += off;
ptrbb = bb+off*2;
#endif
@@ -130,10 +130,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
temp = bk-off;
#elif defined(LEFT)
temp = off+1;
-#else
+#else
temp = off+2;
#endif
- for (k=0; k<temp; k+=1)
+ for (k=0; k<temp; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[2*0+0];
@@ -147,17 +147,17 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[0] = res0;
res1 = res1*alpha;
C1[0] = res1;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 1;
-#else
+#else
temp -= 2;
#endif
ptrba += temp;
ptrbb += temp*2;
#endif
-#ifdef LEFT
+#ifdef LEFT
off += 1;
#endif
C0 = C0+1;
@@ -171,19 +171,19 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
i = (ldc<<1);
C = C+i;
}
- for (j=0; j<(bn&1); j+=1)
+ for (j=0; j<(bn&1); j+=1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
- for (i=0; i<bm/2; i+=1)
+ for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
-#else
+#else
ptrba += off*2;
ptrbb = bb + off;
#endif
@@ -197,7 +197,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#else
temp = off+1;
#endif
- for (k=0; k<temp; k+=1)
+ for (k=0; k<temp; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[0+0];
@@ -216,7 +216,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
temp = bk - off;
#ifdef LEFT
temp -= 2;
-#else
+#else
temp -= 1;
#endif
ptrba += temp*2;
@@ -228,11 +228,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0 = C0+2;
}
- for (i=0; i<(bm&1); i+=1)
+ for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
-#else
+#else
ptrba += off;
ptrbb = bb+off;
#endif
@@ -241,10 +241,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
temp = bk-off;
#elif defined(LEFT)
temp = off + 1;
-#else
+#else
temp = off + 1;
#endif
- for (k=0; k<temp; k+=1)
+ for (k=0; k<temp; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[0+0];
@@ -258,7 +258,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
temp = bk-off;
#ifdef LEFT
temp -= 1;
-#else
+#else
temp -= 1;
#endif
ptrba += temp;
diff --git a/kernel/generic/trmmkernel_8x2.c b/kernel/generic/trmmkernel_8x2.c
index 5af289c..2aa43c7 100644
--- a/kernel/generic/trmmkernel_8x2.c
+++ b/kernel/generic/trmmkernel_8x2.c
@@ -1,6 +1,6 @@
#include "common.h"
-int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
{
BLASLONG i,j,k;
@@ -33,12 +33,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
BLASLONG off, temp;
#if !defined(LEFT)
- off = -offset;
+ off = -offset;
#endif
- for (j=0; j<bn/2; j+=1)
+ for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+ldc;
@@ -50,7 +50,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
ptrba = ba;
- for (i=0; i<bm/8; i+=1)
+ for (i=0; i<bm/8; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -82,13 +82,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+8; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -166,11 +166,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C1[7] = res1_7;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 8; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*8;
@@ -208,13 +208,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -260,11 +260,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C1[3] = res1_3;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4;
@@ -299,13 +299,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -335,11 +335,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C1[1] = res1_1;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2;
@@ -372,13 +372,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
@@ -400,11 +400,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C1[0] = res1_0;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
-#else
+#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1;
@@ -437,7 +437,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
- for (j=0; j<(bn&1); j+=1)
+ for (j=0; j<(bn&1); j+=1)
{
C0 = C;
@@ -447,7 +447,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
ptrba = ba;
- for (i=0; i<bm/8; i+=1)
+ for (i=0; i<bm/8; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -469,13 +469,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+8; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -525,11 +525,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[6] = res0_6;
C0[7] = res0_7;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 8; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*8;
@@ -561,13 +561,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -598,11 +598,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[3] = res0_3;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4;
@@ -634,13 +634,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -661,11 +661,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[1] = res0_1;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2;
@@ -695,13 +695,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
-#elif defined(LEFT)
+#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif
- for (k=0; k<temp; k++)
+ for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
@@ -717,11 +717,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[0] = res0_0;
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
-#else
+#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1;
diff --git a/kernel/generic/trsm_kernel_LN.c b/kernel/generic/trsm_kernel_LN.c
index 931cba3..c08f86e 100644
--- a/kernel/generic/trsm_kernel_LN.c
+++ b/kernel/generic/trsm_kernel_LN.c
@@ -104,7 +104,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
for (i = m - 1; i >= 0; i--) {
aa = *(a + i);
-
+
for (j = 0; j < n; j ++) {
bb = *(c + i + j * ldc);
bb *= aa;
@@ -141,7 +141,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
-
+
for (j = 0; j < n; j ++) {
bb1 = *(c + i * 2 + 0 + j * ldc);
bb2 = *(c + i * 2 + 1 + j * ldc);
@@ -181,7 +181,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
#endif
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
FLOAT dummy2,
#endif
@@ -197,33 +197,33 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#endif
j = (n >> GEMM_UNROLL_N_SHIFT);
-
+
while (j > 0) {
kk = m + offset;
-
+
if (m & (GEMM_UNROLL_M - 1)) {
for (i = 1; i < GEMM_UNROLL_M; i *= 2){
if (m & i) {
aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE;
cc = c + ((m & ~(i - 1)) - i) * COMPSIZE;
-
+
if (k - kk > 0) {
- GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
+ GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + i * kk * COMPSIZE,
- b + GEMM_UNROLL_N * kk * COMPSIZE,
+ b + GEMM_UNROLL_N * kk * COMPSIZE,
cc,
- ldc);
+ ldc);
}
- solve(i, GEMM_UNROLL_N,
- aa + (kk - i) * i * COMPSIZE,
+ solve(i, GEMM_UNROLL_N,
+ aa + (kk - i) * i * COMPSIZE,
b + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
-
+
kk -= i;
}
}
@@ -236,102 +236,102 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
do {
if (k - kk > 0) {
- GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1,
+ GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + GEMM_UNROLL_M * kk * COMPSIZE,
- b + GEMM_UNROLL_N * kk * COMPSIZE,
+ b + GEMM_UNROLL_N * kk * COMPSIZE,
cc,
- ldc);
+ ldc);
}
- solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
- aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
+ solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
+ aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
-
+
aa -= GEMM_UNROLL_M * k * COMPSIZE;
cc -= GEMM_UNROLL_M * COMPSIZE;
kk -= GEMM_UNROLL_M;
i --;
} while (i > 0);
}
-
+
b += GEMM_UNROLL_N * k * COMPSIZE;
c += GEMM_UNROLL_N * ldc * COMPSIZE;
j --;
}
-
+
if (n & (GEMM_UNROLL_N - 1)) {
j = (GEMM_UNROLL_N >> 1);
while (j > 0) {
if (n & j) {
-
+
kk = m + offset;
-
+
if (m & (GEMM_UNROLL_M - 1)) {
for (i = 1; i < GEMM_UNROLL_M; i *= 2){
if (m & i) {
aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE;
cc = c + ((m & ~(i - 1)) - i) * COMPSIZE;
-
+
if (k - kk > 0) {
- GEMM_KERNEL(i, j, k - kk, dm1,
+ GEMM_KERNEL(i, j, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + i * kk * COMPSIZE,
- b + j * kk * COMPSIZE,
- cc, ldc);
+ b + j * kk * COMPSIZE,
+ cc, ldc);
}
- solve(i, j,
+ solve(i, j,
aa + (kk - i) * i * COMPSIZE,
b + (kk - i) * j * COMPSIZE,
cc, ldc);
-
+
kk -= i;
}
}
}
-
+
i = (m >> GEMM_UNROLL_M_SHIFT);
if (i > 0) {
aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE;
cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE;
-
+
do {
if (k - kk > 0) {
- GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1,
+ GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + GEMM_UNROLL_M * kk * COMPSIZE,
- b + j * kk * COMPSIZE,
+ b + j * kk * COMPSIZE,
cc,
- ldc);
+ ldc);
}
- solve(GEMM_UNROLL_M, j,
+ solve(GEMM_UNROLL_M, j,
aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,
b + (kk - GEMM_UNROLL_M) * j * COMPSIZE,
cc, ldc);
-
+
aa -= GEMM_UNROLL_M * k * COMPSIZE;
cc -= GEMM_UNROLL_M * COMPSIZE;
kk -= GEMM_UNROLL_M;
i --;
} while (i > 0);
}
-
+
b += j * k * COMPSIZE;
c += j * ldc * COMPSIZE;
}
j >>= 1;
}
}
-
+
return 0;
}
diff --git a/kernel/generic/trsm_kernel_LT.c b/kernel/generic/trsm_kernel_LT.c
index 0996242..07b3346 100644
--- a/kernel/generic/trsm_kernel_LT.c
+++ b/kernel/generic/trsm_kernel_LT.c
@@ -101,7 +101,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
for (i = 0; i < m; i++) {
aa = *(a + i);
-
+
for (j = 0; j < n; j ++) {
bb = *(c + i + j * ldc);
bb *= aa;
@@ -134,7 +134,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
aa1 = *(a + i * 2 + 0);
aa2 = *(a + i * 2 + 1);
-
+
for (j = 0; j < n; j ++) {
bb1 = *(c + i * 2 + 0 + j * ldc);
bb2 = *(c + i * 2 + 1 + j * ldc);
@@ -191,24 +191,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
j = (n >> GEMM_UNROLL_N_SHIFT);
while (j > 0) {
-
+
kk = offset;
aa = a;
cc = c;
-
+
i = (m >> GEMM_UNROLL_M_SHIFT);
-
+
while (i > 0) {
if (kk > 0) {
- GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1,
+ GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
- aa, b, cc, ldc);
+ aa, b, cc, ldc);
}
- solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
+ solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
aa + kk * GEMM_UNROLL_M * COMPSIZE,
b + kk * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
@@ -218,19 +218,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
kk += GEMM_UNROLL_M;
i --;
}
-
+
if (m & (GEMM_UNROLL_M - 1)) {
i = (GEMM_UNROLL_M >> 1);
while (i > 0) {
if (m & i) {
if (kk > 0) {
- GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
+ GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
- aa, b, cc, ldc);
+ aa, b, cc, ldc);
}
- solve(i, GEMM_UNROLL_N,
+ solve(i, GEMM_UNROLL_N,
aa + kk * i * COMPSIZE,
b + kk * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
@@ -242,39 +242,39 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
i >>= 1;
}
}
-
+
b += GEMM_UNROLL_N * k * COMPSIZE;
c += GEMM_UNROLL_N * ldc * COMPSIZE;
j --;
jj += GEMM_UNROLL_M;
}
-
+
if (n & (GEMM_UNROLL_N - 1)) {
j = (GEMM_UNROLL_N >> 1);
while (j > 0) {
if (n & j) {
-
+
kk = offset;
aa = a;
cc = c;
-
+
i = (m >> GEMM_UNROLL_M_SHIFT);
-
+
while (i > 0) {
if (kk > 0) {
- GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1,
+ GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa,
- b,
+ b,
cc,
- ldc);
+ ldc);
}
- solve(GEMM_UNROLL_M, j,
- aa + kk * GEMM_UNROLL_M * COMPSIZE,
+ solve(GEMM_UNROLL_M, j,
+ aa + kk * GEMM_UNROLL_M * COMPSIZE,
b + kk * j * COMPSIZE, cc, ldc);
aa += GEMM_UNROLL_M * k * COMPSIZE;
@@ -282,24 +282,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
kk += GEMM_UNROLL_M;
i --;
}
-
+
if (m & (GEMM_UNROLL_M - 1)) {
i = (GEMM_UNROLL_M >> 1);
while (i > 0) {
if (m & i) {
if (kk > 0) {
- GEMM_KERNEL(i, j, kk, dm1,
+ GEMM_KERNEL(i, j, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa,
- b,
+ b,
cc,
- ldc);
+ ldc);
}
- solve(i, j,
- aa + kk * i * COMPSIZE,
+ solve(i, j,
+ aa + kk * i * COMPSIZE,
b + kk * j * COMPSIZE, cc, ldc);
aa += i * k * COMPSIZE;
@@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
i >>= 1;
}
}
-
+
b += j * k * COMPSIZE;
c += j * ldc * COMPSIZE;
}
diff --git a/kernel/generic/trsm_kernel_RN.c b/kernel/generic/trsm_kernel_RN.c
index d7e650e..07a4f3b 100644
--- a/kernel/generic/trsm_kernel_RN.c
+++ b/kernel/generic/trsm_kernel_RN.c
@@ -101,7 +101,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
for (i = 0; i < n; i++) {
bb = *(b + i);
-
+
for (j = 0; j < m; j ++) {
aa = *(c + j + i * ldc);
aa *= bb;
@@ -134,7 +134,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
bb1 = *(b + i * 2 + 0);
bb2 = *(b + i * 2 + 1);
-
+
for (j = 0; j < m; j ++) {
aa1 = *(c + j * 2 + 0 + i * ldc);
aa2 = *(c + j * 2 + 1 + i * ldc);
@@ -171,7 +171,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
#endif
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
FLOAT dummy2,
#endif
@@ -191,46 +191,46 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
kk = -offset;
while (j > 0) {
-
+
aa = a;
cc = c;
-
+
i = (m >> GEMM_UNROLL_M_SHIFT);
-
+
if (i > 0) {
do {
if (kk > 0) {
- GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1,
+ GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
- aa, b, cc, ldc);
+ aa, b, cc, ldc);
}
- solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
+ solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
aa + kk * GEMM_UNROLL_M * COMPSIZE,
b + kk * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
-
+
aa += GEMM_UNROLL_M * k * COMPSIZE;
cc += GEMM_UNROLL_M * COMPSIZE;
i --;
} while (i > 0);
}
-
+
if (m & (GEMM_UNROLL_M - 1)) {
i = (GEMM_UNROLL_M >> 1);
while (i > 0) {
if (m & i) {
if (kk > 0) {
- GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
+ GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
- aa, b, cc, ldc);
+ aa, b, cc, ldc);
}
- solve(i, GEMM_UNROLL_N,
+ solve(i, GEMM_UNROLL_N,
aa + kk * i * COMPSIZE,
b + kk * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
@@ -241,63 +241,63 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
i >>= 1;
}
}
-
+
kk += GEMM_UNROLL_N;
b += GEMM_UNROLL_N * k * COMPSIZE;
c += GEMM_UNROLL_N * ldc * COMPSIZE;
j --;
jj += GEMM_UNROLL_M;
}
-
+
if (n & (GEMM_UNROLL_N - 1)) {
j = (GEMM_UNROLL_N >> 1);
while (j > 0) {
if (n & j) {
-
+
aa = a;
cc = c;
-
+
i = (m >> GEMM_UNROLL_M_SHIFT);
-
+
while (i > 0) {
if (kk > 0) {
- GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1,
+ GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa,
- b,
+ b,
cc,
- ldc);
+ ldc);
}
- solve(GEMM_UNROLL_M, j,
- aa + kk * GEMM_UNROLL_M * COMPSIZE,
+ solve(GEMM_UNROLL_M, j,
+ aa + kk * GEMM_UNROLL_M * COMPSIZE,
b + kk * j * COMPSIZE, cc, ldc);
aa += GEMM_UNROLL_M * k * COMPSIZE;
cc += GEMM_UNROLL_M * COMPSIZE;
i --;
}
-
+
if (m & (GEMM_UNROLL_M - 1)) {
i = (GEMM_UNROLL_M >> 1);
while (i > 0) {
if (m & i) {
if (kk > 0) {
- GEMM_KERNEL(i, j, kk, dm1,
+ GEMM_KERNEL(i, j, kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa,
- b,
+ b,
cc,
- ldc);
+ ldc);
}
- solve(i, j,
- aa + kk * i * COMPSIZE,
+ solve(i, j,
+ aa + kk * i * COMPSIZE,
b + kk * j * COMPSIZE, cc, ldc);
aa += i * k * COMPSIZE;
@@ -306,7 +306,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
i >>= 1;
}
}
-
+
b += j * k * COMPSIZE;
c += j * ldc * COMPSIZE;
kk += j;
diff --git a/kernel/generic/trsm_kernel_RT.c b/kernel/generic/trsm_kernel_RT.c
index a469453..0c4db33 100644
--- a/kernel/generic/trsm_kernel_RT.c
+++ b/kernel/generic/trsm_kernel_RT.c
@@ -106,7 +106,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
for (i = n - 1; i >= 0; i--) {
bb = *(b + i);
-
+
for (j = 0; j < m; j ++) {
aa = *(c + j + i * ldc);
aa *= bb;
@@ -144,7 +144,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
bb1 = *(b + i * 2 + 0);
bb2 = *(b + i * 2 + 1);
-
+
for (j = 0; j < m; j ++) {
aa1 = *(c + j * 2 + 0 + i * ldc);
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
BLASLONG i, j;
FLOAT *aa, *cc;
BLASLONG kk;
-
+
#if 0
fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n",
m, n, k, offset);
@@ -208,32 +208,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
j = 1;
while (j < GEMM_UNROLL_N) {
if (n & j) {
-
+
aa = a;
b -= j * k * COMPSIZE;
c -= j * ldc* COMPSIZE;
cc = c;
-
+
i = (m >> GEMM_UNROLL_M_SHIFT);
if (i > 0) {
do {
if (k - kk > 0) {
- GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1,
+ GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + GEMM_UNROLL_M * kk * COMPSIZE,
- b + j * kk * COMPSIZE,
+ b + j * kk * COMPSIZE,
cc,
- ldc);
+ ldc);
}
- solve(GEMM_UNROLL_M, j,
+ solve(GEMM_UNROLL_M, j,
aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE,
b + (kk - j) * j * COMPSIZE,
cc, ldc);
-
+
aa += GEMM_UNROLL_M * k * COMPSIZE;
cc += GEMM_UNROLL_M * COMPSIZE;
i --;
@@ -246,23 +246,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
if (m & i) {
if (k - kk > 0) {
- GEMM_KERNEL(i, j, k - kk, dm1,
+ GEMM_KERNEL(i, j, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + i * kk * COMPSIZE,
- b + j * kk * COMPSIZE,
- cc, ldc);
+ b + j * kk * COMPSIZE,
+ cc, ldc);
}
- solve(i, j,
+ solve(i, j,
aa + (kk - j) * i * COMPSIZE,
b + (kk - j) * j * COMPSIZE,
cc, ldc);
aa += i * k * COMPSIZE;
cc += i * COMPSIZE;
-
+
}
i >>= 1;
} while (i > 0);
@@ -287,21 +287,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
if (i > 0) {
do {
if (k - kk > 0) {
- GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1,
+ GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + GEMM_UNROLL_M * kk * COMPSIZE,
- b + GEMM_UNROLL_N * kk * COMPSIZE,
+ b + GEMM_UNROLL_N * kk * COMPSIZE,
cc,
- ldc);
+ ldc);
}
-
- solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
- aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE,
+
+ solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
+ aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE,
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
-
+
aa += GEMM_UNROLL_M * k * COMPSIZE;
cc += GEMM_UNROLL_M * COMPSIZE;
i --;
@@ -313,28 +313,28 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
do {
if (m & i) {
if (k - kk > 0) {
- GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
+ GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
ZERO,
#endif
aa + i * kk * COMPSIZE,
- b + GEMM_UNROLL_N * kk * COMPSIZE,
+ b + GEMM_UNROLL_N * kk * COMPSIZE,
cc,
- ldc);
+ ldc);
}
-
- solve(i, GEMM_UNROLL_N,
- aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE,
+
+ solve(i, GEMM_UNROLL_N,
+ aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE,
b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
cc, ldc);
-
+
aa += i * k * COMPSIZE;
cc += i * COMPSIZE;
}
i >>= 1;
} while (i > 0);
}
-
+
kk -= GEMM_UNROLL_N;
j --;
} while (j > 0);
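
The trsm_kernel_{LN,LT,RN,RT}.c files above all follow the same two-step tiling scheme: for each GEMM_UNROLL_M x GEMM_UNROLL_N tile of the solution, GEMM_KERNEL is called with alpha = dm1 (minus one) to subtract the contribution of the already-solved part, and the small solve() routine then finishes the diagonal block by multiplying with diagonal entries that the trsm_*copy packing routines have already inverted. Below is a minimal, unblocked sketch of the same computation for the lower/left/non-transposed case; the function name and the flat column-major arguments are illustrative only and do not match the packed buffers used by the real kernels.

    /* Sketch: solve L * X = B by forward substitution, assuming diag_inv[i]
     * already holds 1/L[i][i] (as prepared by the packing routines), so the
     * inner "solve" step multiplies instead of divides. */
    static void trsm_tile_sketch(int m, int n, const double *L, int ldl,
                                 const double *diag_inv, double *B, int ldb)
    {
        for (int j = 0; j < n; j++) {
            for (int i = 0; i < m; i++) {
                double bb = B[i + j * ldb];
                for (int k = 0; k < i; k++)        /* role of GEMM_KERNEL(..., dm1, ...) */
                    bb -= L[i + k * ldl] * B[k + j * ldb];
                B[i + j * ldb] = bb * diag_inv[i]; /* role of solve(): multiply by 1/L[i][i] */
            }
        }
    }
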
diff --git a/kernel/generic/trsm_lncopy_1.c b/kernel/generic/trsm_lncopy_1.c
index abad971..13c88ad 100644
--- a/kernel/generic/trsm_lncopy_1.c
+++ b/kernel/generic/trsm_lncopy_1.c
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
if (ii > jj) *(b + 0) = *(a1 + 0);
-
+
a1 ++;
b ++;
diff --git a/kernel/generic/trsm_lncopy_16.c b/kernel/generic/trsm_lncopy_16.c
index a7f9cb0..9754e67 100644
--- a/kernel/generic/trsm_lncopy_16.c
+++ b/kernel/generic/trsm_lncopy_16.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
-
+
jj = offset;
j = (n >> 4);
@@ -78,14 +78,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 16)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k * lda);
}
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
}
-
+
if (ii - jj >= 16) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a2 + 0);
@@ -143,14 +143,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 8)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k * lda);
}
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
}
-
+
if (ii - jj >= 8) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a2 + 0);
@@ -187,14 +187,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 4)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k * lda);
}
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
}
-
+
if (ii - jj >= 4) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a2 + 0);
@@ -221,14 +221,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 2)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k * lda);
}
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
}
-
+
if (ii - jj >= 2) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a2 + 0);
@@ -249,14 +249,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 1)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k * lda);
}
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
}
-
+
if (ii - jj >= 1) {
*(b + 0) = *(a1 + 0);
}
diff --git a/kernel/generic/trsm_lncopy_2.c b/kernel/generic/trsm_lncopy_2.c
index 20cc642..69bfbea 100644
--- a/kernel/generic/trsm_lncopy_2.c
+++ b/kernel/generic/trsm_lncopy_2.c
@@ -91,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data02;
*(b + 3) = data04;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1+= 1;
b += 1;
i --;
diff --git a/kernel/generic/trsm_lncopy_4.c b/kernel/generic/trsm_lncopy_4.c
index 9f7bcc2..a37c50d 100644
--- a/kernel/generic/trsm_lncopy_4.c
+++ b/kernel/generic/trsm_lncopy_4.c
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data12;
*(b + 15) = data16;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data06;
*(b + 7) = data08;
}
-
+
a1 += 2;
a2 += 2;
a3 += 2;
@@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data02;
*(b + 3) = data04;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1+= 1;
b += 1;
i --;
diff --git a/kernel/generic/trsm_lncopy_6.c b/kernel/generic/trsm_lncopy_6.c
index 9f7bcc2..a37c50d 100644
--- a/kernel/generic/trsm_lncopy_6.c
+++ b/kernel/generic/trsm_lncopy_6.c
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data12;
*(b + 15) = data16;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data06;
*(b + 7) = data08;
}
-
+
a1 += 2;
a2 += 2;
a3 += 2;
@@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data02;
*(b + 3) = data04;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1+= 1;
b += 1;
i --;
diff --git a/kernel/generic/trsm_lncopy_8.c b/kernel/generic/trsm_lncopy_8.c
index 40feb81..ca019fc 100644
--- a/kernel/generic/trsm_lncopy_8.c
+++ b/kernel/generic/trsm_lncopy_8.c
@@ -140,7 +140,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#endif
*(b + 0) = INV(data01);
-
+
*(b + 8) = data02;
*(b + 9) = INV(data10);
@@ -152,7 +152,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 25) = data12;
*(b + 26) = data20;
*(b + 27) = INV(data28);
-
+
*(b + 32) = data05;
*(b + 33) = data13;
*(b + 34) = data21;
@@ -165,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 43) = data30;
*(b + 44) = data38;
*(b + 45) = INV(data46);
-
+
*(b + 48) = data07;
*(b + 49) = data15;
*(b + 50) = data23;
@@ -265,7 +265,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 5) = data41;
*(b + 6) = data49;
*(b + 7) = data57;
-
+
*(b + 8) = data02;
*(b + 9) = data10;
*(b + 10) = data18;
@@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 13) = data42;
*(b + 14) = data50;
*(b + 15) = data58;
-
+
*(b + 16) = data03;
*(b + 17) = data11;
*(b + 18) = data19;
@@ -283,7 +283,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 21) = data43;
*(b + 22) = data51;
*(b + 23) = data59;
-
+
*(b + 24) = data04;
*(b + 25) = data12;
*(b + 26) = data20;
@@ -292,7 +292,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 29) = data44;
*(b + 30) = data52;
*(b + 31) = data60;
-
+
*(b + 32) = data05;
*(b + 33) = data13;
*(b + 34) = data21;
@@ -310,7 +310,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 45) = data46;
*(b + 46) = data54;
*(b + 47) = data62;
-
+
*(b + 48) = data07;
*(b + 49) = data15;
*(b + 50) = data23;
@@ -329,7 +329,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 62) = data56;
*(b + 63) = data64;
}
-
+
a1 += 8;
a2 += 8;
a3 += 8;
@@ -370,7 +370,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#endif
*(b + 0) = INV(data01);
-
+
*(b + 8) = data02;
*(b + 9) = INV(data10);
@@ -429,7 +429,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 5) = data41;
*(b + 6) = data49;
*(b + 7) = data57;
-
+
*(b + 8) = data02;
*(b + 9) = data10;
*(b + 10) = data18;
@@ -438,7 +438,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 13) = data42;
*(b + 14) = data50;
*(b + 15) = data58;
-
+
*(b + 16) = data03;
*(b + 17) = data11;
*(b + 18) = data19;
@@ -447,7 +447,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 21) = data43;
*(b + 22) = data51;
*(b + 23) = data59;
-
+
*(b + 24) = data04;
*(b + 25) = data12;
*(b + 26) = data20;
@@ -456,9 +456,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 29) = data44;
*(b + 30) = data52;
*(b + 31) = data60;
-
+
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -483,7 +483,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#endif
*(b + 0) = INV(data01);
-
+
*(b + 8) = data02;
*(b + 9) = INV(data10);
}
@@ -515,7 +515,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 5) = data41;
*(b + 6) = data49;
*(b + 7) = data57;
-
+
*(b + 8) = data02;
*(b + 9) = data10;
*(b + 10) = data18;
@@ -525,7 +525,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data50;
*(b + 15) = data58;
}
-
+
a1 += 2;
a2 += 2;
a3 += 2;
@@ -608,7 +608,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#endif
*(b + 0) = INV(data01);
-
+
*(b + 4) = data02;
*(b + 5) = INV(data10);
@@ -652,18 +652,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 5) = data10;
*(b + 6) = data18;
*(b + 7) = data26;
-
+
*(b + 8) = data03;
*(b + 9) = data11;
*(b + 10) = data19;
*(b + 11) = data27;
-
+
*(b + 12) = data04;
*(b + 13) = data12;
*(b + 14) = data20;
*(b + 15) = data28;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -686,7 +686,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#endif
*(b + 0) = INV(data01);
-
+
*(b + 4) = data02;
*(b + 5) = INV(data10);
}
@@ -710,7 +710,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data18;
*(b + 7) = data26;
}
-
+
a1 += 2;
a2 += 2;
a3 += 2;
@@ -779,7 +779,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data02;
*(b + 3) = data10;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -828,7 +828,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += 1;
b += 1;
diff --git a/kernel/generic/trsm_ltcopy_16.c b/kernel/generic/trsm_ltcopy_16.c
index 1203f1b..42618c2 100644
--- a/kernel/generic/trsm_ltcopy_16.c
+++ b/kernel/generic/trsm_ltcopy_16.c
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 16)) {
*(b + ii - jj) = INV(*(a1 + ii - jj));
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -107,17 +107,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 8)) {
*(b + ii - jj) = INV(*(a1 + ii - jj));
-
+
for (k = ii - jj + 1; k < 8; k ++) {
*(b + k) = *(a1 + k);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 4)) {
*(b + ii - jj) = INV(*(a1 + ii - jj));
@@ -154,7 +154,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -178,7 +178,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 2)) {
*(b + ii - jj) = INV(*(a1 + ii - jj));
@@ -188,7 +188,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -209,11 +209,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 1)) {
*(b + ii - jj) = INV(*(a1 + ii - jj));
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
}
diff --git a/kernel/generic/trsm_ltcopy_2.c b/kernel/generic/trsm_ltcopy_2.c
index 4705635..9f48e84 100644
--- a/kernel/generic/trsm_ltcopy_2.c
+++ b/kernel/generic/trsm_ltcopy_2.c
@@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data03;
*(b + 3) = data04;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 4;
@@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
#endif
data02 = *(a1 + 1);
-
+
*(b + 0) = INV(data01);
*(b + 1) = data02;
}
@@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += 1 * lda;
b += 1;
diff --git a/kernel/generic/trsm_ltcopy_4.c b/kernel/generic/trsm_ltcopy_4.c
index d891468..12043eb 100644
--- a/kernel/generic/trsm_ltcopy_4.c
+++ b/kernel/generic/trsm_ltcopy_4.c
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data15;
*(b + 15) = data16;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -159,25 +159,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
if ((m & 2) != 0) {
if (ii== jj) {
-
+
#ifndef UNIT
data01 = *(a1 + 0);
#endif
data02 = *(a1 + 1);
data03 = *(a1 + 2);
data04 = *(a1 + 3);
-
+
#ifndef UNIT
data06 = *(a2 + 1);
#endif
data07 = *(a2 + 2);
data08 = *(a2 + 3);
-
+
*(b + 0) = INV(data01);
*(b + 1) = data02;
*(b + 2) = data03;
*(b + 3) = data04;
-
+
*(b + 5) = INV(data06);
*(b + 6) = data07;
*(b + 7) = data08;
@@ -204,7 +204,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -222,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data02 = *(a1 + 1);
data03 = *(a1 + 2);
data04 = *(a1 + 3);
-
+
*(b + 0) = INV(data01);
*(b + 1) = data02;
*(b + 2) = data03;
@@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data03;
*(b + 3) = data04;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 4;
@@ -333,7 +333,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += 1 * lda;
b += 1;
diff --git a/kernel/generic/trsm_ltcopy_6.c b/kernel/generic/trsm_ltcopy_6.c
index d891468..12043eb 100644
--- a/kernel/generic/trsm_ltcopy_6.c
+++ b/kernel/generic/trsm_ltcopy_6.c
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data15;
*(b + 15) = data16;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -159,25 +159,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
if ((m & 2) != 0) {
if (ii== jj) {
-
+
#ifndef UNIT
data01 = *(a1 + 0);
#endif
data02 = *(a1 + 1);
data03 = *(a1 + 2);
data04 = *(a1 + 3);
-
+
#ifndef UNIT
data06 = *(a2 + 1);
#endif
data07 = *(a2 + 2);
data08 = *(a2 + 3);
-
+
*(b + 0) = INV(data01);
*(b + 1) = data02;
*(b + 2) = data03;
*(b + 3) = data04;
-
+
*(b + 5) = INV(data06);
*(b + 6) = data07;
*(b + 7) = data08;
@@ -204,7 +204,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -222,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data02 = *(a1 + 1);
data03 = *(a1 + 2);
data04 = *(a1 + 3);
-
+
*(b + 0) = INV(data01);
*(b + 1) = data02;
*(b + 2) = data03;
@@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data03;
*(b + 3) = data04;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 4;
@@ -333,7 +333,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += 1 * lda;
b += 1;
diff --git a/kernel/generic/trsm_ltcopy_8.c b/kernel/generic/trsm_ltcopy_8.c
index 0925dcc..9d64e26 100644
--- a/kernel/generic/trsm_ltcopy_8.c
+++ b/kernel/generic/trsm_ltcopy_8.c
@@ -326,7 +326,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 62) = data63;
*(b + 63) = data64;
}
-
+
a1 += 8 * lda;
a2 += 8 * lda;
a3 += 8 * lda;
@@ -484,7 +484,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 30) = data31;
*(b + 31) = data32;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -572,7 +572,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data15;
*(b + 15) = data16;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 16;
@@ -720,7 +720,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data27;
*(b + 15) = data28;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -777,7 +777,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data11;
*(b + 7) = data12;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -854,7 +854,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data09;
*(b + 3) = data10;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 4;
@@ -907,7 +907,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += lda;
b += 1;
diff --git a/kernel/generic/trsm_uncopy_1.c b/kernel/generic/trsm_uncopy_1.c
index 3a25860..ee06c4e 100644
--- a/kernel/generic/trsm_uncopy_1.c
+++ b/kernel/generic/trsm_uncopy_1.c
@@ -74,7 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
if (ii < jj) *(b + 0) = *(a1 + 0);
-
+
a1 ++;
b ++;
i --;
diff --git a/kernel/generic/trsm_uncopy_16.c b/kernel/generic/trsm_uncopy_16.c
index e2b8ce4..b0480ce 100644
--- a/kernel/generic/trsm_uncopy_16.c
+++ b/kernel/generic/trsm_uncopy_16.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
-
+
jj = offset;
j = (n >> 4);
@@ -78,14 +78,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 16)) {
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
for (k = ii - jj + 1; k < 16; k ++) {
*(b + k) = *(a1 + k * lda);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a2 + 0);
@@ -143,14 +143,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 8)) {
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
for (k = ii - jj + 1; k < 8; k ++) {
*(b + k) = *(a1 + k * lda);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a2 + 0);
@@ -187,14 +187,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 4)) {
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
for (k = ii - jj + 1; k < 4; k ++) {
*(b + k) = *(a1 + k * lda);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a2 + 0);
@@ -221,14 +221,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 2)) {
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
for (k = ii - jj + 1; k < 2; k ++) {
*(b + k) = *(a1 + k * lda);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a2 + 0);
@@ -249,14 +249,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 1)) {
*(b + ii - jj) = INV(*(a1 + (ii - jj) * lda));
for (k = ii - jj + 1; k < 1; k ++) {
*(b + k) = *(a1 + k * lda);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
}
diff --git a/kernel/generic/trsm_uncopy_2.c b/kernel/generic/trsm_uncopy_2.c
index f7f3435..6c257ee 100644
--- a/kernel/generic/trsm_uncopy_2.c
+++ b/kernel/generic/trsm_uncopy_2.c
@@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data02;
*(b + 3) = data04;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -148,7 +148,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1+= 1;
b += 1;
i --;
diff --git a/kernel/generic/trsm_uncopy_4.c b/kernel/generic/trsm_uncopy_4.c
index 837a250..a1bb1e2 100644
--- a/kernel/generic/trsm_uncopy_4.c
+++ b/kernel/generic/trsm_uncopy_4.c
@@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data12;
*(b + 15) = data16;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
if ((m & 2) != 0) {
if (ii== jj) {
-
+
#ifndef UNIT
data01 = *(a1 + 0);
#endif
@@ -205,7 +205,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 2;
a2 += 2;
b += 8;
@@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data02;
*(b + 3) = data04;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -338,7 +338,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1+= 1;
b += 1;
i --;
diff --git a/kernel/generic/trsm_uncopy_6.c b/kernel/generic/trsm_uncopy_6.c
index 837a250..a1bb1e2 100644
--- a/kernel/generic/trsm_uncopy_6.c
+++ b/kernel/generic/trsm_uncopy_6.c
@@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data12;
*(b + 15) = data16;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
if ((m & 2) != 0) {
if (ii== jj) {
-
+
#ifndef UNIT
data01 = *(a1 + 0);
#endif
@@ -205,7 +205,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 2;
a2 += 2;
b += 8;
@@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data02;
*(b + 3) = data04;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -338,7 +338,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1+= 1;
b += 1;
i --;
diff --git a/kernel/generic/trsm_uncopy_8.c b/kernel/generic/trsm_uncopy_8.c
index 8c5623d..40903d4 100644
--- a/kernel/generic/trsm_uncopy_8.c
+++ b/kernel/generic/trsm_uncopy_8.c
@@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 5) = data41;
*(b + 6) = data49;
*(b + 7) = data57;
-
+
*(b + 8) = data02;
*(b + 9) = data10;
*(b + 10) = data18;
@@ -275,7 +275,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 13) = data42;
*(b + 14) = data50;
*(b + 15) = data58;
-
+
*(b + 16) = data03;
*(b + 17) = data11;
*(b + 18) = data19;
@@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 21) = data43;
*(b + 22) = data51;
*(b + 23) = data59;
-
+
*(b + 24) = data04;
*(b + 25) = data12;
*(b + 26) = data20;
@@ -293,7 +293,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 29) = data44;
*(b + 30) = data52;
*(b + 31) = data60;
-
+
*(b + 32) = data05;
*(b + 33) = data13;
*(b + 34) = data21;
@@ -311,7 +311,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 45) = data46;
*(b + 46) = data54;
*(b + 47) = data62;
-
+
*(b + 48) = data07;
*(b + 49) = data15;
*(b + 50) = data23;
@@ -330,7 +330,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 62) = data56;
*(b + 63) = data64;
}
-
+
a1 += 8;
a2 += 8;
a3 += 8;
@@ -467,7 +467,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 5) = data41;
*(b + 6) = data49;
*(b + 7) = data57;
-
+
*(b + 8) = data02;
*(b + 9) = data10;
*(b + 10) = data18;
@@ -476,7 +476,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 13) = data42;
*(b + 14) = data50;
*(b + 15) = data58;
-
+
*(b + 16) = data03;
*(b + 17) = data11;
*(b + 18) = data19;
@@ -485,7 +485,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 21) = data43;
*(b + 22) = data51;
*(b + 23) = data59;
-
+
*(b + 24) = data04;
*(b + 25) = data12;
*(b + 26) = data20;
@@ -495,7 +495,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 30) = data52;
*(b + 31) = data60;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -579,7 +579,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 5) = data41;
*(b + 6) = data49;
*(b + 7) = data57;
-
+
*(b + 8) = data02;
*(b + 9) = data10;
*(b + 10) = data18;
@@ -589,7 +589,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data50;
*(b + 15) = data58;
}
-
+
a1 += 2;
a2 += 2;
a3 += 2;
@@ -732,7 +732,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 5) = data10;
*(b + 6) = data18;
*(b + 7) = data26;
-
+
*(b + 8) = data03;
*(b + 9) = data11;
*(b + 10) = data19;
@@ -742,7 +742,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data20;
*(b + 15) = data28;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -798,7 +798,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data18;
*(b + 7) = data26;
}
-
+
a1 += 2;
a2 += 2;
a3 += 2;
@@ -879,7 +879,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data02;
*(b + 3) = data10;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -934,7 +934,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += 1;
b += 1;
i --;
diff --git a/kernel/generic/trsm_utcopy_1.c b/kernel/generic/trsm_utcopy_1.c
index ea490d5..cad8180 100644
--- a/kernel/generic/trsm_utcopy_1.c
+++ b/kernel/generic/trsm_utcopy_1.c
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
if (ii > jj) *(b + 0) = *(a1 + 0);
-
+
a1 += lda;
b ++;
i --;
diff --git a/kernel/generic/trsm_utcopy_16.c b/kernel/generic/trsm_utcopy_16.c
index 5466412..741fcde 100644
--- a/kernel/generic/trsm_utcopy_16.c
+++ b/kernel/generic/trsm_utcopy_16.c
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 16)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k);
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + ii - jj) = INV(*(a1 + ii - jj));
}
-
+
if (ii - jj >= 16) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 8)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k);
@@ -113,7 +113,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + ii - jj) = INV(*(a1 + ii - jj));
}
-
+
if (ii - jj >= 8) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 4)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k);
@@ -149,7 +149,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + ii - jj) = INV(*(a1 + ii - jj));
}
-
+
if (ii - jj >= 4) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -173,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 2)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k);
@@ -181,7 +181,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + ii - jj) = INV(*(a1 + ii - jj));
}
-
+
if (ii - jj >= 2) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 1)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k) = *(a1 + k);
@@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + ii - jj) = INV(*(a1 + ii - jj));
}
-
+
if (ii - jj >= 1) {
*(b + 0) = *(a1 + 0);
}
diff --git a/kernel/generic/trsm_utcopy_2.c b/kernel/generic/trsm_utcopy_2.c
index 3def611..bdd5416 100644
--- a/kernel/generic/trsm_utcopy_2.c
+++ b/kernel/generic/trsm_utcopy_2.c
@@ -91,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data03;
*(b + 3) = data04;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 4;
@@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += 1 * lda;
b += 1;
diff --git a/kernel/generic/trsm_utcopy_4.c b/kernel/generic/trsm_utcopy_4.c
index bbba78d..f836172 100644
--- a/kernel/generic/trsm_utcopy_4.c
+++ b/kernel/generic/trsm_utcopy_4.c
@@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data15;
*(b + 15) = data16;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -192,7 +192,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -261,7 +261,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data03;
*(b + 3) = data04;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 4;
@@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += 1 * lda;
b += 1;
diff --git a/kernel/generic/trsm_utcopy_6.c b/kernel/generic/trsm_utcopy_6.c
index bbba78d..f836172 100644
--- a/kernel/generic/trsm_utcopy_6.c
+++ b/kernel/generic/trsm_utcopy_6.c
@@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data15;
*(b + 15) = data16;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -192,7 +192,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -261,7 +261,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data03;
*(b + 3) = data04;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 4;
@@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += 1 * lda;
b += 1;
diff --git a/kernel/generic/trsm_utcopy_8.c b/kernel/generic/trsm_utcopy_8.c
index 531ac59..97da66f 100644
--- a/kernel/generic/trsm_utcopy_8.c
+++ b/kernel/generic/trsm_utcopy_8.c
@@ -325,7 +325,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 62) = data63;
*(b + 63) = data64;
}
-
+
a1 += 8 * lda;
a2 += 8 * lda;
a3 += 8 * lda;
@@ -450,7 +450,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 30) = data31;
*(b + 31) = data32;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -511,7 +511,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data15;
*(b + 15) = data16;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 16;
@@ -637,7 +637,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data27;
*(b + 15) = data28;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -678,7 +678,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data11;
*(b + 7) = data12;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -744,7 +744,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data09;
*(b + 3) = data10;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 4;
@@ -791,7 +791,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
data01 = *(a1 + 0);
*(b + 0) = data01;
}
-
+
a1 += lda;
b += 1;
i --;
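
The trsm_{ln,lt,un,ut}copy_*.c files above implement the matching packing step: they copy one triangular panel of A into a contiguous buffer and, on the diagonal, store INV(...) (the reciprocal in real builds) so the solve() step in the kernels never has to divide. A minimal sketch of that idea for a real, non-unit lower-triangular panel follows; the actual routines pack in unroll-sized blocks rather than whole columns, and pack_lower_sketch is an illustrative name, not a kernel entry point.

    /* Sketch: pack an m x m lower-triangular panel (column-major, leading
     * dimension lda) into a contiguous buffer b with the diagonal pre-inverted. */
    static void pack_lower_sketch(int m, const double *a, int lda, double *b)
    {
        for (int j = 0; j < m; j++)
            for (int i = j; i < m; i++)
                *b++ = (i == j) ? 1.0 / a[i + j * lda] : a[i + j * lda];
    }
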
diff --git a/kernel/generic/zgemm3m_ncopy_1.c b/kernel/generic/zgemm3m_ncopy_1.c
index 7ac734b..0aa9470 100644
--- a/kernel/generic/zgemm3m_ncopy_1.c
+++ b/kernel/generic/zgemm3m_ncopy_1.c
@@ -57,7 +57,7 @@
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
#ifdef USE_ALPHA
- FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT alpha_r, FLOAT alpha_i,
#endif
FLOAT *b){
@@ -70,16 +70,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
while (n > 0) {
a_offset = a;
a += lda;
-
+
for (i = 0; i < m; i ++) {
-
+
a1 = *(a_offset + 0);
a2 = *(a_offset + 1);
-
+
*(b + 0) = CMULT(a1, a2);
-
+
a_offset += 2;
-
+
b ++;
}
n --;
diff --git a/kernel/generic/zgemm3m_ncopy_2.c b/kernel/generic/zgemm3m_ncopy_2.c
index 702524a..dd5a732 100644
--- a/kernel/generic/zgemm3m_ncopy_2.c
+++ b/kernel/generic/zgemm3m_ncopy_2.c
@@ -57,7 +57,7 @@
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
#ifdef USE_ALPHA
- FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT alpha_r, FLOAT alpha_i,
#endif
FLOAT *b){
@@ -71,14 +71,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset = a;
b_offset = b;
-
+
j = (n >> 1);
if (j > 0){
do{
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
for (i = 0; i < m; i ++) {
a1 = *(a_offset1 + 0);
@@ -91,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 += 2;
a_offset2 += 2;
-
+
b_offset += 2;
}
@@ -99,19 +99,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
j--;
}while(j > 0);
} /* end of if(j > 0) */
-
+
if (n & 1) {
a_offset1 = a_offset;
-
+
for (i = 0; i < m; i ++) {
-
+
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset + 0) = CMULT(a1, a2);
-
+
a_offset1 += 2;
-
+
b_offset += 1;
}
}
diff --git a/kernel/generic/zgemm3m_ncopy_4.c b/kernel/generic/zgemm3m_ncopy_4.c
index 1117d77..b4d23e2 100644
--- a/kernel/generic/zgemm3m_ncopy_4.c
+++ b/kernel/generic/zgemm3m_ncopy_4.c
@@ -57,7 +57,7 @@
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
#ifdef USE_ALPHA
- FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT alpha_r, FLOAT alpha_i,
#endif
FLOAT *b){
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset = a;
b_offset = b;
-
+
j = (n >> 2);
if (j > 0){
do{
@@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
for (i = 0; i < m; i ++) {
a1 = *(a_offset1 + 0);
@@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset2 += 2;
a_offset3 += 2;
a_offset4 += 2;
-
+
b_offset += 4;
}
@@ -109,12 +109,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
j--;
}while(j > 0);
} /* end of if(j > 0) */
-
+
if (n & 2) {
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
for (i = 0; i < m; i ++) {
a1 = *(a_offset1 + 0);
@@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 += 2;
a_offset2 += 2;
-
+
b_offset += 2;
}
@@ -135,16 +135,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
if (n & 1) {
a_offset1 = a_offset;
-
+
for (i = 0; i < m; i ++) {
-
+
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset + 0) = CMULT(a1, a2);
-
+
a_offset1 += 2;
-
+
b_offset += 1;
}
}
diff --git a/kernel/generic/zgemm3m_ncopy_8.c b/kernel/generic/zgemm3m_ncopy_8.c
index 0c3cb5d..d3e5da8 100644
--- a/kernel/generic/zgemm3m_ncopy_8.c
+++ b/kernel/generic/zgemm3m_ncopy_8.c
@@ -57,7 +57,7 @@
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
#ifdef USE_ALPHA
- FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT alpha_r, FLOAT alpha_i,
#endif
FLOAT *b){
@@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset = a;
b_offset = b;
-
+
j = (n >> 3);
if (j > 0){
do{
@@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset7 = a_offset6 + lda;
a_offset8 = a_offset7 + lda;
a_offset += 8 * lda;
-
+
for (i = 0; i < m; i ++) {
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
@@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
*(b_offset + 5) = CMULT(a11, a12);
*(b_offset + 6) = CMULT(a13, a14);
*(b_offset + 7) = CMULT(a15, a16);
-
+
a_offset1 += 2;
a_offset2 += 2;
a_offset3 += 2;
@@ -138,21 +138,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset6 += 2;
a_offset7 += 2;
a_offset8 += 2;
-
+
b_offset += 8;
}
j--;
}while(j > 0);
}
-
+
if (n & 4){
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
for (i = 0; i < m; i ++) {
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
@@ -162,17 +162,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset3 + 1);
a7 = *(a_offset4 + 0);
a8 = *(a_offset4 + 1);
-
+
*(b_offset + 0) = CMULT(a1, a2);
*(b_offset + 1) = CMULT(a3, a4);
*(b_offset + 2) = CMULT(a5, a6);
*(b_offset + 3) = CMULT(a7, a8);
-
+
a_offset1 += 2;
a_offset2 += 2;
a_offset3 += 2;
a_offset4 += 2;
-
+
b_offset += 4;
}
}
@@ -181,30 +181,30 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
for (i = 0; i < m; i ++) {
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
a3 = *(a_offset2 + 0);
a4 = *(a_offset2 + 1);
-
+
*(b_offset + 0) = CMULT(a1, a2);
*(b_offset + 1) = CMULT(a3, a4);
-
+
a_offset1 += 2;
a_offset2 += 2;
-
+
b_offset += 2;
}
}
if (n & 1){
a_offset1 = a_offset;
-
+
for (i = 0; i < m; i ++) {
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset + 0) = CMULT(a1, a2);
a_offset1 += 2;
diff --git a/kernel/generic/zgemm3m_tcopy_1.c b/kernel/generic/zgemm3m_tcopy_1.c
index 47cf7e5..33e8ad6 100644
--- a/kernel/generic/zgemm3m_tcopy_1.c
+++ b/kernel/generic/zgemm3m_tcopy_1.c
@@ -57,7 +57,7 @@
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
#ifdef USE_ALPHA
- FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT alpha_r, FLOAT alpha_i,
#endif
FLOAT *b){
@@ -70,20 +70,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
while (n > 0) {
a_offset = a;
a += 2;
-
+
for (i = 0; i < m; i ++) {
-
+
a1 = *(a_offset + 0);
a2 = *(a_offset + 1);
-
+
*(b + 0) = CMULT(a1, a2);
-
+
a_offset += lda;
-
+
b ++;
}
n --;
}
-
+
return 0;
}
diff --git a/kernel/generic/zgemm3m_tcopy_2.c b/kernel/generic/zgemm3m_tcopy_2.c
index f6fe10b..b8a2626 100644
--- a/kernel/generic/zgemm3m_tcopy_2.c
+++ b/kernel/generic/zgemm3m_tcopy_2.c
@@ -57,7 +57,7 @@
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
#ifdef USE_ALPHA
- FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT alpha_r, FLOAT alpha_i,
#endif
FLOAT *b){
@@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
b_offset1 = b_offset;
b_offset += 4;
@@ -104,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 += 4;
a_offset2 += 4;
-
+
b_offset1 += m * 2;
i --;
}while(i > 0);
@@ -119,7 +119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
*(b_offset2 + 0) = CMULT(a1, a2);
*(b_offset2 + 1) = CMULT(a3, a4);
-
+
b_offset2 += 2;
}
@@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
if (m & 1){
a_offset1 = a_offset;
b_offset1 = b_offset;
-
+
i = (n >> 1);
if (i > 0){
do{
@@ -138,10 +138,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a2 = *(a_offset1 + 1);
a3 = *(a_offset1 + 2);
a4 = *(a_offset1 + 3);
-
+
*(b_offset1 + 0) = CMULT(a1, a2);
*(b_offset1 + 1) = CMULT(a3, a4);
-
+
a_offset1 += 4;
b_offset1 += 2 * m;
@@ -153,10 +153,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
if (n & 1) {
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset2 + 0) = CMULT(a1, a2);
}
}
-
+
return 0;
}
diff --git a/kernel/generic/zgemm3m_tcopy_4.c b/kernel/generic/zgemm3m_tcopy_4.c
index e072262..2c071ff 100644
--- a/kernel/generic/zgemm3m_tcopy_4.c
+++ b/kernel/generic/zgemm3m_tcopy_4.c
@@ -57,7 +57,7 @@
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
#ifdef USE_ALPHA
- FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT alpha_r, FLOAT alpha_i,
#endif
FLOAT *b){
@@ -83,7 +83,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
b_offset1 = b_offset;
b_offset += 16;
@@ -151,7 +151,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset2 += 8;
a_offset3 += 8;
a_offset4 += 8;
-
+
b_offset1 += m * 4;
i --;
}while(i > 0);
@@ -167,12 +167,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset2 + 1);
a7 = *(a_offset2 + 2);
a8 = *(a_offset2 + 3);
-
+
*(b_offset2 + 0) = CMULT(a1, a2);
*(b_offset2 + 1) = CMULT(a3, a4);
*(b_offset2 + 2) = CMULT(a5, a6);
*(b_offset2 + 3) = CMULT(a7, a8);
-
+
a1 = *(a_offset3 + 0);
a2 = *(a_offset3 + 1);
a3 = *(a_offset3 + 2);
@@ -181,17 +181,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset4 + 1);
a7 = *(a_offset4 + 2);
a8 = *(a_offset4 + 3);
-
+
*(b_offset2 + 4) = CMULT(a1, a2);
*(b_offset2 + 5) = CMULT(a3, a4);
*(b_offset2 + 6) = CMULT(a5, a6);
*(b_offset2 + 7) = CMULT(a7, a8);
-
+
a_offset1 += 4;
a_offset2 += 4;
a_offset3 += 4;
a_offset4 += 4;
-
+
b_offset2 += 8;
}
@@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
*(b_offset3 + 1) = CMULT(a3, a4);
*(b_offset3 + 2) = CMULT(a5, a6);
*(b_offset3 + 3) = CMULT(a7, a8);
-
+
b_offset3 += 4;
}
@@ -222,10 +222,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
b_offset1 = b_offset;
b_offset += 8;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -238,12 +238,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset1 + 5);
a7 = *(a_offset1 + 6);
a8 = *(a_offset1 + 7);
-
+
*(b_offset1 + 0) = CMULT(a1, a2);
*(b_offset1 + 1) = CMULT(a3, a4);
*(b_offset1 + 2) = CMULT(a5, a6);
*(b_offset1 + 3) = CMULT(a7, a8);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
a3 = *(a_offset2 + 2);
@@ -252,15 +252,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset2 + 5);
a7 = *(a_offset2 + 6);
a8 = *(a_offset2 + 7);
-
+
*(b_offset1 + 4) = CMULT(a1, a2);
*(b_offset1 + 5) = CMULT(a3, a4);
*(b_offset1 + 6) = CMULT(a5, a6);
*(b_offset1 + 7) = CMULT(a7, a8);
-
+
a_offset1 += 8;
a_offset2 += 8;
-
+
b_offset1 += m * 4;
i --;
}while(i > 0);
@@ -275,23 +275,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset2 + 1);
a7 = *(a_offset2 + 2);
a8 = *(a_offset2 + 3);
-
+
*(b_offset2 + 0) = CMULT(a1, a2);
*(b_offset2 + 1) = CMULT(a3, a4);
*(b_offset2 + 2) = CMULT(a5, a6);
*(b_offset2 + 3) = CMULT(a7, a8);
-
+
a_offset1 += 4;
a_offset2 += 4;
b_offset2 += 4;
}
-
+
if (n & 1) {
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
a3 = *(a_offset2 + 0);
a4 = *(a_offset2 + 1);
-
+
*(b_offset3 + 0) = CMULT(a1, a2);
*(b_offset3 + 1) = CMULT(a3, a4);
@@ -302,7 +302,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
if (m & 1){
a_offset1 = a_offset;
b_offset1 = b_offset;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -314,12 +314,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset1 + 5);
a7 = *(a_offset1 + 6);
a8 = *(a_offset1 + 7);
-
+
*(b_offset1 + 0) = CMULT(a1, a2);
*(b_offset1 + 1) = CMULT(a3, a4);
*(b_offset1 + 2) = CMULT(a5, a6);
*(b_offset1 + 3) = CMULT(a7, a8);
-
+
a_offset1 += 8;
b_offset1 += 4 * m;
@@ -333,17 +333,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a2 = *(a_offset1 + 1);
a3 = *(a_offset1 + 2);
a4 = *(a_offset1 + 3);
-
+
*(b_offset2 + 0) = CMULT(a1, a2);
*(b_offset2 + 1) = CMULT(a3, a4);
-
+
a_offset1 += 4;
}
-
+
if (n & 1) {
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset3 + 0) = CMULT(a1, a2);
}
}
diff --git a/kernel/generic/zgemm3m_tcopy_8.c b/kernel/generic/zgemm3m_tcopy_8.c
index e68bccf..fddbdd8 100644
--- a/kernel/generic/zgemm3m_tcopy_8.c
+++ b/kernel/generic/zgemm3m_tcopy_8.c
@@ -57,7 +57,7 @@
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
#ifdef USE_ALPHA
- FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT alpha_r, FLOAT alpha_i,
#endif
FLOAT *b){
@@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset8 = a_offset7 + lda;
a_offset += 8 * lda;
-
+
b_offset1 = b_offset;
b_offset += 64;
@@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset1 + 13);
a15 = *(a_offset1 + 14);
a16 = *(a_offset1 + 15);
-
+
*(b_offset1 + 0) = CMULT(a1, a2);
*(b_offset1 + 1) = CMULT(a3, a4);
*(b_offset1 + 2) = CMULT(a5, a6);
@@ -156,7 +156,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset2 + 13);
a15 = *(a_offset2 + 14);
a16 = *(a_offset2 + 15);
-
+
*(b_offset1 + 8) = CMULT(a1, a2);
*(b_offset1 + 9) = CMULT(a3, a4);
*(b_offset1 + 10) = CMULT(a5, a6);
@@ -182,7 +182,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset3 + 13);
a15 = *(a_offset3 + 14);
a16 = *(a_offset3 + 15);
-
+
*(b_offset1 + 16) = CMULT(a1, a2);
*(b_offset1 + 17) = CMULT(a3, a4);
*(b_offset1 + 18) = CMULT(a5, a6);
@@ -208,7 +208,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset4 + 13);
a15 = *(a_offset4 + 14);
a16 = *(a_offset4 + 15);
-
+
*(b_offset1 + 24) = CMULT(a1, a2);
*(b_offset1 + 25) = CMULT(a3, a4);
*(b_offset1 + 26) = CMULT(a5, a6);
@@ -234,7 +234,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset5 + 13);
a15 = *(a_offset5 + 14);
a16 = *(a_offset5 + 15);
-
+
*(b_offset1 + 32) = CMULT(a1, a2);
*(b_offset1 + 33) = CMULT(a3, a4);
*(b_offset1 + 34) = CMULT(a5, a6);
@@ -260,7 +260,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset6 + 13);
a15 = *(a_offset6 + 14);
a16 = *(a_offset6 + 15);
-
+
*(b_offset1 + 40) = CMULT(a1, a2);
*(b_offset1 + 41) = CMULT(a3, a4);
*(b_offset1 + 42) = CMULT(a5, a6);
@@ -286,7 +286,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset7 + 13);
a15 = *(a_offset7 + 14);
a16 = *(a_offset7 + 15);
-
+
*(b_offset1 + 48) = CMULT(a1, a2);
*(b_offset1 + 49) = CMULT(a3, a4);
*(b_offset1 + 50) = CMULT(a5, a6);
@@ -312,7 +312,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset8 + 13);
a15 = *(a_offset8 + 14);
a16 = *(a_offset8 + 15);
-
+
*(b_offset1 + 56) = CMULT(a1, a2);
*(b_offset1 + 57) = CMULT(a3, a4);
*(b_offset1 + 58) = CMULT(a5, a6);
@@ -330,7 +330,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset6 += 16;
a_offset7 += 16;
a_offset8 += 16;
-
+
b_offset1 += m * 8;
i --;
}while(i > 0);
@@ -345,12 +345,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset1 + 5);
a7 = *(a_offset1 + 6);
a8 = *(a_offset1 + 7);
-
+
*(b_offset2 + 0) = CMULT(a1, a2);
*(b_offset2 + 1) = CMULT(a3, a4);
*(b_offset2 + 2) = CMULT(a5, a6);
*(b_offset2 + 3) = CMULT(a7, a8);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
a3 = *(a_offset2 + 2);
@@ -359,7 +359,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset2 + 5);
a7 = *(a_offset2 + 6);
a8 = *(a_offset2 + 7);
-
+
*(b_offset2 + 4) = CMULT(a1, a2);
*(b_offset2 + 5) = CMULT(a3, a4);
*(b_offset2 + 6) = CMULT(a5, a6);
@@ -373,12 +373,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset3 + 5);
a7 = *(a_offset3 + 6);
a8 = *(a_offset3 + 7);
-
+
*(b_offset2 + 8) = CMULT(a1, a2);
*(b_offset2 + 9) = CMULT(a3, a4);
*(b_offset2 + 10) = CMULT(a5, a6);
*(b_offset2 + 11) = CMULT(a7, a8);
-
+
a1 = *(a_offset4 + 0);
a2 = *(a_offset4 + 1);
a3 = *(a_offset4 + 2);
@@ -387,12 +387,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset4 + 5);
a7 = *(a_offset4 + 6);
a8 = *(a_offset4 + 7);
-
+
*(b_offset2 + 12) = CMULT(a1, a2);
*(b_offset2 + 13) = CMULT(a3, a4);
*(b_offset2 + 14) = CMULT(a5, a6);
*(b_offset2 + 15) = CMULT(a7, a8);
-
+
a1 = *(a_offset5 + 0);
a2 = *(a_offset5 + 1);
a3 = *(a_offset5 + 2);
@@ -401,12 +401,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset5 + 5);
a7 = *(a_offset5 + 6);
a8 = *(a_offset5 + 7);
-
+
*(b_offset2 + 16) = CMULT(a1, a2);
*(b_offset2 + 17) = CMULT(a3, a4);
*(b_offset2 + 18) = CMULT(a5, a6);
*(b_offset2 + 19) = CMULT(a7, a8);
-
+
a1 = *(a_offset6 + 0);
a2 = *(a_offset6 + 1);
a3 = *(a_offset6 + 2);
@@ -415,12 +415,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset6 + 5);
a7 = *(a_offset6 + 6);
a8 = *(a_offset6 + 7);
-
+
*(b_offset2 + 20) = CMULT(a1, a2);
*(b_offset2 + 21) = CMULT(a3, a4);
*(b_offset2 + 22) = CMULT(a5, a6);
*(b_offset2 + 23) = CMULT(a7, a8);
-
+
a1 = *(a_offset7 + 0);
a2 = *(a_offset7 + 1);
a3 = *(a_offset7 + 2);
@@ -429,12 +429,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset7 + 5);
a7 = *(a_offset7 + 6);
a8 = *(a_offset7 + 7);
-
+
*(b_offset2 + 24) = CMULT(a1, a2);
*(b_offset2 + 25) = CMULT(a3, a4);
*(b_offset2 + 26) = CMULT(a5, a6);
*(b_offset2 + 27) = CMULT(a7, a8);
-
+
a1 = *(a_offset8 + 0);
a2 = *(a_offset8 + 1);
a3 = *(a_offset8 + 2);
@@ -443,12 +443,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset8 + 5);
a7 = *(a_offset8 + 6);
a8 = *(a_offset8 + 7);
-
+
*(b_offset2 + 28) = CMULT(a1, a2);
*(b_offset2 + 29) = CMULT(a3, a4);
*(b_offset2 + 30) = CMULT(a5, a6);
*(b_offset2 + 31) = CMULT(a7, a8);
-
+
a_offset1 += 8;
a_offset2 += 8;
a_offset3 += 8;
@@ -457,7 +457,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset6 += 8;
a_offset7 += 8;
a_offset8 += 8;
-
+
b_offset2 += 32;
}
@@ -466,15 +466,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a2 = *(a_offset1 + 1);
a3 = *(a_offset1 + 2);
a4 = *(a_offset1 + 3);
-
+
*(b_offset3 + 0) = CMULT(a1, a2);
*(b_offset3 + 1) = CMULT(a3, a4);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
a3 = *(a_offset2 + 2);
a4 = *(a_offset2 + 3);
-
+
*(b_offset3 + 2) = CMULT(a1, a2);
*(b_offset3 + 3) = CMULT(a3, a4);
@@ -482,50 +482,50 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a2 = *(a_offset3 + 1);
a3 = *(a_offset3 + 2);
a4 = *(a_offset3 + 3);
-
+
*(b_offset3 + 4) = CMULT(a1, a2);
*(b_offset3 + 5) = CMULT(a3, a4);
-
+
a1 = *(a_offset4 + 0);
a2 = *(a_offset4 + 1);
a3 = *(a_offset4 + 2);
a4 = *(a_offset4 + 3);
-
+
*(b_offset3 + 6) = CMULT(a1, a2);
*(b_offset3 + 7) = CMULT(a3, a4);
-
+
a1 = *(a_offset5 + 0);
a2 = *(a_offset5 + 1);
a3 = *(a_offset5 + 2);
a4 = *(a_offset5 + 3);
-
+
*(b_offset3 + 8) = CMULT(a1, a2);
*(b_offset3 + 9) = CMULT(a3, a4);
-
+
a1 = *(a_offset6 + 0);
a2 = *(a_offset6 + 1);
a3 = *(a_offset6 + 2);
a4 = *(a_offset6 + 3);
-
+
*(b_offset3 + 10) = CMULT(a1, a2);
*(b_offset3 + 11) = CMULT(a3, a4);
-
+
a1 = *(a_offset7 + 0);
a2 = *(a_offset7 + 1);
a3 = *(a_offset7 + 2);
a4 = *(a_offset7 + 3);
-
+
*(b_offset3 + 12) = CMULT(a1, a2);
*(b_offset3 + 13) = CMULT(a3, a4);
-
+
a1 = *(a_offset8 + 0);
a2 = *(a_offset8 + 1);
a3 = *(a_offset8 + 2);
a4 = *(a_offset8 + 3);
-
+
*(b_offset3 + 14) = CMULT(a1, a2);
*(b_offset3 + 15) = CMULT(a3, a4);
-
+
a_offset1 += 4;
a_offset2 += 4;
a_offset3 += 4;
@@ -534,49 +534,49 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset6 += 4;
a_offset7 += 4;
a_offset8 += 4;
-
+
b_offset3 += 16;
}
if (n & 1){
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset4 + 0) = CMULT(a1, a2);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
-
+
*(b_offset4 + 1) = CMULT(a1, a2);
a1 = *(a_offset3 + 0);
a2 = *(a_offset3 + 1);
-
+
*(b_offset4 + 2) = CMULT(a1, a2);
-
+
a1 = *(a_offset4 + 0);
a2 = *(a_offset4 + 1);
-
+
*(b_offset4 + 3) = CMULT(a1, a2);
-
+
a1 = *(a_offset5 + 0);
a2 = *(a_offset5 + 1);
-
+
*(b_offset4 + 4) = CMULT(a1, a2);
-
+
a1 = *(a_offset6 + 0);
a2 = *(a_offset6 + 1);
-
+
*(b_offset4 + 5) = CMULT(a1, a2);
-
+
a1 = *(a_offset7 + 0);
a2 = *(a_offset7 + 1);
-
+
*(b_offset4 + 6) = CMULT(a1, a2);
-
+
a1 = *(a_offset8 + 0);
a2 = *(a_offset8 + 1);
-
+
*(b_offset4 + 7) = CMULT(a1, a2);
b_offset4 += 8;
@@ -592,7 +592,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
-
+
b_offset1 = b_offset;
b_offset += 32;
@@ -615,7 +615,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset1 + 13);
a15 = *(a_offset1 + 14);
a16 = *(a_offset1 + 15);
-
+
*(b_offset1 + 0) = CMULT(a1, a2);
*(b_offset1 + 1) = CMULT(a3, a4);
*(b_offset1 + 2) = CMULT(a5, a6);
@@ -641,7 +641,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset2 + 13);
a15 = *(a_offset2 + 14);
a16 = *(a_offset2 + 15);
-
+
*(b_offset1 + 8) = CMULT(a1, a2);
*(b_offset1 + 9) = CMULT(a3, a4);
*(b_offset1 + 10) = CMULT(a5, a6);
@@ -667,7 +667,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset3 + 13);
a15 = *(a_offset3 + 14);
a16 = *(a_offset3 + 15);
-
+
*(b_offset1 + 16) = CMULT(a1, a2);
*(b_offset1 + 17) = CMULT(a3, a4);
*(b_offset1 + 18) = CMULT(a5, a6);
@@ -693,7 +693,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset4 + 13);
a15 = *(a_offset4 + 14);
a16 = *(a_offset4 + 15);
-
+
*(b_offset1 + 24) = CMULT(a1, a2);
*(b_offset1 + 25) = CMULT(a3, a4);
*(b_offset1 + 26) = CMULT(a5, a6);
@@ -707,7 +707,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset2 += 16;
a_offset3 += 16;
a_offset4 += 16;
-
+
b_offset1 += m * 8;
i --;
}while(i > 0);
@@ -722,12 +722,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset1 + 5);
a7 = *(a_offset1 + 6);
a8 = *(a_offset1 + 7);
-
+
*(b_offset2 + 0) = CMULT(a1, a2);
*(b_offset2 + 1) = CMULT(a3, a4);
*(b_offset2 + 2) = CMULT(a5, a6);
*(b_offset2 + 3) = CMULT(a7, a8);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
a3 = *(a_offset2 + 2);
@@ -736,7 +736,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset2 + 5);
a7 = *(a_offset2 + 6);
a8 = *(a_offset2 + 7);
-
+
*(b_offset2 + 4) = CMULT(a1, a2);
*(b_offset2 + 5) = CMULT(a3, a4);
*(b_offset2 + 6) = CMULT(a5, a6);
@@ -750,12 +750,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset3 + 5);
a7 = *(a_offset3 + 6);
a8 = *(a_offset3 + 7);
-
+
*(b_offset2 + 8) = CMULT(a1, a2);
*(b_offset2 + 9) = CMULT(a3, a4);
*(b_offset2 + 10) = CMULT(a5, a6);
*(b_offset2 + 11) = CMULT(a7, a8);
-
+
a1 = *(a_offset4 + 0);
a2 = *(a_offset4 + 1);
a3 = *(a_offset4 + 2);
@@ -764,17 +764,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset4 + 5);
a7 = *(a_offset4 + 6);
a8 = *(a_offset4 + 7);
-
+
*(b_offset2 + 12) = CMULT(a1, a2);
*(b_offset2 + 13) = CMULT(a3, a4);
*(b_offset2 + 14) = CMULT(a5, a6);
*(b_offset2 + 15) = CMULT(a7, a8);
-
+
a_offset1 += 8;
a_offset2 += 8;
a_offset3 += 8;
a_offset4 += 8;
-
+
b_offset2 += 16;
}
@@ -783,15 +783,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a2 = *(a_offset1 + 1);
a3 = *(a_offset1 + 2);
a4 = *(a_offset1 + 3);
-
+
*(b_offset3 + 0) = CMULT(a1, a2);
*(b_offset3 + 1) = CMULT(a3, a4);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
a3 = *(a_offset2 + 2);
a4 = *(a_offset2 + 3);
-
+
*(b_offset3 + 2) = CMULT(a1, a2);
*(b_offset3 + 3) = CMULT(a3, a4);
@@ -799,45 +799,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a2 = *(a_offset3 + 1);
a3 = *(a_offset3 + 2);
a4 = *(a_offset3 + 3);
-
+
*(b_offset3 + 4) = CMULT(a1, a2);
*(b_offset3 + 5) = CMULT(a3, a4);
-
+
a1 = *(a_offset4 + 0);
a2 = *(a_offset4 + 1);
a3 = *(a_offset4 + 2);
a4 = *(a_offset4 + 3);
-
+
*(b_offset3 + 6) = CMULT(a1, a2);
*(b_offset3 + 7) = CMULT(a3, a4);
-
+
a_offset1 += 4;
a_offset2 += 4;
a_offset3 += 4;
a_offset4 += 4;
-
+
b_offset3 += 8;
}
if (n & 1){
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset4 + 0) = CMULT(a1, a2);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
-
+
*(b_offset4 + 1) = CMULT(a1, a2);
a1 = *(a_offset3 + 0);
a2 = *(a_offset3 + 1);
-
+
*(b_offset4 + 2) = CMULT(a1, a2);
-
+
a1 = *(a_offset4 + 0);
a2 = *(a_offset4 + 1);
-
+
*(b_offset4 + 3) = CMULT(a1, a2);
b_offset4 += 4;
@@ -848,7 +848,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
-
+
b_offset1 = b_offset;
b_offset += 16;
@@ -871,7 +871,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset1 + 13);
a15 = *(a_offset1 + 14);
a16 = *(a_offset1 + 15);
-
+
*(b_offset1 + 0) = CMULT(a1, a2);
*(b_offset1 + 1) = CMULT(a3, a4);
*(b_offset1 + 2) = CMULT(a5, a6);
@@ -897,7 +897,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset2 + 13);
a15 = *(a_offset2 + 14);
a16 = *(a_offset2 + 15);
-
+
*(b_offset1 + 8) = CMULT(a1, a2);
*(b_offset1 + 9) = CMULT(a3, a4);
*(b_offset1 + 10) = CMULT(a5, a6);
@@ -909,7 +909,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 += 16;
a_offset2 += 16;
-
+
b_offset1 += m * 8;
i --;
}while(i > 0);
@@ -924,12 +924,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset1 + 5);
a7 = *(a_offset1 + 6);
a8 = *(a_offset1 + 7);
-
+
*(b_offset2 + 0) = CMULT(a1, a2);
*(b_offset2 + 1) = CMULT(a3, a4);
*(b_offset2 + 2) = CMULT(a5, a6);
*(b_offset2 + 3) = CMULT(a7, a8);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
a3 = *(a_offset2 + 2);
@@ -938,7 +938,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset2 + 5);
a7 = *(a_offset2 + 6);
a8 = *(a_offset2 + 7);
-
+
*(b_offset2 + 4) = CMULT(a1, a2);
*(b_offset2 + 5) = CMULT(a3, a4);
*(b_offset2 + 6) = CMULT(a5, a6);
@@ -946,7 +946,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a_offset1 += 8;
a_offset2 += 8;
-
+
b_offset2 += 8;
}
@@ -955,18 +955,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a2 = *(a_offset1 + 1);
a3 = *(a_offset1 + 2);
a4 = *(a_offset1 + 3);
-
+
*(b_offset3 + 0) = CMULT(a1, a2);
*(b_offset3 + 1) = CMULT(a3, a4);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
a3 = *(a_offset2 + 2);
a4 = *(a_offset2 + 3);
-
+
*(b_offset3 + 2) = CMULT(a1, a2);
*(b_offset3 + 3) = CMULT(a3, a4);
-
+
a_offset1 += 4;
a_offset2 += 4;
@@ -976,12 +976,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
if (n & 1){
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset4 + 0) = CMULT(a1, a2);
-
+
a1 = *(a_offset2 + 0);
a2 = *(a_offset2 + 1);
-
+
*(b_offset4 + 1) = CMULT(a1, a2);
b_offset4 += 2;
@@ -1011,7 +1011,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a14 = *(a_offset1 + 13);
a15 = *(a_offset1 + 14);
a16 = *(a_offset1 + 15);
-
+
*(b_offset1 + 0) = CMULT(a1, a2);
*(b_offset1 + 1) = CMULT(a3, a4);
*(b_offset1 + 2) = CMULT(a5, a6);
@@ -1022,7 +1022,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
*(b_offset1 + 7) = CMULT(a15, a16);
a_offset1 += 16;
-
+
b_offset1 += m * 8;
i --;
}while(i > 0);
@@ -1037,7 +1037,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a6 = *(a_offset1 + 5);
a7 = *(a_offset1 + 6);
a8 = *(a_offset1 + 7);
-
+
*(b_offset2 + 0) = CMULT(a1, a2);
*(b_offset2 + 1) = CMULT(a3, a4);
*(b_offset2 + 2) = CMULT(a5, a6);
@@ -1052,7 +1052,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
a2 = *(a_offset1 + 1);
a3 = *(a_offset1 + 2);
a4 = *(a_offset1 + 3);
-
+
*(b_offset3 + 0) = CMULT(a1, a2);
*(b_offset3 + 1) = CMULT(a3, a4);
@@ -1063,10 +1063,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
if (n & 1){
a1 = *(a_offset1 + 0);
a2 = *(a_offset1 + 1);
-
+
*(b_offset4 + 0) = CMULT(a1, a2);
}
}
-
+
return 0;
}
diff --git a/kernel/generic/zgemm_beta.c b/kernel/generic/zgemm_beta.c
index b7a77a2..7954e22 100644
--- a/kernel/generic/zgemm_beta.c
+++ b/kernel/generic/zgemm_beta.c
@@ -41,7 +41,7 @@
#include <ctype.h>
#include "common.h"
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
FLOAT beta_r, FLOAT beta_i,
FLOAT *dummy2, BLASLONG dummy3,
FLOAT *dummy4, BLASLONG dummy5,
@@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
do {
c_offset1 = c_offset;
c_offset += ldc;
-
+
i = (m >> 1);
if (i > 0){
do {
@@ -105,12 +105,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
atemp2 = *(c_offset1 + 1);
atemp3 = *(c_offset1 + 2);
atemp4 = *(c_offset1 + 3);
-
+
btemp1 = beta_r * atemp1;
btemp2 = beta_i * atemp2;
btemp3 = beta_r * atemp2;
btemp4 = beta_i * atemp1;
-
+
ctemp1 = btemp1 - btemp2;
ctemp2 = btemp3 + btemp4;
@@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
btemp2 = beta_i * atemp4;
btemp3 = beta_r * atemp4;
btemp4 = beta_i * atemp3;
-
+
ctemp3 = btemp1 - btemp2;
ctemp4 = btemp3 + btemp4;
@@ -136,15 +136,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
do {
atemp1 = *(c_offset1 + 0);
atemp2 = *(c_offset1 + 1);
-
+
btemp1 = beta_r * atemp1;
btemp2 = beta_i * atemp2;
btemp3 = beta_r * atemp2;
btemp4 = beta_i * atemp1;
-
+
ctemp1 = btemp1 - btemp2;
ctemp2 = btemp3 + btemp4;
-
+
*(c_offset1 + 0) = ctemp1;
*(c_offset1 + 1) = ctemp2;
c_offset1 += 2;
diff --git a/kernel/generic/zgemm_ncopy_1.c b/kernel/generic/zgemm_ncopy_1.c
index 6679a33..bc2b897 100644
--- a/kernel/generic/zgemm_ncopy_1.c
+++ b/kernel/generic/zgemm_ncopy_1.c
@@ -49,14 +49,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset = a;
b_offset = b;
-
+
lda *= 2;
i = n;
-
+
if (i > 0){
do {
-
+
j = (m >> 2);
if (j > 0){
do{
@@ -64,28 +64,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset + 1);
ctemp3 = *(a_offset + 2);
ctemp4 = *(a_offset + 3);
-
+
ctemp5 = *(a_offset + 4);
ctemp6 = *(a_offset + 5);
ctemp7 = *(a_offset + 6);
ctemp8 = *(a_offset + 7);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp2;
*(b_offset + 2) = ctemp3;
*(b_offset + 3) = ctemp4;
-
+
*(b_offset + 4) = ctemp5;
*(b_offset + 5) = ctemp6;
*(b_offset + 6) = ctemp7;
*(b_offset + 7) = ctemp8;
-
+
a_offset += 8;
b_offset += 8;
j --;
} while(j>0);
}
-
+
j = (m & 3);
if (j > 0){
do{
diff --git a/kernel/generic/zgemm_ncopy_2.c b/kernel/generic/zgemm_ncopy_2.c
index 2d5f255..402d6e3 100644
--- a/kernel/generic/zgemm_ncopy_2.c
+++ b/kernel/generic/zgemm_ncopy_2.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset = a;
b_offset = b;
-
+
lda *= 2;
i = (n >> 1);
@@ -69,42 +69,42 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset2 + 0);
ctemp4 = *(a_offset2 + 1);
-
+
ctemp5 = *(a_offset1 + 2);
ctemp6 = *(a_offset1 + 3);
ctemp7 = *(a_offset2 + 2);
ctemp8 = *(a_offset2 + 3);
-
+
ctemp9 = *(a_offset1 + 4);
ctemp10 = *(a_offset1 + 5);
ctemp11 = *(a_offset2 + 4);
ctemp12 = *(a_offset2 + 5);
-
+
ctemp13 = *(a_offset1 + 6);
ctemp14 = *(a_offset1 + 7);
ctemp15 = *(a_offset2 + 6);
ctemp16 = *(a_offset2 + 7);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp2;
*(b_offset + 2) = ctemp3;
*(b_offset + 3) = ctemp4;
-
+
*(b_offset + 4) = ctemp5;
*(b_offset + 5) = ctemp6;
*(b_offset + 6) = ctemp7;
*(b_offset + 7) = ctemp8;
-
+
*(b_offset + 8) = ctemp9;
*(b_offset + 9) = ctemp10;
*(b_offset +10) = ctemp11;
*(b_offset +11) = ctemp12;
-
+
*(b_offset +12) = ctemp13;
*(b_offset +13) = ctemp14;
*(b_offset +14) = ctemp15;
*(b_offset +15) = ctemp16;
-
+
a_offset1 += 8;
a_offset2 += 8;
b_offset += 16;
@@ -119,12 +119,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset2 + 0);
ctemp4 = *(a_offset2 + 1);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp2;
*(b_offset + 2) = ctemp3;
*(b_offset + 3) = ctemp4;
-
+
a_offset1 += 2;
a_offset2 += 2;
b_offset += 4;
@@ -134,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
i --;
} while(i>0);
}
-
+
if (n & 1){
j = (m >> 2);
if (j > 0){
@@ -143,22 +143,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset + 1);
ctemp5 = *(a_offset + 2);
ctemp6 = *(a_offset + 3);
-
+
ctemp9 = *(a_offset + 4);
ctemp10 = *(a_offset + 5);
ctemp13 = *(a_offset + 6);
ctemp14 = *(a_offset + 7);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp2;
*(b_offset + 2) = ctemp5;
*(b_offset + 3) = ctemp6;
-
+
*(b_offset + 4) = ctemp9;
*(b_offset + 5) = ctemp10;
*(b_offset + 6) = ctemp13;
*(b_offset + 7) = ctemp14;
-
+
a_offset += 8;
b_offset += 8;
j --;
diff --git a/kernel/generic/zgemm_ncopy_4.c b/kernel/generic/zgemm_ncopy_4.c
index abd1d57..0c2959b 100644
--- a/kernel/generic/zgemm_ncopy_4.c
+++ b/kernel/generic/zgemm_ncopy_4.c
@@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset = a;
boffset = b;
lda *= 2;
-
+
#if 0
fprintf(stderr, "m = %d n = %d\n", m,n );
#endif
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset2 + 5);
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
-
+
ctemp17 = *(aoffset3 + 0);
ctemp18 = *(aoffset3 + 1);
ctemp19 = *(aoffset3 + 2);
@@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp22 = *(aoffset3 + 5);
ctemp23 = *(aoffset3 + 6);
ctemp24 = *(aoffset3 + 7);
-
+
ctemp25 = *(aoffset4 + 0);
ctemp26 = *(aoffset4 + 1);
ctemp27 = *(aoffset4 + 2);
@@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp18;
*(boffset + 6) = ctemp25;
*(boffset + 7) = ctemp26;
-
+
*(boffset + 8) = ctemp03;
*(boffset + 9) = ctemp04;
*(boffset + 10) = ctemp11;
@@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp20;
*(boffset + 14) = ctemp27;
*(boffset + 15) = ctemp28;
-
+
*(boffset + 16) = ctemp05;
*(boffset + 17) = ctemp06;
*(boffset + 18) = ctemp13;
@@ -162,22 +162,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
-
+
ctemp09 = *(aoffset3 + 0);
ctemp10 = *(aoffset3 + 1);
ctemp11 = *(aoffset3 + 2);
ctemp12 = *(aoffset3 + 3);
-
+
ctemp13 = *(aoffset4 + 0);
ctemp14 = *(aoffset4 + 1);
ctemp15 = *(aoffset4 + 2);
ctemp16 = *(aoffset4 + 3);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp05;
@@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp10;
*(boffset + 6) = ctemp13;
*(boffset + 7) = ctemp14;
-
+
*(boffset + 8) = ctemp03;
*(boffset + 9) = ctemp04;
*(boffset + 10) = ctemp07;
@@ -195,27 +195,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp12;
*(boffset + 14) = ctemp15;
*(boffset + 15) = ctemp16;
-
+
aoffset1 += 4;
aoffset2 += 4;
aoffset3 += 4;
aoffset4 += 4;
boffset += 16;
}
-
+
if (m & 1) {
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
-
+
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
-
+
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -224,7 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
aoffset1 += 2;
aoffset2 += 2;
aoffset3 += 2;
@@ -234,12 +234,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
j--;
}while(j > 0);
} /* end of if(j > 0) */
-
+
if (n & 2){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -251,7 +251,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -260,7 +260,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset2 + 5);
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp09;
@@ -269,7 +269,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp04;
*(boffset + 6) = ctemp11;
*(boffset + 7) = ctemp12;
-
+
*(boffset + 8) = ctemp05;
*(boffset + 9) = ctemp06;
*(boffset + 10) = ctemp13;
@@ -278,25 +278,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp08;
*(boffset + 14) = ctemp15;
*(boffset + 15) = ctemp16;
-
+
aoffset1 += 8;
aoffset2 += 8;
boffset += 16;
i --;
}while(i > 0);
}
-
+
if (m & 2) {
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp05;
@@ -305,33 +305,33 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp04;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
aoffset1 += 4;
aoffset2 += 4;
boffset += 8;
}
-
+
if (m & 1) {
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
-
+
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
*(boffset + 3) = ctemp04;
-
+
aoffset1 += 2;
aoffset2 += 2;
boffset += 4;
}
}
-
+
if (n & 1){
aoffset1 = aoffset;
-
+
i = (m >> 2);
if (i > 0){
do{
@@ -343,7 +343,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -352,36 +352,36 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
aoffset1 += 8;
boffset += 8;
i --;
}while(i > 0);
}
-
+
if (m & 2) {
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
*(boffset + 3) = ctemp04;
-
+
aoffset1 += 4;
boffset += 4;
}
-
+
if (m & 1) {
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
}
}
-
+
return 0;
}
diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c
index 839bd59..404a3cd 100644
--- a/kernel/generic/zgemm_ncopy_4_sandy.c
+++ b/kernel/generic/zgemm_ncopy_4_sandy.c
@@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
-3. Neither the name of the ISCAS nor the names of its contributors may
-be used to endorse or promote products derived from this software
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -33,13 +33,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h>
#include "common.h"
-int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
+int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
BLASLONG i,j;
BLASLONG idx=0;
BLASLONG ii;
FLOAT *src0,*src1,*src2,*src3,*dest0;
- for (j=0; j<col/4; j+=1)
+ for (j=0; j<col/4; j+=1)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -49,7 +49,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (row<<3);
dest = dest+ii;
- for (i=0; i<row/4; i+=1)
+ for (i=0; i<row/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -90,7 +90,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (4<<3);
dest0 = dest0+ii;
}
- if (row&2)
+ if (row&2)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -115,7 +115,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (2<<3);
dest0 = dest0+ii;
}
- if (row&1)
+ if (row&1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -133,7 +133,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest0+ii;
}
}
- if (col&2)
+ if (col&2)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -141,7 +141,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (row<<2);
dest = dest+ii;
- for (i=0; i<row/4; i+=1)
+ for (i=0; i<row/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -164,7 +164,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (4<<2);
dest0 = dest0+ii;
}
- if (row&2)
+ if (row&2)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -179,7 +179,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (2<<2);
dest0 = dest0+ii;
}
- if (row&1)
+ if (row&1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -191,14 +191,14 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest0+ii;
}
}
- if (col&1)
+ if (col&1)
{
src0 = src;
src = src0+2*srcdim;
dest0 = dest;
ii = (row<<1);
dest = dest+ii;
- for (i=0; i<row/4; i+=1)
+ for (i=0; i<row/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -212,7 +212,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (4<<1);
dest0 = dest0+ii;
}
- if (row&2)
+ if (row&2)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -222,7 +222,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (2<<1);
dest0 = dest0+ii;
}
- if (row&1)
+ if (row&1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
diff --git a/kernel/generic/zgemm_ncopy_8.c b/kernel/generic/zgemm_ncopy_8.c
index 6490285..5ef1470 100644
--- a/kernel/generic/zgemm_ncopy_8.c
+++ b/kernel/generic/zgemm_ncopy_8.c
@@ -55,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset = a;
boffset = b;
lda *= 2;
-
+
j = (n >> 3);
if (j > 0){
do{
@@ -68,7 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
-
+
i = m;
if (i > 0){
do{
@@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset7 + 1);
ctemp15 = *(aoffset8 + 0);
ctemp16 = *(aoffset8 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -114,7 +114,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset6 += 2;
aoffset7 += 2;
aoffset8 += 2;
-
+
boffset += 16;
i --;
}while(i > 0);
@@ -122,14 +122,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
j--;
}while(j > 0);
} /* end of if(j > 0) */
-
+
if (n & 4){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
-
+
i = m;
if (i > 0){
do{
@@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset3 + 1);
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -166,7 +166,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
-
+
i = m;
if (i > 0){
do{
@@ -174,24 +174,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
*(boffset + 3) = ctemp04;
-
+
aoffset1 += 2;
aoffset2 += 2;
boffset += 4;
i --;
}while(i > 0);
}
-
+
} /* end of if(j > 0) */
if (n & 1){
aoffset1 = aoffset;
-
+
i = m;
if (i > 0){
do{
@@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
i --;
}while(i > 0);
}
-
+
} /* end of if(j > 0) */
return 0;
diff --git a/kernel/generic/zgemm_ncopy_8_sandy.c b/kernel/generic/zgemm_ncopy_8_sandy.c
index ed580a1..6e8e894 100644
--- a/kernel/generic/zgemm_ncopy_8_sandy.c
+++ b/kernel/generic/zgemm_ncopy_8_sandy.c
@@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
-3. Neither the name of the ISCAS nor the names of its contributors may
-be used to endorse or promote products derived from this software
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -33,13 +33,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h>
#include "common.h"
-int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
+int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
BLASLONG i,j;
BLASLONG idx=0;
BLASLONG ii;
FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0;
- for (j=0; j<col/8; j+=1)
+ for (j=0; j<col/8; j+=1)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -53,7 +53,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (row<<4);
dest = dest+ii;
- for (i=0; i<row/4; i+=1)
+ for (i=0; i<row/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -130,7 +130,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (4<<4);
dest0 = dest0+ii;
}
- if (row&2)
+ if (row&2)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -175,7 +175,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (2<<4);
dest0 = dest0+ii;
}
- if (row&1)
+ if (row&1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -205,7 +205,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest0+ii;
}
}
- if (col&4)
+ if (col&4)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -215,7 +215,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (row<<3);
dest = dest+ii;
- for (i=0; i<row/4; i+=1)
+ for (i=0; i<row/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -256,7 +256,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (4<<3);
dest0 = dest0+ii;
}
- if (row&2)
+ if (row&2)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -281,7 +281,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (2<<3);
dest0 = dest0+ii;
}
- if (row&1)
+ if (row&1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -299,7 +299,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest0+ii;
}
}
- if (col&2)
+ if (col&2)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -307,7 +307,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (row<<2);
dest = dest+ii;
- for (i=0; i<row/4; i+=1)
+ for (i=0; i<row/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -330,7 +330,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (4<<2);
dest0 = dest0+ii;
}
- if (row&2)
+ if (row&2)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -345,7 +345,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (2<<2);
dest0 = dest0+ii;
}
- if (row&1)
+ if (row&1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -357,14 +357,14 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest0+ii;
}
}
- if (col&1)
+ if (col&1)
{
src0 = src;
src = src0+2*srcdim;
dest0 = dest;
ii = (row<<1);
dest = dest+ii;
- for (i=0; i<row/4; i+=1)
+ for (i=0; i<row/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -378,7 +378,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (4<<1);
dest0 = dest0+ii;
}
- if (row&2)
+ if (row&2)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -388,7 +388,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (2<<1);
dest0 = dest0+ii;
}
- if (row&1)
+ if (row&1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
diff --git a/kernel/generic/zgemm_tcopy_1.c b/kernel/generic/zgemm_tcopy_1.c
index 03dfcc7..ae05339 100644
--- a/kernel/generic/zgemm_tcopy_1.c
+++ b/kernel/generic/zgemm_tcopy_1.c
@@ -49,18 +49,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset = a;
b_offset = b;
-
+
lda *= 2;
j = m;
m *= 2;
-
+
if (j > 0){
do {
b_offset1 = b_offset;
b_offset += 2;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -68,45 +68,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset + 1);
ctemp3 = *(a_offset + 2);
ctemp4 = *(a_offset + 3);
-
+
ctemp5 = *(a_offset + 4);
ctemp6 = *(a_offset + 5);
ctemp7 = *(a_offset + 6);
ctemp8 = *(a_offset + 7);
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
-
+
b_offset1 += m;
-
+
*(b_offset1 + 0) = ctemp3;
*(b_offset1 + 1) = ctemp4;
-
+
b_offset1 += m;
-
+
*(b_offset1 + 0) = ctemp5;
*(b_offset1 + 1) = ctemp6;
b_offset1 += m;
-
+
*(b_offset1 + 0) = ctemp7;
*(b_offset1 + 1) = ctemp8;
-
+
b_offset1 += m;
a_offset += 8;
i --;
} while(i>0);
}
-
+
i = (n & 3);
if (i > 0){
do {
ctemp1 = *(a_offset + 0);
ctemp2 = *(a_offset + 1);
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
-
+
b_offset1 += m;
a_offset += 2;
i --;
diff --git a/kernel/generic/zgemm_tcopy_2.c b/kernel/generic/zgemm_tcopy_2.c
index 75aff7f..70e202b 100644
--- a/kernel/generic/zgemm_tcopy_2.c
+++ b/kernel/generic/zgemm_tcopy_2.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset = a;
b_offset = b;
-
+
b_offset2 = b + m * (n & ~1) * 2;
lda *= 2;
@@ -73,46 +73,46 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset1 + 4);
ctemp6 = *(a_offset1 + 5);
ctemp7 = *(a_offset1 + 6);
ctemp8 = *(a_offset1 + 7);
-
+
ctemp9 = *(a_offset2 + 0);
ctemp10 = *(a_offset2 + 1);
ctemp11 = *(a_offset2 + 2);
ctemp12 = *(a_offset2 + 3);
-
+
ctemp13 = *(a_offset2 + 4);
ctemp14 = *(a_offset2 + 5);
ctemp15 = *(a_offset2 + 6);
ctemp16 = *(a_offset2 + 7);
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
*(b_offset1 + 2) = ctemp3;
*(b_offset1 + 3) = ctemp4;
-
+
*(b_offset1 + 4) = ctemp9;
*(b_offset1 + 5) = ctemp10;
*(b_offset1 + 6) = ctemp11;
*(b_offset1 + 7) = ctemp12;
-
+
b_offset1 += m * 4;
-
+
*(b_offset1 + 0) = ctemp5;
*(b_offset1 + 1) = ctemp6;
*(b_offset1 + 2) = ctemp7;
*(b_offset1 + 3) = ctemp8;
-
+
*(b_offset1 + 4) = ctemp13;
*(b_offset1 + 5) = ctemp14;
*(b_offset1 + 6) = ctemp15;
*(b_offset1 + 7) = ctemp16;
-
+
b_offset1 += m * 4;
-
+
a_offset1 += 8;
a_offset2 += 8;
i --;
@@ -124,33 +124,33 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp9 = *(a_offset2 + 0);
ctemp10 = *(a_offset2 + 1);
ctemp11 = *(a_offset2 + 2);
ctemp12 = *(a_offset2 + 3);
-
+
*(b_offset1 + 0) = ctemp1;
*(b_offset1 + 1) = ctemp2;
*(b_offset1 + 2) = ctemp3;
*(b_offset1 + 3) = ctemp4;
-
+
*(b_offset1 + 4) = ctemp9;
*(b_offset1 + 5) = ctemp10;
*(b_offset1 + 6) = ctemp11;
*(b_offset1 + 7) = ctemp12;
-
+
b_offset1 += m * 4;
a_offset1 += 4;
a_offset2 += 4;
}
-
+
if (n & 1){
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset1 + 1);
ctemp9 = *(a_offset2 + 0);
ctemp10 = *(a_offset2 + 1);
-
+
*(b_offset2 + 0) = ctemp1;
*(b_offset2 + 1) = ctemp2;
*(b_offset2 + 2) = ctemp9;
@@ -169,45 +169,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset + 1);
ctemp3 = *(a_offset + 2);
ctemp4 = *(a_offset + 3);
-
+
ctemp5 = *(a_offset + 4);
ctemp6 = *(a_offset + 5);
ctemp7 = *(a_offset + 6);
ctemp8 = *(a_offset + 7);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp2;
*(b_offset + 2) = ctemp3;
*(b_offset + 3) = ctemp4;
-
+
b_offset += m * 4;
-
+
*(b_offset + 0) = ctemp5;
*(b_offset + 1) = ctemp6;
*(b_offset + 2) = ctemp7;
*(b_offset + 3) = ctemp8;
-
+
b_offset += m * 4;
a_offset += 8;
i --;
} while(i > 0);
}
-
+
if (n & 2){
ctemp1 = *(a_offset + 0);
ctemp2 = *(a_offset + 1);
ctemp3 = *(a_offset + 2);
ctemp4 = *(a_offset + 3);
-
+
*(b_offset + 0) = ctemp1;
*(b_offset + 1) = ctemp2;
*(b_offset + 2) = ctemp3;
*(b_offset + 3) = ctemp4;
-
+
b_offset += m * 4;
a_offset += 4;
}
-
+
if (n & 1){
ctemp1 = *(a_offset + 0);
ctemp2 = *(a_offset + 1);
diff --git a/kernel/generic/zgemm_tcopy_4.c b/kernel/generic/zgemm_tcopy_4.c
index c61d9d5..3c12a6f 100644
--- a/kernel/generic/zgemm_tcopy_4.c
+++ b/kernel/generic/zgemm_tcopy_4.c
@@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset2 + 5);
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
-
+
ctemp17 = *(aoffset3 + 0);
ctemp18 = *(aoffset3 + 1);
ctemp19 = *(aoffset3 + 2);
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp22 = *(aoffset3 + 5);
ctemp23 = *(aoffset3 + 6);
ctemp24 = *(aoffset3 + 7);
-
+
ctemp25 = *(aoffset4 + 0);
ctemp26 = *(aoffset4 + 1);
ctemp27 = *(aoffset4 + 2);
@@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = ctemp06;
*(boffset1 + 6) = ctemp07;
*(boffset1 + 7) = ctemp08;
-
+
*(boffset1 + 8) = ctemp09;
*(boffset1 + 9) = ctemp10;
*(boffset1 + 10) = ctemp11;
@@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 21) = ctemp22;
*(boffset1 + 22) = ctemp23;
*(boffset1 + 23) = ctemp24;
-
+
*(boffset1 + 24) = ctemp25;
*(boffset1 + 25) = ctemp26;
*(boffset1 + 26) = ctemp27;
@@ -174,17 +174,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
-
+
ctemp09 = *(aoffset3 + 0);
ctemp10 = *(aoffset3 + 1);
ctemp11 = *(aoffset3 + 2);
ctemp12 = *(aoffset3 + 3);
-
+
ctemp13 = *(aoffset4 + 0);
ctemp14 = *(aoffset4 + 1);
ctemp15 = *(aoffset4 + 2);
ctemp16 = *(aoffset4 + 3);
-
+
*(boffset2 + 0) = ctemp01;
*(boffset2 + 1) = ctemp02;
*(boffset2 + 2) = ctemp03;
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 5) = ctemp06;
*(boffset2 + 6) = ctemp07;
*(boffset2 + 7) = ctemp08;
-
+
*(boffset2 + 8) = ctemp09;
*(boffset2 + 9) = ctemp10;
*(boffset2 + 10) = ctemp11;
@@ -202,12 +202,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 13) = ctemp14;
*(boffset2 + 14) = ctemp15;
*(boffset2 + 15) = ctemp16;
-
+
aoffset1 += 4;
aoffset2 += 4;
aoffset3 += 4;
aoffset4 += 4;
-
+
boffset2 += 16;
}
@@ -217,13 +217,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
-
+
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
-
+
*(boffset3 + 0) = ctemp01;
*(boffset3 + 1) = ctemp02;
*(boffset3 + 2) = ctemp03;
@@ -232,12 +232,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset3 + 5) = ctemp06;
*(boffset3 + 6) = ctemp07;
*(boffset3 + 7) = ctemp08;
-
+
aoffset1 += 2;
aoffset2 += 2;
aoffset3 += 2;
aoffset4 += 2;
-
+
boffset3 += 8;
}
j--;
@@ -248,10 +248,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
-
+
boffset1 = boffset;
boffset += 16;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -263,7 +263,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -272,7 +272,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset2 + 5);
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
-
+
*(boffset1 + 0) = ctemp01;
*(boffset1 + 1) = ctemp02;
*(boffset1 + 2) = ctemp03;
@@ -281,7 +281,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = ctemp06;
*(boffset1 + 6) = ctemp07;
*(boffset1 + 7) = ctemp08;
-
+
*(boffset1 + 8) = ctemp09;
*(boffset1 + 9) = ctemp10;
*(boffset1 + 10) = ctemp11;
@@ -290,12 +290,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 13) = ctemp14;
*(boffset1 + 14) = ctemp15;
*(boffset1 + 15) = ctemp16;
-
+
aoffset1 += 8;
aoffset2 += 8;
aoffset3 += 8;
aoffset4 += 8;
-
+
boffset1 += m * 8;
i --;
}while(i > 0);
@@ -306,12 +306,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
-
+
*(boffset2 + 0) = ctemp01;
*(boffset2 + 1) = ctemp02;
*(boffset2 + 2) = ctemp03;
@@ -320,34 +320,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 5) = ctemp06;
*(boffset2 + 6) = ctemp07;
*(boffset2 + 7) = ctemp08;
-
+
aoffset1 += 4;
aoffset2 += 4;
-
+
boffset2 += 8;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
*(boffset3 + 0) = ctemp01;
*(boffset3 + 1) = ctemp02;
*(boffset3 + 2) = ctemp03;
*(boffset3 + 3) = ctemp04;
-
+
aoffset1 += 2;
aoffset2 += 2;
boffset3 += 4;
}
}
-
+
if (m & 1){
aoffset1 = aoffset;
boffset1 = boffset;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -359,7 +359,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
*(boffset1 + 0) = ctemp01;
*(boffset1 + 1) = ctemp02;
*(boffset1 + 2) = ctemp03;
@@ -368,7 +368,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = ctemp06;
*(boffset1 + 6) = ctemp07;
*(boffset1 + 7) = ctemp08;
-
+
aoffset1 += 8;
boffset1 += m * 8;
i --;
@@ -380,7 +380,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
-
+
*(boffset2 + 0) = ctemp01;
*(boffset2 + 1) = ctemp02;
*(boffset2 + 2) = ctemp03;
@@ -389,11 +389,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 += 4;
boffset2 += 4;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
-
+
*(boffset3 + 0) = ctemp01;
*(boffset3 + 1) = ctemp02;
}
diff --git a/kernel/generic/zgemm_tcopy_4_sandy.c b/kernel/generic/zgemm_tcopy_4_sandy.c
index 1ae4a4e..7e14865 100644
--- a/kernel/generic/zgemm_tcopy_4_sandy.c
+++ b/kernel/generic/zgemm_tcopy_4_sandy.c
@@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
-3. Neither the name of the ISCAS nor the names of its contributors may
-be used to endorse or promote products derived from this software
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h>
#include "common.h"
-int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
+int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
BLASLONG i,j;
BLASLONG idx=0;
@@ -46,7 +46,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = col&-2;
ii = ii*(2*row);
dest1 = dest+ii;
- for (j=0; j<row/4; j+=1)
+ for (j=0; j<row/4; j+=1)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -56,7 +56,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (4<<3);
dest = dest+ii;
- for (i=0; i<col/4; i+=1)
+ for (i=0; i<col/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -97,7 +97,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (row<<3);
dest0 = dest0+ii;
}
- if (col&2)
+ if (col&2)
{
dest2[0] = src0[0];
dest2[1] = src0[1];
@@ -121,7 +121,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src3 = src3+4;
dest2 = dest2+16;
}
- if (col&1)
+ if (col&1)
{
dest1[0] = src0[0];
dest1[1] = src0[1];
@@ -138,7 +138,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest1 = dest1+8;
}
}
- if (row&2)
+ if (row&2)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -146,7 +146,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (2<<3);
dest = dest+ii;
- for (i=0; i<col/4; i+=1)
+ for (i=0; i<col/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -169,7 +169,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (row<<3);
dest0 = dest0+ii;
}
- if (col&2)
+ if (col&2)
{
dest2[0] = src0[0];
dest2[1] = src0[1];
@@ -183,7 +183,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src1 = src1+4;
dest2 = dest2+8;
}
- if (col&1)
+ if (col&1)
{
dest1[0] = src0[0];
dest1[1] = src0[1];
@@ -194,14 +194,14 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest1 = dest1+4;
}
}
- if (row&1)
+ if (row&1)
{
src0 = src;
src = src0+2*srcdim;
dest0 = dest;
ii = (1<<3);
dest = dest+ii;
- for (i=0; i<col/4; i+=1)
+ for (i=0; i<col/4; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -215,7 +215,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (row<<3);
dest0 = dest0+ii;
}
- if (col&2)
+ if (col&2)
{
dest2[0] = src0[0];
dest2[1] = src0[1];
@@ -224,7 +224,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src0 = src0+4;
dest2 = dest2+4;
}
- if (col&1)
+ if (col&1)
{
dest1[0] = src0[0];
dest1[1] = src0[1];
diff --git a/kernel/generic/zgemm_tcopy_8.c b/kernel/generic/zgemm_tcopy_8.c
index b258785..bad835b 100644
--- a/kernel/generic/zgemm_tcopy_8.c
+++ b/kernel/generic/zgemm_tcopy_8.c
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 16;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
*(boffset + 8) = ctemp09;
*(boffset + 9) = ctemp10;
*(boffset + 10) = ctemp11;
@@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp14;
*(boffset + 14) = ctemp15;
*(boffset + 15) = ctemp16;
-
+
*(boffset + 16) = ctemp17;
*(boffset + 17) = ctemp18;
*(boffset + 18) = ctemp19;
@@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset1 + 13);
ctemp15 = *(aoffset1 + 14);
ctemp16 = *(aoffset1 + 15);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
*(boffset + 8) = ctemp09;
*(boffset + 9) = ctemp10;
*(boffset + 10) = ctemp11;
@@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 8;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -212,7 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -230,7 +230,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
*(boffset + 8) = ctemp09;
*(boffset + 9) = ctemp10;
*(boffset + 10) = ctemp11;
@@ -239,15 +239,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = ctemp14;
*(boffset + 14) = ctemp15;
*(boffset + 15) = ctemp16;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 16;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -257,7 +257,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
@@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
boffset += 8;
}
}
@@ -275,7 +275,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 4;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -297,15 +297,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = ctemp06;
*(boffset + 6) = ctemp07;
*(boffset + 7) = ctemp08;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 8;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -316,7 +316,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
*(boffset + 3) = ctemp04;
-
+
boffset += 4;
}
}
@@ -325,7 +325,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 2;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -338,15 +338,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
*(boffset + 3) = ctemp04;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 4;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
diff --git a/kernel/generic/zgemm_tcopy_8_sandy.c b/kernel/generic/zgemm_tcopy_8_sandy.c
index b53dd3e..e519785 100644
--- a/kernel/generic/zgemm_tcopy_8_sandy.c
+++ b/kernel/generic/zgemm_tcopy_8_sandy.c
@@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
-3. Neither the name of the ISCAS nor the names of its contributors may
-be used to endorse or promote products derived from this software
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h>
#include "common.h"
-int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
+int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
BLASLONG i,j;
BLASLONG idx=0;
@@ -49,7 +49,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = col&-2;
ii = ii*(2*row);
dest1 = dest+ii;
- for (j=0; j<row/4; j+=1)
+ for (j=0; j<row/4; j+=1)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -59,7 +59,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (4<<4);
dest = dest+ii;
- for (i=0; i<col/8; i+=1)
+ for (i=0; i<col/8; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -132,7 +132,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (row<<4);
dest0 = dest0+ii;
}
- if (col&4)
+ if (col&4)
{
dest4[0] = src0[0];
dest4[1] = src0[1];
@@ -172,7 +172,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src3 = src3+8;
dest4 = dest4+32;
}
- if (col&2)
+ if (col&2)
{
dest2[0] = src0[0];
dest2[1] = src0[1];
@@ -196,7 +196,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src3 = src3+4;
dest2 = dest2+16;
}
- if (col&1)
+ if (col&1)
{
dest1[0] = src0[0];
dest1[1] = src0[1];
@@ -213,7 +213,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest1 = dest1+8;
}
}
- if (row&2)
+ if (row&2)
{
src0 = src;
src1 = src0+2*srcdim;
@@ -221,7 +221,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest0 = dest;
ii = (2<<4);
dest = dest+ii;
- for (i=0; i<col/8; i+=1)
+ for (i=0; i<col/8; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -260,7 +260,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (row<<4);
dest0 = dest0+ii;
}
- if (col&4)
+ if (col&4)
{
dest4[0] = src0[0];
dest4[1] = src0[1];
@@ -282,7 +282,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src1 = src1+8;
dest4 = dest4+16;
}
- if (col&2)
+ if (col&2)
{
dest2[0] = src0[0];
dest2[1] = src0[1];
@@ -296,7 +296,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src1 = src1+4;
dest2 = dest2+8;
}
- if (col&1)
+ if (col&1)
{
dest1[0] = src0[0];
dest1[1] = src0[1];
@@ -307,14 +307,14 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
dest1 = dest1+4;
}
}
- if (row&1)
+ if (row&1)
{
src0 = src;
src = src0+2*srcdim;
dest0 = dest;
ii = (1<<4);
dest = dest+ii;
- for (i=0; i<col/8; i+=1)
+ for (i=0; i<col/8; i+=1)
{
dest0[0] = src0[0];
dest0[1] = src0[1];
@@ -336,7 +336,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
ii = (row<<4);
dest0 = dest0+ii;
}
- if (col&4)
+ if (col&4)
{
dest4[0] = src0[0];
dest4[1] = src0[1];
@@ -349,7 +349,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src0 = src0+8;
dest4 = dest4+8;
}
- if (col&2)
+ if (col&2)
{
dest2[0] = src0[0];
dest2[1] = src0[1];
@@ -358,7 +358,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
src0 = src0+4;
dest2 = dest2+4;
}
- if (col&1)
+ if (col&1)
{
dest1[0] = src0[0];
dest1[1] = src0[1];
diff --git a/kernel/generic/zgemmkernel_2x2.c b/kernel/generic/zgemmkernel_2x2.c
index cb2a26e..c368111 100644
--- a/kernel/generic/zgemmkernel_2x2.c
+++ b/kernel/generic/zgemmkernel_2x2.c
@@ -14,12 +14,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
- for (j=0; j<bn/2; j+=1)
+ for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+2*ldc;
ptrba = ba;
- for (i=0; i<bm/2; i+=1)
+ for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
@@ -30,7 +30,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
res5 = 0;
res6 = 0;
res7 = 0;
- for (k=0; k<bk/4; k+=1)
+ for (k=0; k<bk/4; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
@@ -427,7 +427,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
ptrba = ptrba+16;
ptrbb = ptrbb+16;
}
- for (k=0; k<(bk&3); k+=1)
+ for (k=0; k<(bk&3); k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
@@ -571,14 +571,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
C0 = C0+4;
C1 = C1+4;
}
- for (i=0; i<(bm&1); i+=1)
+ for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
- for (k=0; k<bk; k+=1)
+ for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
@@ -671,18 +671,18 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
i = (ldc<<2);
C = C+i;
}
- for (j=0; j<(bn&1); j+=1)
+ for (j=0; j<(bn&1); j+=1)
{
C0 = C;
ptrba = ba;
- for (i=0; i<bm/2; i+=1)
+ for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
- for (k=0; k<bk; k+=1)
+ for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
@@ -769,12 +769,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
C0[3] = C0[3]+load3;
C0 = C0+4;
}
- for (i=0; i<(bm&1); i+=1)
+ for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
- for (k=0; k<bk; k+=1)
+ for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
diff --git a/kernel/generic/zger.c b/kernel/generic/zger.c
index 134ff5f..63a09ee 100644
--- a/kernel/generic/zger.c
+++ b/kernel/generic/zger.c
@@ -55,16 +55,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
lda *= 2;
incy *= 2;
- while (n > 0) {
+ while (n > 0) {
FLOAT beta_r = y[0];
FLOAT beta_i = y[1];
-#ifndef XCONJ
+#ifndef XCONJ
AXPYU_K
#else
AXPYC_K
#endif
- (m, 0, 0,
+ (m, 0, 0,
#ifndef CONJ
alpha_r * beta_r - alpha_i * beta_i,
alpha_r * beta_i + alpha_i * beta_r,
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
-alpha_r * beta_i + alpha_i * beta_r,
#endif
X, 1, a, 1, NULL, 0);
-
+
a += lda;
y += incy;
n --;
diff --git a/kernel/generic/zhemm3m_lcopy_1.c b/kernel/generic/zhemm3m_lcopy_1.c
index 72f473d..0ffbbcf 100644
--- a/kernel/generic/zhemm3m_lcopy_1.c
+++ b/kernel/generic/zhemm3m_lcopy_1.c
@@ -69,14 +69,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
lda *= 2;
js = n;
-
+
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
-
+
i = m;
-
+
while (i > 0) {
if (offset > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
@@ -86,17 +86,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} else {
data01 = CMULT(*(ao1 + 0), ZERO);
}
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
-
+
b[ 0] = data01;
-
+
b ++;
-
+
offset --;
i --;
}
-
+
posX ++;
js --;
}
diff --git a/kernel/generic/zhemm3m_lcopy_2.c b/kernel/generic/zhemm3m_lcopy_2.c
index f0da12c..517ed64 100644
--- a/kernel/generic/zhemm3m_lcopy_2.c
+++ b/kernel/generic/zhemm3m_lcopy_2.c
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
js = (n >> 1);
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -116,7 +116,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
diff --git a/kernel/generic/zhemm3m_lcopy_4.c b/kernel/generic/zhemm3m_lcopy_4.c
index 7e958f1..a407838 100644
--- a/kernel/generic/zhemm3m_lcopy_4.c
+++ b/kernel/generic/zhemm3m_lcopy_4.c
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
js = (n >> 2);
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -187,7 +187,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
diff --git a/kernel/generic/zhemm3m_lcopy_8.c b/kernel/generic/zhemm3m_lcopy_8.c
index 86600b5..856d5e5 100644
--- a/kernel/generic/zhemm3m_lcopy_8.c
+++ b/kernel/generic/zhemm3m_lcopy_8.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -219,7 +219,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -289,7 +289,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -333,7 +333,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
diff --git a/kernel/generic/zhemm3m_ucopy_1.c b/kernel/generic/zhemm3m_ucopy_1.c
index a6d4975..43f6d99 100644
--- a/kernel/generic/zhemm3m_ucopy_1.c
+++ b/kernel/generic/zhemm3m_ucopy_1.c
@@ -69,15 +69,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
lda *= 2;
js = n;
-
+
while (js > 0){
-
+
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
-
+
i = m;
-
+
while (i > 0) {
if (offset > 0) {
data01 = CMULT(*(ao1 + 0), -*(ao1 + 1));
@@ -87,17 +87,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} else {
data01 = CMULT(*(ao1 + 0), ZERO);
}
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
-
+
b ++;
-
+
offset --;
i --;
}
-
+
posX ++;
js --;
}
diff --git a/kernel/generic/zhemm3m_ucopy_2.c b/kernel/generic/zhemm3m_ucopy_2.c
index fecbae6..2a20fe0 100644
--- a/kernel/generic/zhemm3m_ucopy_2.c
+++ b/kernel/generic/zhemm3m_ucopy_2.c
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
js = (n >> 1);
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
break;
}
}
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -109,14 +109,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
js --;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} else {
data01 = CMULT(*(ao1 + 0), ZERO);
}
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/zhemm3m_ucopy_4.c b/kernel/generic/zhemm3m_ucopy_4.c
index 6a45c7e..879ae2d 100644
--- a/kernel/generic/zhemm3m_ucopy_4.c
+++ b/kernel/generic/zhemm3m_ucopy_4.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
break;
}
}
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -169,7 +169,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
break;
}
}
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -181,13 +181,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} else {
data01 = CMULT(*(ao1 + 0), ZERO);
}
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/zhemm3m_ucopy_8.c b/kernel/generic/zhemm3m_ucopy_8.c
index efed390..151422f 100644
--- a/kernel/generic/zhemm3m_ucopy_8.c
+++ b/kernel/generic/zhemm3m_ucopy_8.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -220,7 +220,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -290,7 +290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -334,7 +334,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
diff --git a/kernel/generic/zhemm_ltcopy_1.c b/kernel/generic/zhemm_ltcopy_1.c
index 6f5615b..b5edda6 100644
--- a/kernel/generic/zhemm_ltcopy_1.c
+++ b/kernel/generic/zhemm_ltcopy_1.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > 0) {
diff --git a/kernel/generic/zhemm_ltcopy_2.c b/kernel/generic/zhemm_ltcopy_2.c
index 8547b4d..41713b0 100644
--- a/kernel/generic/zhemm_ltcopy_2.c
+++ b/kernel/generic/zhemm_ltcopy_2.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
@@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > 0) {
diff --git a/kernel/generic/zhemm_ltcopy_4.c b/kernel/generic/zhemm_ltcopy_4.c
index d7afc11..88fa6ef 100644
--- a/kernel/generic/zhemm_ltcopy_4.c
+++ b/kernel/generic/zhemm_ltcopy_4.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -70,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
if (offset > -2) ao3 += lda; else ao3 += 2;
@@ -153,7 +153,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -209,7 +209,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
@@ -217,7 +217,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > 0) {
diff --git a/kernel/generic/zhemm_ltcopy_8.c b/kernel/generic/zhemm_ltcopy_8.c
index d5ebd1c..d3f35a7 100644
--- a/kernel/generic/zhemm_ltcopy_8.c
+++ b/kernel/generic/zhemm_ltcopy_8.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao7 + 1);
data15 = *(ao8 + 0);
data16 = *(ao8 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
if (offset > -2) ao3 += lda; else ao3 += 2;
@@ -289,7 +289,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -306,7 +306,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
if (offset > -2) ao3 += lda; else ao3 += 2;
@@ -388,7 +388,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -399,7 +399,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -444,7 +444,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
@@ -452,7 +452,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > 0) {
diff --git a/kernel/generic/zhemm_utcopy_1.c b/kernel/generic/zhemm_utcopy_1.c
index 961b849..76e67b0 100644
--- a/kernel/generic/zhemm_utcopy_1.c
+++ b/kernel/generic/zhemm_utcopy_1.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
js = n;
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > 0) {
diff --git a/kernel/generic/zhemm_utcopy_2.c b/kernel/generic/zhemm_utcopy_2.c
index 91e7108..bd6f139 100644
--- a/kernel/generic/zhemm_utcopy_2.c
+++ b/kernel/generic/zhemm_utcopy_2.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
js = (n >> 1);
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -62,7 +62,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > 0) {
diff --git a/kernel/generic/zhemm_utcopy_4.c b/kernel/generic/zhemm_utcopy_4.c
index 15671b4..6201b43 100644
--- a/kernel/generic/zhemm_utcopy_4.c
+++ b/kernel/generic/zhemm_utcopy_4.c
@@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
break;
}
}
-
+
b += 8;
offset --;
@@ -152,7 +152,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -207,7 +207,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -215,7 +215,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > 0) {
diff --git a/kernel/generic/zhemm_utcopy_8.c b/kernel/generic/zhemm_utcopy_8.c
index 1cfd3bd..601ef26 100644
--- a/kernel/generic/zhemm_utcopy_8.c
+++ b/kernel/generic/zhemm_utcopy_8.c
@@ -40,7 +40,7 @@
#include "common.h"
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
-
+
BLASLONG i, js, offset;
FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao7 + 1);
data15 = *(ao8 + 0);
data16 = *(ao8 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -275,7 +275,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
break;
}
}
-
+
b += 16;
offset --;
@@ -288,7 +288,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -305,7 +305,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -374,7 +374,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
break;
}
}
-
+
b += 8;
offset --;
@@ -387,7 +387,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -398,7 +398,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -442,7 +442,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -450,7 +450,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > 0) {
diff --git a/kernel/generic/zhemv_k.c b/kernel/generic/zhemv_k.c
index 3551938..bab1d6b 100644
--- a/kernel/generic/zhemv_k.c
+++ b/kernel/generic/zhemv_k.c
@@ -41,7 +41,7 @@
#include "common.h"
#include "symcopy.h"
-int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
BLASLONG is, min_i;
@@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
#ifndef LOWER
if (is > 0){
#ifndef HEMVREV
- GEMV_C(is, min_i, 0, alpha_r, alpha_i,
+ GEMV_C(is, min_i, 0, alpha_r, alpha_i,
a + is * lda * 2, lda,
X, 1,
Y + is * 2, 1, gemvbuffer);
@@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
X + is * 2, 1,
Y, 1, gemvbuffer);
#else
- GEMV_T(is, min_i, 0, alpha_r, alpha_i,
+ GEMV_T(is, min_i, 0, alpha_r, alpha_i,
a + is * lda * 2, lda,
X, 1,
Y + is * 2, 1, gemvbuffer);
@@ -113,11 +113,11 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
#endif
#endif
- GEMV_N(min_i, min_i, 0, alpha_r, alpha_i,
+ GEMV_N(min_i, min_i, 0, alpha_r, alpha_i,
symbuffer, min_i,
- X + is * 2, 1,
+ X + is * 2, 1,
Y + is * 2, 1, gemvbuffer);
-
+
#ifdef LOWER
if (m - is - min_i > 0){
diff --git a/kernel/generic/zlaswp_ncopy_1.c b/kernel/generic/zlaswp_ncopy_1.c
index acbda68..0e15099 100644
--- a/kernel/generic/zlaswp_ncopy_1.c
+++ b/kernel/generic/zlaswp_ncopy_1.c
@@ -55,24 +55,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
k1 --;
ipiv += k1;
-
+
if (n <= 0) return 0;
-
+
j = n;
do {
piv = ipiv;
-
+
a1 = a + (k1 + 1) * 2;
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *(a1 + 0);
@@ -83,11 +83,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -103,7 +103,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A3;
@@ -126,7 +126,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 3) = A4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B2;
@@ -145,26 +145,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b2 + 1) = A4;
}
}
-
+
buffer += 4;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
a1 += 4;
-
+
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
- B1 = *(b1 + 0);
+ B1 = *(b1 + 0);
B2 = *(b1 + 1);
-
+
if (a1 == b1) {
*(buffer + 0) = A1;
*(buffer + 1) = A2;
@@ -182,5 +182,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
} while (j > 0);
return 0;
-}
+}
diff --git a/kernel/generic/zlaswp_ncopy_2.c b/kernel/generic/zlaswp_ncopy_2.c
index 7fa56be..d02a788 100644
--- a/kernel/generic/zlaswp_ncopy_2.c
+++ b/kernel/generic/zlaswp_ncopy_2.c
@@ -60,27 +60,27 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
ipiv += k1;
if (n <= 0) return 0;
-
+
j = (n >> 1);
if (j > 0) {
do {
piv = ipiv;
-
+
a1 = a + (k1 + 1) * 2;
a3 = a1 + lda;
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + lda;
b4 = b2 + lda;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *(a1 + 0);
@@ -104,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -124,13 +124,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 5) = B4;
*(buffer + 6) = B7;
*(buffer + 7) = B8;
-
+
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A3;
@@ -171,7 +171,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b1 + 1) = A2;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B2;
@@ -205,24 +205,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b4 + 1) = A8;
}
}
-
+
buffer += 8;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + lda;
b4 = b2 + lda;
-
+
a1 += 4;
a3 += 4;
-
+
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
@@ -232,7 +232,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
A4 = *(a3 + 1);
B3 = *(b3 + 0);
B4 = *(b3 + 1);
-
+
if (a1 == b1) {
*(buffer + 0) = A1;
*(buffer + 1) = A2;
@@ -251,26 +251,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
buffer += 4;
}
-
+
a += 2 * lda;
j --;
} while (j > 0);
}
-
+
if (n & 1) {
piv = ipiv;
-
+
a1 = a + (k1 + 1) * 2;
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *(a1 + 0);
@@ -281,11 +281,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -297,11 +297,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 1) = A2;
*(buffer + 2) = B3;
*(buffer + 3) = B4;
-
+
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A3;
@@ -324,7 +324,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 3) = A4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B2;
@@ -345,24 +345,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
buffer += 4;
-
+
b1 = a + ip1;
b2 = a + ip2;
a1 += 4;
-
+
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
- B1 = *(b1 + 0);
+ B1 = *(b1 + 0);
B2 = *(b1 + 1);
-
+
if (a1 == b1) {
*(buffer + 0) = A1;
*(buffer + 1) = A2;
@@ -377,5 +377,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
return 0;
-}
+}
diff --git a/kernel/generic/zlaswp_ncopy_4.c b/kernel/generic/zlaswp_ncopy_4.c
index c9c44fc..b791666 100644
--- a/kernel/generic/zlaswp_ncopy_4.c
+++ b/kernel/generic/zlaswp_ncopy_4.c
@@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
piv = ipiv;
a1 = a + (k1 + 1) * 2;
-
+
a3 = a1 + 1 * lda;
a5 = a1 + 2 * lda;
a7 = a1 + 3 * lda;
@@ -79,10 +79,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
@@ -91,7 +91,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
b8 = b2 + 3 * lda;
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *(a1 + 0);
@@ -131,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -179,7 +179,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b8 + 0) = A8;
*(b8 + 1) = A16;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A2;
@@ -253,7 +253,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b5 + 1) = A13;
*(b7 + 0) = A7;
*(b7 + 1) = A15;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B9;
@@ -316,19 +316,19 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b8 + 1) = A16;
}
}
-
+
buffer += 16;
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
b6 = b2 + 2 * lda;
b7 = b1 + 3 * lda;
b8 = b2 + 3 * lda;
-
+
a1 += 4;
a3 += 4;
a5 += 4;
@@ -337,9 +337,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *(a1 + 0);
A9 = *(a1 + 1);
@@ -390,29 +390,29 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
a += 4 * lda;
-
+
j --;
} while (j > 0);
}
if (n & 2) {
piv = ipiv;
-
+
a1 = a + (k1 + 1) * 2;
a3 = a1 + lda;
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + lda;
b4 = b2 + lda;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *(a1 + 0);
@@ -423,7 +423,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
A6 = *(a3 + 1);
A7 = *(a4 + 0);
A8 = *(a4 + 1);
-
+
B1 = *(b1 + 0);
B2 = *(b1 + 1);
B3 = *(b2 + 0);
@@ -432,11 +432,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -456,13 +456,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 5) = B4;
*(buffer + 6) = B7;
*(buffer + 7) = B8;
-
+
*(b2 + 0) = A3;
*(b2 + 1) = A4;
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A3;
@@ -503,7 +503,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b1 + 1) = A2;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B2;
@@ -537,24 +537,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b4 + 1) = A8;
}
}
-
+
buffer += 8;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + lda;
b4 = b2 + lda;
-
+
a1 += 4;
a3 += 4;
-
+
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
@@ -564,13 +564,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
A4 = *(a3 + 1);
B3 = *(b3 + 0);
B4 = *(b3 + 1);
-
+
if (a1 == b1) {
*(buffer + 0) = A1;
*(buffer + 1) = A2;
*(buffer + 2) = A3;
*(buffer + 3) = A4;
-
+
} else {
*(buffer + 0) = B1;
*(buffer + 1) = B2;
@@ -583,24 +583,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
buffer += 4;
}
-
+
a += 2 * lda;
}
-
+
if (n & 1) {
piv = ipiv;
-
+
a1 = a + (k1 + 1) * 2;
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((k2 - k1) >> 1);
-
+
if (i > 0) {
do {
A1 = *(a1 + 0);
@@ -611,11 +611,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
ip1 = *(piv + 0) * 2;
ip2 = *(piv + 1) * 2;
piv += 2;
-
+
if (b1 == a1) {
if (b2 == a2) {
*(buffer + 0) = A1;
@@ -627,11 +627,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 1) = A2;
*(buffer + 2) = B3;
*(buffer + 3) = B4;
-
+
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 == a2) {
*(buffer + 0) = A3;
@@ -654,7 +654,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 3) = A4;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(buffer + 0) = B1;
*(buffer + 1) = B2;
@@ -675,24 +675,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
buffer += 4;
-
+
b1 = a + ip1;
b2 = a + ip2;
a1 += 4;
-
+
i --;
} while (i > 0);
}
-
+
i = ((k2 - k1) & 1);
-
+
if (i > 0) {
A1 = *(a1 + 0);
A2 = *(a1 + 1);
- B1 = *(b1 + 0);
+ B1 = *(b1 + 0);
B2 = *(b1 + 1);
-
+
if (a1 == b1) {
*(buffer + 0) = A1;
*(buffer + 1) = A2;
@@ -707,5 +707,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
}
return 0;
-}
+}
diff --git a/kernel/generic/zneg_tcopy_1.c b/kernel/generic/zneg_tcopy_1.c
index 3701c9c..6b75e14 100644
--- a/kernel/generic/zneg_tcopy_1.c
+++ b/kernel/generic/zneg_tcopy_1.c
@@ -49,18 +49,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset = a;
b_offset = b;
-
+
lda *= 2;
j = m;
m *= 2;
-
+
if (j > 0){
do {
b_offset1 = b_offset;
b_offset += 2;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -68,45 +68,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset + 1);
ctemp3 = *(a_offset + 2);
ctemp4 = *(a_offset + 3);
-
+
ctemp5 = *(a_offset + 4);
ctemp6 = *(a_offset + 5);
ctemp7 = *(a_offset + 6);
ctemp8 = *(a_offset + 7);
-
+
*(b_offset1 + 0) = -ctemp1;
*(b_offset1 + 1) = -ctemp2;
-
+
b_offset1 += m;
-
+
*(b_offset1 + 0) = -ctemp3;
*(b_offset1 + 1) = -ctemp4;
-
+
b_offset1 += m;
-
+
*(b_offset1 + 0) = -ctemp5;
*(b_offset1 + 1) = -ctemp6;
b_offset1 += m;
-
+
*(b_offset1 + 0) = -ctemp7;
*(b_offset1 + 1) = -ctemp8;
-
+
b_offset1 += m;
a_offset += 8;
i --;
} while(i>0);
}
-
+
i = (n & 3);
if (i > 0){
do {
ctemp1 = *(a_offset + 0);
ctemp2 = *(a_offset + 1);
-
+
*(b_offset1 + 0) = -ctemp1;
*(b_offset1 + 1) = -ctemp2;
-
+
b_offset1 += m;
a_offset += 2;
i --;
diff --git a/kernel/generic/zneg_tcopy_2.c b/kernel/generic/zneg_tcopy_2.c
index 40dd115..074f2f1 100644
--- a/kernel/generic/zneg_tcopy_2.c
+++ b/kernel/generic/zneg_tcopy_2.c
@@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
a_offset = a;
b_offset = b;
-
+
b_offset2 = b + m * (n & ~1) * 2;
lda *= 2;
@@ -73,46 +73,46 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp5 = *(a_offset1 + 4);
ctemp6 = *(a_offset1 + 5);
ctemp7 = *(a_offset1 + 6);
ctemp8 = *(a_offset1 + 7);
-
+
ctemp9 = *(a_offset2 + 0);
ctemp10 = *(a_offset2 + 1);
ctemp11 = *(a_offset2 + 2);
ctemp12 = *(a_offset2 + 3);
-
+
ctemp13 = *(a_offset2 + 4);
ctemp14 = *(a_offset2 + 5);
ctemp15 = *(a_offset2 + 6);
ctemp16 = *(a_offset2 + 7);
-
+
*(b_offset1 + 0) = -ctemp1;
*(b_offset1 + 1) = -ctemp2;
*(b_offset1 + 2) = -ctemp3;
*(b_offset1 + 3) = -ctemp4;
-
+
*(b_offset1 + 4) = -ctemp9;
*(b_offset1 + 5) = -ctemp10;
*(b_offset1 + 6) = -ctemp11;
*(b_offset1 + 7) = -ctemp12;
-
+
b_offset1 += m * 4;
-
+
*(b_offset1 + 0) = -ctemp5;
*(b_offset1 + 1) = -ctemp6;
*(b_offset1 + 2) = -ctemp7;
*(b_offset1 + 3) = -ctemp8;
-
+
*(b_offset1 + 4) = -ctemp13;
*(b_offset1 + 5) = -ctemp14;
*(b_offset1 + 6) = -ctemp15;
*(b_offset1 + 7) = -ctemp16;
-
+
b_offset1 += m * 4;
-
+
a_offset1 += 8;
a_offset2 += 8;
i --;
@@ -124,33 +124,33 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset1 + 1);
ctemp3 = *(a_offset1 + 2);
ctemp4 = *(a_offset1 + 3);
-
+
ctemp9 = *(a_offset2 + 0);
ctemp10 = *(a_offset2 + 1);
ctemp11 = *(a_offset2 + 2);
ctemp12 = *(a_offset2 + 3);
-
+
*(b_offset1 + 0) = -ctemp1;
*(b_offset1 + 1) = -ctemp2;
*(b_offset1 + 2) = -ctemp3;
*(b_offset1 + 3) = -ctemp4;
-
+
*(b_offset1 + 4) = -ctemp9;
*(b_offset1 + 5) = -ctemp10;
*(b_offset1 + 6) = -ctemp11;
*(b_offset1 + 7) = -ctemp12;
-
+
b_offset1 += m * 4;
a_offset1 += 4;
a_offset2 += 4;
}
-
+
if (n & 1){
ctemp1 = *(a_offset1 + 0);
ctemp2 = *(a_offset1 + 1);
ctemp9 = *(a_offset2 + 0);
ctemp10 = *(a_offset2 + 1);
-
+
*(b_offset2 + 0) = -ctemp1;
*(b_offset2 + 1) = -ctemp2;
*(b_offset2 + 2) = -ctemp9;
@@ -169,45 +169,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp2 = *(a_offset + 1);
ctemp3 = *(a_offset + 2);
ctemp4 = *(a_offset + 3);
-
+
ctemp5 = *(a_offset + 4);
ctemp6 = *(a_offset + 5);
ctemp7 = *(a_offset + 6);
ctemp8 = *(a_offset + 7);
-
+
*(b_offset + 0) = -ctemp1;
*(b_offset + 1) = -ctemp2;
*(b_offset + 2) = -ctemp3;
*(b_offset + 3) = -ctemp4;
-
+
b_offset += m * 4;
-
+
*(b_offset + 0) = -ctemp5;
*(b_offset + 1) = -ctemp6;
*(b_offset + 2) = -ctemp7;
*(b_offset + 3) = -ctemp8;
-
+
b_offset += m * 4;
a_offset += 8;
i --;
} while(i > 0);
}
-
+
if (n & 2){
ctemp1 = *(a_offset + 0);
ctemp2 = *(a_offset + 1);
ctemp3 = *(a_offset + 2);
ctemp4 = *(a_offset + 3);
-
+
*(b_offset + 0) = -ctemp1;
*(b_offset + 1) = -ctemp2;
*(b_offset + 2) = -ctemp3;
*(b_offset + 3) = -ctemp4;
-
+
b_offset += m * 4;
a_offset += 4;
}
-
+
if (n & 1){
ctemp1 = *(a_offset + 0);
ctemp2 = *(a_offset + 1);
diff --git a/kernel/generic/zneg_tcopy_4.c b/kernel/generic/zneg_tcopy_4.c
index 7cd9887..cfdd23b 100644
--- a/kernel/generic/zneg_tcopy_4.c
+++ b/kernel/generic/zneg_tcopy_4.c
@@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset2 + 5);
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
-
+
ctemp17 = *(aoffset3 + 0);
ctemp18 = *(aoffset3 + 1);
ctemp19 = *(aoffset3 + 2);
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp22 = *(aoffset3 + 5);
ctemp23 = *(aoffset3 + 6);
ctemp24 = *(aoffset3 + 7);
-
+
ctemp25 = *(aoffset4 + 0);
ctemp26 = *(aoffset4 + 1);
ctemp27 = *(aoffset4 + 2);
@@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = -ctemp06;
*(boffset1 + 6) = -ctemp07;
*(boffset1 + 7) = -ctemp08;
-
+
*(boffset1 + 8) = -ctemp09;
*(boffset1 + 9) = -ctemp10;
*(boffset1 + 10) = -ctemp11;
@@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 21) = -ctemp22;
*(boffset1 + 22) = -ctemp23;
*(boffset1 + 23) = -ctemp24;
-
+
*(boffset1 + 24) = -ctemp25;
*(boffset1 + 25) = -ctemp26;
*(boffset1 + 26) = -ctemp27;
@@ -174,17 +174,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
-
+
ctemp09 = *(aoffset3 + 0);
ctemp10 = *(aoffset3 + 1);
ctemp11 = *(aoffset3 + 2);
ctemp12 = *(aoffset3 + 3);
-
+
ctemp13 = *(aoffset4 + 0);
ctemp14 = *(aoffset4 + 1);
ctemp15 = *(aoffset4 + 2);
ctemp16 = *(aoffset4 + 3);
-
+
*(boffset2 + 0) = -ctemp01;
*(boffset2 + 1) = -ctemp02;
*(boffset2 + 2) = -ctemp03;
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 5) = -ctemp06;
*(boffset2 + 6) = -ctemp07;
*(boffset2 + 7) = -ctemp08;
-
+
*(boffset2 + 8) = -ctemp09;
*(boffset2 + 9) = -ctemp10;
*(boffset2 + 10) = -ctemp11;
@@ -202,12 +202,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 13) = -ctemp14;
*(boffset2 + 14) = -ctemp15;
*(boffset2 + 15) = -ctemp16;
-
+
aoffset1 += 4;
aoffset2 += 4;
aoffset3 += 4;
aoffset4 += 4;
-
+
boffset2 += 16;
}
@@ -217,13 +217,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
ctemp05 = *(aoffset3 + 0);
ctemp06 = *(aoffset3 + 1);
-
+
ctemp07 = *(aoffset4 + 0);
ctemp08 = *(aoffset4 + 1);
-
+
*(boffset3 + 0) = -ctemp01;
*(boffset3 + 1) = -ctemp02;
*(boffset3 + 2) = -ctemp03;
@@ -232,12 +232,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset3 + 5) = -ctemp06;
*(boffset3 + 6) = -ctemp07;
*(boffset3 + 7) = -ctemp08;
-
+
aoffset1 += 2;
aoffset2 += 2;
aoffset3 += 2;
aoffset4 += 2;
-
+
boffset3 += 8;
}
j--;
@@ -248,10 +248,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
-
+
boffset1 = boffset;
boffset += 16;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -263,7 +263,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -272,7 +272,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset2 + 5);
ctemp15 = *(aoffset2 + 6);
ctemp16 = *(aoffset2 + 7);
-
+
*(boffset1 + 0) = -ctemp01;
*(boffset1 + 1) = -ctemp02;
*(boffset1 + 2) = -ctemp03;
@@ -281,7 +281,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = -ctemp06;
*(boffset1 + 6) = -ctemp07;
*(boffset1 + 7) = -ctemp08;
-
+
*(boffset1 + 8) = -ctemp09;
*(boffset1 + 9) = -ctemp10;
*(boffset1 + 10) = -ctemp11;
@@ -290,12 +290,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 13) = -ctemp14;
*(boffset1 + 14) = -ctemp15;
*(boffset1 + 15) = -ctemp16;
-
+
aoffset1 += 8;
aoffset2 += 8;
aoffset3 += 8;
aoffset4 += 8;
-
+
boffset1 += m * 8;
i --;
}while(i > 0);
@@ -306,12 +306,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
-
+
ctemp05 = *(aoffset2 + 0);
ctemp06 = *(aoffset2 + 1);
ctemp07 = *(aoffset2 + 2);
ctemp08 = *(aoffset2 + 3);
-
+
*(boffset2 + 0) = -ctemp01;
*(boffset2 + 1) = -ctemp02;
*(boffset2 + 2) = -ctemp03;
@@ -320,34 +320,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset2 + 5) = -ctemp06;
*(boffset2 + 6) = -ctemp07;
*(boffset2 + 7) = -ctemp08;
-
+
aoffset1 += 4;
aoffset2 += 4;
-
+
boffset2 += 8;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset2 + 0);
ctemp04 = *(aoffset2 + 1);
-
+
*(boffset3 + 0) = -ctemp01;
*(boffset3 + 1) = -ctemp02;
*(boffset3 + 2) = -ctemp03;
*(boffset3 + 3) = -ctemp04;
-
+
aoffset1 += 2;
aoffset2 += 2;
boffset3 += 4;
}
}
-
+
if (m & 1){
aoffset1 = aoffset;
boffset1 = boffset;
-
+
i = (n >> 2);
if (i > 0){
do{
@@ -359,7 +359,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
*(boffset1 + 0) = -ctemp01;
*(boffset1 + 1) = -ctemp02;
*(boffset1 + 2) = -ctemp03;
@@ -368,7 +368,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset1 + 5) = -ctemp06;
*(boffset1 + 6) = -ctemp07;
*(boffset1 + 7) = -ctemp08;
-
+
aoffset1 += 8;
boffset1 += m * 8;
i --;
@@ -380,7 +380,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp02 = *(aoffset1 + 1);
ctemp03 = *(aoffset1 + 2);
ctemp04 = *(aoffset1 + 3);
-
+
*(boffset2 + 0) = -ctemp01;
*(boffset2 + 1) = -ctemp02;
*(boffset2 + 2) = -ctemp03;
@@ -389,11 +389,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 += 4;
boffset2 += 4;
}
-
+
if (n & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
-
+
*(boffset3 + 0) = -ctemp01;
*(boffset3 + 1) = -ctemp02;
}
diff --git a/kernel/generic/zneg_tcopy_8.c b/kernel/generic/zneg_tcopy_8.c
index fe8f25c..cb1a62d 100644
--- a/kernel/generic/zneg_tcopy_8.c
+++ b/kernel/generic/zneg_tcopy_8.c
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 16;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
*(boffset + 8) = -ctemp09;
*(boffset + 9) = -ctemp10;
*(boffset + 10) = -ctemp11;
@@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = -ctemp14;
*(boffset + 14) = -ctemp15;
*(boffset + 15) = -ctemp16;
-
+
*(boffset + 16) = -ctemp17;
*(boffset + 17) = -ctemp18;
*(boffset + 18) = -ctemp19;
@@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp14 = *(aoffset1 + 13);
ctemp15 = *(aoffset1 + 14);
ctemp16 = *(aoffset1 + 15);
-
+
*(boffset + 0) = -ctemp01;
*(boffset + 1) = -ctemp02;
*(boffset + 2) = -ctemp03;
@@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
*(boffset + 8) = -ctemp09;
*(boffset + 9) = -ctemp10;
*(boffset + 10) = -ctemp11;
@@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 8;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -212,7 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
ctemp09 = *(aoffset2 + 0);
ctemp10 = *(aoffset2 + 1);
ctemp11 = *(aoffset2 + 2);
@@ -230,7 +230,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
*(boffset + 8) = -ctemp09;
*(boffset + 9) = -ctemp10;
*(boffset + 10) = -ctemp11;
@@ -239,15 +239,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 13) = -ctemp14;
*(boffset + 14) = -ctemp15;
*(boffset + 15) = -ctemp16;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 16;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -257,7 +257,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
ctemp06 = *(aoffset1 + 5);
ctemp07 = *(aoffset1 + 6);
ctemp08 = *(aoffset1 + 7);
-
+
*(boffset + 0) = -ctemp01;
*(boffset + 1) = -ctemp02;
*(boffset + 2) = -ctemp03;
@@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
boffset += 8;
}
}
@@ -275,7 +275,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 4;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -297,15 +297,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 5) = -ctemp06;
*(boffset + 6) = -ctemp07;
*(boffset + 7) = -ctemp08;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 8;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
@@ -316,7 +316,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 1) = -ctemp02;
*(boffset + 2) = -ctemp03;
*(boffset + 3) = -ctemp04;
-
+
boffset += 4;
}
}
@@ -325,7 +325,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
aoffset1 = aoffset;
aoffset2 = aoffset + lda;
aoffset += 2;
-
+
i = (m >> 1);
if (i > 0){
do{
@@ -338,15 +338,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
*(boffset + 1) = -ctemp02;
*(boffset + 2) = -ctemp03;
*(boffset + 3) = -ctemp04;
-
+
aoffset1 += 2 * lda;
aoffset2 += 2 * lda;
boffset += 4;
-
+
i --;
}while(i > 0);
}
-
+
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset1 + 1);
diff --git a/kernel/generic/zsymm3m_lcopy_1.c b/kernel/generic/zsymm3m_lcopy_1.c
index 0e0d5a3..4e5b29d 100644
--- a/kernel/generic/zsymm3m_lcopy_1.c
+++ b/kernel/generic/zsymm3m_lcopy_1.c
@@ -69,31 +69,31 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao1;
js = n;
-
+
while (js > 0){
-
+
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
-
+
i = m;
-
+
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
-
+
b[ 0] = data01;
-
+
b ++;
offset --;
i --;
}
-
+
posX ++;
js --;
}
-
+
return 0;
}
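[Editor's note, not part of the patch: the zsymm3m_lcopy_1.c hunk above shows the whole kernel, so a hedged summary fits here. The symm3m copy kernels pack one panel of a symmetric complex matrix, reading either the stored triangle or its mirrored transpose depending on the sign of offset = posX - posY, and collapse each complex entry to a single real value through the CMULT macro (real part, imaginary part, or an alpha-weighted combination, selected at build time for the 3M algorithm). The sketch below is an illustrative scalar rewrite, not the kernel itself; combine stands in for CMULT, and lda is assumed to be counted in double elements (two per complex entry).]

/* Illustrative only: single-column variant of the symm3m lower-copy
 * pattern.  combine() models the CMULT macro. */
static void symm3m_lcopy_sketch(long m, long n, const double *a, long lda,
                                long posX, long posY, double *b,
                                double (*combine)(double re, double im)) {
  for (long js = 0; js < n; js++) {
    long offset = (posX + js) - posY;
    const double *ao = (offset > 0)
        ? a + (posX + js) * 2 + posY * lda    /* stored (lower) element   */
        : a + posY * 2 + (posX + js) * lda;   /* mirrored (upper) element */
    for (long i = 0; i < m; i++) {
      *b++ = combine(ao[0], ao[1]);           /* one packed value per entry  */
      ao += (offset > 0) ? lda : 2;           /* step flips at the diagonal  */
      offset--;
    }
  }
}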
diff --git a/kernel/generic/zsymm3m_lcopy_2.c b/kernel/generic/zsymm3m_lcopy_2.c
index 96686c1..edab3a4 100644
--- a/kernel/generic/zsymm3m_lcopy_2.c
+++ b/kernel/generic/zsymm3m_lcopy_2.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -101,14 +101,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm3m_lcopy_4.c b/kernel/generic/zsymm3m_lcopy_4.c
index 38a58cf..9c6f51f 100644
--- a/kernel/generic/zsymm3m_lcopy_4.c
+++ b/kernel/generic/zsymm3m_lcopy_4.c
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
js = (n >> 2);
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -116,7 +116,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -134,14 +134,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm3m_lcopy_8.c b/kernel/generic/zsymm3m_lcopy_8.c
index 4e5cddc..f385092 100644
--- a/kernel/generic/zsymm3m_lcopy_8.c
+++ b/kernel/generic/zsymm3m_lcopy_8.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = CMULT(*(ao6 + 0), *(ao6 + 1));
data07 = CMULT(*(ao7 + 0), *(ao7 + 1));
data08 = CMULT(*(ao8 + 0), *(ao8 + 1));
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
if (offset > -2) ao3 += lda; else ao3 += 2;
@@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -159,7 +159,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -168,7 +168,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -186,14 +186,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm3m_ucopy_1.c b/kernel/generic/zsymm3m_ucopy_1.c
index 14ca6e7..8bf4c83 100644
--- a/kernel/generic/zsymm3m_ucopy_1.c
+++ b/kernel/generic/zsymm3m_ucopy_1.c
@@ -67,29 +67,29 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao1;
lda *= 2;
-
+
js = n;
-
+
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
-
+
i = m;
-
+
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
-
+
b[ 0] = data01;
-
+
b ++;
-
+
offset --;
i --;
}
-
+
posX ++;
js --;
}
diff --git a/kernel/generic/zsymm3m_ucopy_2.c b/kernel/generic/zsymm3m_ucopy_2.c
index 4ba1e69..deed9ee 100644
--- a/kernel/generic/zsymm3m_ucopy_2.c
+++ b/kernel/generic/zsymm3m_ucopy_2.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -93,21 +93,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
js --;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm3m_ucopy_4.c b/kernel/generic/zsymm3m_ucopy_4.c
index 8de026a..5737c0c 100644
--- a/kernel/generic/zsymm3m_ucopy_4.c
+++ b/kernel/generic/zsymm3m_ucopy_4.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
data04 = CMULT(*(ao4 + 0), *(ao4 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -129,20 +129,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm3m_ucopy_8.c b/kernel/generic/zsymm3m_ucopy_8.c
index 79ef364..3aa1b07 100644
--- a/kernel/generic/zsymm3m_ucopy_8.c
+++ b/kernel/generic/zsymm3m_ucopy_8.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = CMULT(*(ao6 + 0), *(ao6 + 1));
data07 = CMULT(*(ao7 + 0), *(ao7 + 1));
data08 = CMULT(*(ao8 + 0), *(ao8 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -125,7 +125,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
data04 = CMULT(*(ao4 + 0), *(ao4 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -169,7 +169,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -181,20 +181,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
offset --;
i --;
}
-
+
posX += 2;
}
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
while (i > 0) {
data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm_lcopy_1.c b/kernel/generic/zsymm_lcopy_1.c
index 1b4f58d..7f20a1f 100644
--- a/kernel/generic/zsymm_lcopy_1.c
+++ b/kernel/generic/zsymm_lcopy_1.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
@@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm_lcopy_2.c b/kernel/generic/zsymm_lcopy_2.c
index ce1b16e..735e8e7 100644
--- a/kernel/generic/zsymm_lcopy_2.c
+++ b/kernel/generic/zsymm_lcopy_2.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
@@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm_lcopy_4.c b/kernel/generic/zsymm_lcopy_4.c
index dd2034d..d2acea3 100644
--- a/kernel/generic/zsymm_lcopy_4.c
+++ b/kernel/generic/zsymm_lcopy_4.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -70,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
if (offset > -2) ao3 += lda; else ao3 += 2;
@@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -131,7 +131,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
@@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm_lcopy_8.c b/kernel/generic/zsymm_lcopy_8.c
index 3397612..e3fbcb5 100644
--- a/kernel/generic/zsymm_lcopy_8.c
+++ b/kernel/generic/zsymm_lcopy_8.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -121,7 +121,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
@@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
if (offset > -2) ao3 += lda; else ao3 += 2;
@@ -165,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
@@ -176,7 +176,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
if (offset > -1) ao2 += lda; else ao2 += 2;
@@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
i = m;
@@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += lda; else ao1 += 2;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm_ucopy_1.c b/kernel/generic/zsymm_ucopy_1.c
index 9943a2d..d93b572 100644
--- a/kernel/generic/zsymm_ucopy_1.c
+++ b/kernel/generic/zsymm_ucopy_1.c
@@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
diff --git a/kernel/generic/zsymm_ucopy_2.c b/kernel/generic/zsymm_ucopy_2.c
index da64cde..4d948f7 100644
--- a/kernel/generic/zsymm_ucopy_2.c
+++ b/kernel/generic/zsymm_ucopy_2.c
@@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm_ucopy_4.c b/kernel/generic/zsymm_ucopy_4.c
index eed0bca..8cc326a 100644
--- a/kernel/generic/zsymm_ucopy_4.c
+++ b/kernel/generic/zsymm_ucopy_4.c
@@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
diff --git a/kernel/generic/zsymm_ucopy_8.c b/kernel/generic/zsymm_ucopy_8.c
index c81a7a8..ea86676 100644
--- a/kernel/generic/zsymm_ucopy_8.c
+++ b/kernel/generic/zsymm_ucopy_8.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (js > 0){
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao7 + 1);
data15 = *(ao8 + 0);
data16 = *(ao8 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -122,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
@@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao3 + 1);
data07 = *(ao4 + 0);
data08 = *(ao4 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
if (offset > -2) ao3 += 2; else ao3 += lda;
@@ -166,7 +166,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
if (offset > -1) ao2 += 2; else ao2 += lda;
@@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1) {
offset = posX - posY;
-
+
if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
i = m;
@@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
while (i > 0) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
if (offset > 0) ao1 += 2; else ao1 += lda;
b[ 0] = data01;
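[Editor's note, not part of the patch: the zsymm_lcopy/zsymm_ucopy hunks above differ from the 3M variants only in what they store: instead of collapsing each mirrored entry through CMULT, they keep both the real and the imaginary part. A minimal hedged sketch of that variant, reusing the same hypothetical addressing assumptions as the symm3m sketch earlier:]

/* Illustrative only: plain (non-3M) symmetric complex copy, one
 * column of the packed panel at a time. */
static void symm_lcopy_sketch(long m, long n, const double *a, long lda,
                              long posX, long posY, double *b) {
  for (long js = 0; js < n; js++) {
    long offset = (posX + js) - posY;
    const double *ao = (offset > 0)
        ? a + (posX + js) * 2 + posY * lda
        : a + posY * 2 + (posX + js) * lda;
    for (long i = 0; i < m; i++) {
      b[0] = ao[0];                     /* keep real part      */
      b[1] = ao[1];                     /* keep imaginary part */
      b += 2;
      ao += (offset > 0) ? lda : 2;
      offset--;
    }
  }
}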
diff --git a/kernel/generic/zsymv_k.c b/kernel/generic/zsymv_k.c
index 211def3..1e762eb 100644
--- a/kernel/generic/zsymv_k.c
+++ b/kernel/generic/zsymv_k.c
@@ -72,14 +72,14 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
for(is = 0; is < offset; is += SYMV_P){
min_i = MIN(offset - is, SYMV_P);
#endif
-
+
#ifndef LOWER
if (is >0){
GEMV_T(is, min_i, 0, alpha_r, alpha_i,
a + is * lda * COMPSIZE, lda,
X, 1,
Y + is * COMPSIZE, 1, gemvbuffer);
-
+
GEMV_N(is, min_i, 0, alpha_r, alpha_i,
a + is * lda * COMPSIZE, lda,
X + is * COMPSIZE, 1,
@@ -92,12 +92,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
#else
ZSYMCOPY_U(min_i, a + (is + is * lda) * COMPSIZE, lda, symbuffer);
#endif
-
+
GEMV_N(min_i, min_i, 0, alpha_r, alpha_i,
symbuffer, min_i,
X + is * COMPSIZE, 1,
Y + is * COMPSIZE, 1, gemvbuffer);
-
+
#ifdef LOWER
if (m - is > min_i){
@@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
a + ((is + min_i) + is * lda) * COMPSIZE, lda,
X + (is + min_i) * COMPSIZE, 1,
Y + is * COMPSIZE, 1, gemvbuffer);
-
+
GEMV_N(m - is - min_i, min_i, 0, alpha_r, alpha_i,
a + ((is + min_i) + is * lda) * COMPSIZE, lda,
X + is * COMPSIZE, 1,
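[Editor's note, not part of the patch: the zsymv_k.c hunks above are whitespace cleanups in the blocked symmetric matrix-vector driver. For orientation only: the driver walks the matrix in SYMV_P-wide diagonal blocks, expands each diagonal block to a full small matrix with ZSYMCOPY_L/U before applying it with GEMV, and handles the off-diagonal rectangle of the same block column with one GEMV_T and one GEMV_N call, which is how the symmetry is exploited. The toy version below uses real doubles, a fixed block size P, no alpha scaling, and plain loops in place of the optimized GEMV kernels; it is an assumption-laden model, not the routine.]

/* Toy model of the zsymv_k blocking (real-valued, lower triangle
 * stored).  buf must hold at least P*P doubles. */
#include <stddef.h>
enum { P = 4 };

static void gemv_n(size_t m, size_t n, const double *a, size_t lda,
                   const double *x, double *y) {       /* y += A   x */
  for (size_t j = 0; j < n; j++)
    for (size_t i = 0; i < m; i++) y[i] += a[i + j * lda] * x[j];
}

static void gemv_t(size_t m, size_t n, const double *a, size_t lda,
                   const double *x, double *y) {       /* y += A^T x */
  for (size_t j = 0; j < n; j++)
    for (size_t i = 0; i < m; i++) y[j] += a[i + j * lda] * x[i];
}

static void symv_blocked(size_t m, const double *a, size_t lda,
                         const double *x, double *y, double *buf) {
  for (size_t is = 0; is < m; is += P) {
    size_t nb = (m - is < P) ? (m - is) : P;
    const double *diag = a + is + is * lda;
    /* expand the stored lower triangle of the diagonal block */
    for (size_t j = 0; j < nb; j++)
      for (size_t i = 0; i < nb; i++)
        buf[i + j * nb] = (i >= j) ? diag[i + j * lda] : diag[j + i * lda];
    gemv_n(nb, nb, buf, nb, x + is, y + is);
    if (m - is > nb) {                        /* rectangle below the block */
      const double *rect = a + (is + nb) + is * lda;
      gemv_t(m - is - nb, nb, rect, lda, x + is + nb, y + is);
      gemv_n(m - is - nb, nb, rect, lda, x + is,      y + is + nb);
    }
  }
}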
diff --git a/kernel/generic/ztrmm_lncopy_1.c b/kernel/generic/ztrmm_lncopy_1.c
index 15a0509..f0f8827 100644
--- a/kernel/generic/ztrmm_lncopy_1.c
+++ b/kernel/generic/ztrmm_lncopy_1.c
@@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
b += 2;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 2;
diff --git a/kernel/generic/ztrmm_lncopy_2.c b/kernel/generic/ztrmm_lncopy_2.c
index f41ee5b..c620c78 100644
--- a/kernel/generic/ztrmm_lncopy_2.c
+++ b/kernel/generic/ztrmm_lncopy_2.c
@@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data05;
@@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 4;
ao2 += 4;
b += 8;
- } else
+ } else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -136,13 +136,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -151,7 +151,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 4;
@@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = data02;
b += 2;
ao1 += 2;
- } else
+ } else
if (X < posY) {
b += 2;
ao1 += lda;
diff --git a/kernel/generic/ztrmm_lncopy_4.c b/kernel/generic/ztrmm_lncopy_4.c
index 76170c7..5442105 100644
--- a/kernel/generic/ztrmm_lncopy_4.c
+++ b/kernel/generic/ztrmm_lncopy_4.c
@@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data18;
b[ 6] = data25;
b[ 7] = data26;
-
+
b[ 8] = data03;
b[ 9] = data04;
b[10] = data11;
@@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data20;
b[14] = data27;
b[15] = data28;
-
+
b[16] = data05;
b[17] = data06;
b[18] = data13;
@@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data29;
b[23] = data30;
-
+
b[24] = data07;
b[25] = data08;
b[26] = data15;
@@ -144,14 +144,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[29] = data24;
b[30] = data31;
b[31] = data32;
-
+
ao1 += 8;
ao2 += 8;
ao3 += 8;
ao4 += 8;
b += 32;
- } else
+ } else
if (X < posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -167,15 +167,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data13 = *(ao2 + 4);
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -184,7 +184,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data03;
b[ 9] = data04;
b[10] = ONE;
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = ZERO;
b[14] = ZERO;
b[15] = ZERO;
-
+
b[16] = data05;
b[17] = data06;
b[18] = data13;
@@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = ZERO;
b[22] = ZERO;
b[23] = ZERO;
-
+
b[24] = data07;
b[25] = data08;
b[26] = data15;
@@ -220,22 +220,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
data13 = *(ao2 + 4);
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data21 = *(ao3 + 4);
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = ZERO;
@@ -244,7 +244,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data03;
b[ 9] = data04;
b[10] = data11;
@@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = ZERO;
b[14] = ZERO;
b[15] = ZERO;
-
+
b[16] = data05;
b[17] = data06;
b[18] = data13;
@@ -262,7 +262,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = ZERO;
b[23] = ZERO;
-
+
b[24] = data07;
b[25] = data08;
b[26] = data15;
@@ -286,7 +286,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
if (m & 2) {
@@ -294,22 +294,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -318,7 +318,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data18;
b[ 6] = data25;
b[ 7] = data26;
-
+
b[ 8] = data03;
b[ 9] = data04;
b[10] = data11;
@@ -327,14 +327,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data20;
b[14] = data27;
b[15] = data28;
-
+
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -345,7 +345,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data18 = *(ao3 + 1);
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -354,27 +354,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data18;
b[ 6] = data25;
b[ 7] = data26;
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
-
- } else
+
+ } else
if (X < posY) {
if (m & 2) {
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 16;
}
-
+
if (m & 1) {
ao1 += lda;
b += 8;
}
-
+
} else {
#ifdef UNIT
@@ -389,7 +389,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data13 = *(ao2 + 4);
data14 = *(ao2 + 5);
}
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = data05;
b[ 1] = data06;
@@ -442,7 +442,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data21 = *(ao3 + 4);
data22 = *(ao3 + 5);
}
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = ZERO;
@@ -464,7 +464,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = data05;
b[ 1] = data06;
@@ -505,12 +505,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -524,17 +524,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
b += 8;
- } else
+ } else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 8;
} else {
#ifdef UNIT
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -548,10 +548,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = ZERO;
@@ -563,7 +563,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
ao1 += 4;
ao2 += 4;
-
+
b += 8;
}
@@ -574,13 +574,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -589,7 +589,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 4;
@@ -651,7 +651,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += lda;
b += 2;
}
-
+
X ++;
i --;
} while (i > 0);
diff --git a/kernel/generic/ztrmm_lncopy_8.c b/kernel/generic/ztrmm_lncopy_8.c
index 308ddd7..71d3bf1 100644
--- a/kernel/generic/ztrmm_lncopy_8.c
+++ b/kernel/generic/ztrmm_lncopy_8.c
@@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {
if (X > posY) {
for (ii = 0; ii < 8; ii++){
-
+
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
b[ 2] = *(ao2 + 0);
@@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
b[ 8] = *(ao5 + 0);
b[ 9] = *(ao5 + 1);
b[ 10] = *(ao6 + 0);
@@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(ao7 + 1);
b[ 14] = *(ao8 + 0);
b[ 15] = *(ao8 + 1);
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao8 += 2;
b += 16;
}
- } else
+ } else
if (X < posY) {
ao1 += 8 * lda;
ao2 += 8 * lda;
@@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8 * lda;
ao7 += 8 * lda;
ao8 += 8 * lda;
-
+
b += 128;
} else {
@@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = ZERO;
b[ 14] = ZERO;
b[ 15] = ZERO;
-
+
b[ 16] = *(ao1 + 2);
b[ 17] = *(ao1 + 3);
#ifdef UNIT
@@ -297,7 +297,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[126] = *(ao8 + 14);
b[127] = *(ao8 + 15);
#endif
-
+
ao1 += 16;
ao2 += 16;
ao3 += 16;
@@ -316,7 +316,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X > posY) {
for (ii = 0; ii < i; ii++){
@@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
b[ 8] = *(ao5 + 0);
b[ 9] = *(ao5 + 1);
b[ 10] = *(ao6 + 0);
@@ -337,7 +337,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(ao7 + 1);
b[ 14] = *(ao8 + 0);
b[ 15] = *(ao8 + 1);
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
@@ -348,7 +348,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao8 += 2;
b += 16;
}
- } else
+ } else
if (X < posY) {
ao1 += i * lda;
ao2 += i * lda;
@@ -569,14 +569,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
- } else
+ } else
if (X < posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -597,7 +597,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = *(ao1 + 2);
b[ 9] = *(ao1 + 3);
#ifdef UNIT
@@ -654,7 +654,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
for (ii = 0; ii < i; ii++){
@@ -666,14 +666,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
- } else
+ } else
if (X < posY) {
ao1 += i * lda;
ao2 += i * lda;
@@ -695,7 +695,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = ZERO;
b[ 7] = ZERO;
b += 8;
-
+
if (i >= 2) {
b[ 0] = *(ao1 + 2);
b[ 1] = *(ao1 + 3);
@@ -758,11 +758,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao1 + 3);
b[ 6] = *(ao2 + 2);
b[ 7] = *(ao2 + 3);
-
+
ao1 += 4;
ao2 += 4;
b += 8;
- } else
+ } else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -777,7 +777,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
b[ 2] = ZERO;
b[ 3] = ZERO;
-
+
b[ 4] = *(ao1 + 2);
b[ 5] = *(ao1 + 3);
#ifdef UNIT
@@ -798,7 +798,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X > posY) {
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
@@ -807,7 +807,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -823,7 +823,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = ZERO;
b[ 3] = ZERO;
b += 4;
- }
+ }
}
posY += 2;
}
@@ -845,7 +845,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = *(ao1 + 1);
ao1 += 2;
b += 2;
- } else
+ } else
if (X < posY) {
ao1 += lda;
b += 2;
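[Editor's note, not part of the patch: the ztrmm_lncopy and ztrmm_ltcopy hunks above and below are again whitespace-only, but the kernels share one pattern worth stating. When packing a panel of a triangular complex matrix, elements strictly inside the triangle are copied, elements strictly outside are skipped (the real kernels simply advance the output pointer), and diagonal entries are either copied or replaced by an explicit 1+0i when UNIT is defined. The sketch below is a hypothetical scalar version that writes explicit zeros for the skipped positions so it stands alone; the name, the simple element order, and the addressing are assumptions.]

/* Hypothetical scalar sketch of lower-triangular complex packing.
 * a is column-major, lda counted in complex elements; set unit_diag
 * to 1 to model the UNIT build. */
static void trmm_lower_pack_sketch(long m, long n, const double *a, long lda,
                                   long posX, long posY, int unit_diag,
                                   double *b) {
  for (long j = 0; j < n; j++) {
    for (long i = 0; i < m; i++) {
      long row = posX + i, col = posY + j;
      const double *src = a + 2 * (row + col * lda);
      if (row > col) {                  /* strictly below: copy as-is      */
        b[0] = src[0]; b[1] = src[1];
      } else if (row < col) {           /* strictly above: structural zero */
        b[0] = 0.0;    b[1] = 0.0;
      } else {                          /* diagonal entry                  */
        b[0] = unit_diag ? 1.0 : src[0];
        b[1] = unit_diag ? 0.0 : src[1];
      }
      b += 2;
    }
  }
}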
diff --git a/kernel/generic/ztrmm_ltcopy_1.c b/kernel/generic/ztrmm_ltcopy_1.c
index 1229b45..2fcd8db 100644
--- a/kernel/generic/ztrmm_ltcopy_1.c
+++ b/kernel/generic/ztrmm_ltcopy_1.c
@@ -66,11 +66,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X > posY) {
ao1 += 2;
b += 2;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
@@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
#endif
diff --git a/kernel/generic/ztrmm_ltcopy_2.c b/kernel/generic/ztrmm_ltcopy_2.c
index 7bcadf3..457890c 100644
--- a/kernel/generic/ztrmm_ltcopy_2.c
+++ b/kernel/generic/ztrmm_ltcopy_2.c
@@ -72,18 +72,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
b += 8;
- } else
+ } else
if (X < posY) {
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
-
+
data5 = *(ao2 + 0);
data6 = *(ao2 + 1);
data7 = *(ao2 + 2);
data8 = *(ao2 + 3);
-
+
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
@@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data6;
b[ 6] = data7;
b[ 7] = data8;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
@@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifdef UNIT
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data3;
@@ -115,10 +115,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
-
+
data7 = *(ao2 + 2);
data8 = *(ao2 + 3);
-
+
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
@@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = data7;
b[ 7] = data8;
-#endif
+#endif
ao1 += 4;
ao2 += 4;
b += 8;
@@ -139,19 +139,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X > posY) {
ao1 += 2;
ao2 += 2;
b += 4;
-
- } else
+
+ } else
if (X < posY) {
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
-
+
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
@@ -208,7 +208,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
-
+
b[ 0] = data1;
b[ 1] = data2;
b += 2;
@@ -220,7 +220,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
-
+
b[ 0] = data1;
b[ 1] = data2;
#endif
diff --git a/kernel/generic/ztrmm_ltcopy_4.c b/kernel/generic/ztrmm_ltcopy_4.c
index e43ed12..42a809b 100644
--- a/kernel/generic/ztrmm_ltcopy_4.c
+++ b/kernel/generic/ztrmm_ltcopy_4.c
@@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 8;
b += 32;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
b[16] = data17;
b[17] = data18;
b[18] = data19;
@@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data23;
b[23] = data24;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -153,7 +153,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[29] = data30;
b[30] = data31;
b[31] = data32;
-
+
ao1 += 4 * lda;
ao2 += 4 * lda;
ao3 += 4 * lda;
@@ -168,16 +168,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data13 = *(ao2 + 4);
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
-
+
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data03;
@@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = ONE;
@@ -195,7 +195,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
b[16] = ZERO;
b[17] = ZERO;
b[18] = ZERO;
@@ -204,7 +204,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = ZERO;
b[22] = data23;
b[23] = data24;
-
+
b[24] = ZERO;
b[25] = ZERO;
b[26] = ZERO;
@@ -222,22 +222,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
data13 = *(ao2 + 4);
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data21 = *(ao3 + 4);
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -246,7 +246,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = data11;
@@ -255,7 +255,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
b[16] = ZERO;
b[17] = ZERO;
b[18] = ZERO;
@@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data23;
b[23] = data24;
-
+
b[24] = ZERO;
b[25] = ZERO;
b[26] = ZERO;
@@ -288,7 +288,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X > posY) {
if (m & 2) {
@@ -298,7 +298,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 4;
b += 16;
}
-
+
if (m & 1) {
ao1 += 2;
ao2 += 2;
@@ -306,8 +306,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 2;
b += 8;
}
-
- } else
+
+ } else
if (X < posY) {
if (m & 2) {
data01 = *(ao1 + 0);
@@ -318,7 +318,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -327,7 +327,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -336,7 +336,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -345,13 +345,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 16;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -361,7 +361,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -370,11 +370,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
ao1 += lda;
b += 8;
}
-
+
} else {
#ifdef UNIT
data03 = *(ao1 + 2);
@@ -383,7 +383,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
if (i >= 2) {
data13 = *(ao2 + 4);
data14 = *(ao2 + 5);
@@ -395,7 +395,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
}
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data03;
@@ -438,7 +438,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
if (i >= 2) {
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
@@ -454,7 +454,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
}
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -517,18 +517,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
b += 8;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -537,7 +537,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data10;
b[ 6] = data11;
b[ 7] = data12;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
@@ -546,7 +546,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifdef UNIT
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data03;
@@ -560,10 +560,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -586,19 +586,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X > posY) {
ao1 += 2;
ao2 += 2;
-
+
b += 4;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -611,7 +611,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifdef UNIT
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data03;
@@ -621,7 +621,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -645,18 +645,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m;
if (i > 0) {
do {
-
+
if (X > posY) {
b += 2;
ao1 += 2;
- } else
+ } else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
- ao1 += lda;
+ ao1 += lda;
b += 2;
} else {
@@ -666,7 +666,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
#endif
diff --git a/kernel/generic/ztrmm_ltcopy_8.c b/kernel/generic/ztrmm_ltcopy_8.c
index e25d922..09cb803 100644
--- a/kernel/generic/ztrmm_ltcopy_8.c
+++ b/kernel/generic/ztrmm_ltcopy_8.c
@@ -86,11 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += 16;
a08 += 16;
b += 128;
- } else
+ } else
if (X < posY) {
for (ii = 0; ii < 8; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
a01 += lda;
b += 16;
}
@@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a06 += 8 * lda;
a07 += 8 * lda;
a08 += 8 * lda;
-
+
} else {
#ifdef UNIT
b[ 0] = ONE;
@@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
b[ 16] = ZERO;
b[ 17] = ZERO;
#ifdef UNIT
@@ -313,7 +313,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i --;
} while (i > 0);
}
-
+
i = (m & 7);
if (i > 0) {
if (X > posY) {
@@ -326,7 +326,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += 2 * i;
a08 += 2 * i;
b += 16 * i;
- } else
+ } else
if (X < posY) {
for (ii = 0; ii < i; ii++){
b[ 0] = *(a01 + 0);
@@ -337,7 +337,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -346,7 +346,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
a01 += lda;
a02 += lda;
a03 += lda;
@@ -371,7 +371,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a02 + 5);
b[ 6] = *(a02 + 6);
b[ 7] = *(a02 + 7);
-
+
b[ 8] = *(a02 + 8);
b[ 9] = *(a02 + 9);
b[10] = *(a02 + 10);
@@ -422,7 +422,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
b[ 6] = *(a03 + 6);
b[ 7] = *(a03 + 7);
-
+
b[ 8] = *(a03 + 8);
b[ 9] = *(a03 + 9);
b[10] = *(a03 + 10);
@@ -448,7 +448,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = *(a04 + 6);
b[ 7] = *(a04 + 7);
#endif
-
+
b[ 8] = *(a04 + 8);
b[ 9] = *(a04 + 9);
b[10] = *(a04 + 10);
@@ -469,7 +469,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
#ifdef UNIT
b[ 8] = ONE;
b[ 9] = ZERO;
@@ -495,7 +495,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
#ifdef UNIT
@@ -521,7 +521,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = ZERO;
@@ -537,7 +537,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = *(a07 + 15);
b += 16;
}
- }
+ }
}
posY += 8;
@@ -548,7 +548,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4){
X = posX;
-
+
if (posX <= posY) {
a01 = a + posY * 2 + (posX + 0) * lda;
a02 = a + posY * 2 + (posX + 1) * lda;
@@ -560,7 +560,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 = a + posX * 2 + (posY + 2) * lda;
a04 = a + posX * 2 + (posY + 3) * lda;
}
-
+
i = (m >> 2);
if (i > 0) {
do {
@@ -570,7 +570,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 += 8;
a04 += 8;
b += 32;
- } else
+ } else
if (X < posY) {
for (ii = 0; ii < 4; ii++){
b[ 0] = *(a01 + 0);
@@ -581,7 +581,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
a01 += lda;
b += 8;
}
@@ -603,7 +603,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
#ifdef UNIT
@@ -631,7 +631,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
b[ 22] = *(a03 + 6);
b[ 23] = *(a03 + 7);
-
+
b[ 24] = ZERO;
b[ 25] = ZERO;
b[ 26] = ZERO;
@@ -645,19 +645,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 30] = *(a04 + 6);
b[ 31] = *(a04 + 7);
#endif
-
+
a01 += 8;
a02 += 8;
a03 += 8;
a04 += 8;
b += 32;
}
-
+
X += 4;
i --;
} while (i > 0);
}
-
+
i = (m & 3);
if (i > 0) {
if (X > posY) {
@@ -666,7 +666,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 += 2 * i;
a04 += 2 * i;
b += 8 * i;
- } else
+ } else
if (X < posY) {
for (ii = 0; ii < i; ii++){
b[ 0] = *(a01 + 0);
@@ -677,7 +677,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
a01 += lda;
a02 += lda;
a03 += lda;
@@ -740,7 +740,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 2){
X = posX;
-
+
if (posX <= posY) {
a01 = a + posY * 2 + (posX + 0) * lda;
a02 = a + posY * 2 + (posX + 1) * lda;
@@ -748,7 +748,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 = a + posX * 2 + (posY + 0) * lda;
a02 = a + posX * 2 + (posY + 1) * lda;
}
-
+
i = (m >> 1);
if (i > 0) {
do {
@@ -756,7 +756,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += 4;
a02 += 4;
b += 8;
- } else
+ } else
if (X < posY) {
b[0] = *(a01 + 0);
b[1] = *(a01 + 1);
@@ -779,7 +779,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
b[2] = *(a01 + 2);
b[3] = *(a01 + 3);
-
+
b[4] = ZERO;
b[5] = ZERO;
#ifdef UNIT
@@ -788,30 +788,30 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
b[6] = *(a02 + 2);
b[7] = *(a02 + 3);
-#endif
+#endif
a01 += 4;
a02 += 4;
b += 8;
}
-
+
X += 2;
i --;
} while (i > 0);
}
-
+
i = (m & 1);
if (i > 0) {
if (X > posY) {
a01 += 2;
a02 += 2;
b += 4;
- } else
+ } else
if (X < posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
b[ 3] = *(a01 + 3);
-
+
a01 += lda;
a02 += lda;
b += 4;
@@ -833,17 +833,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1){
X = posX;
-
+
if (posX <= posY) {
a01 = a + posY * 2 + (posX + 0) * lda;
} else {
a01 = a + posX * 2 + (posY + 0) * lda;
}
-
+
i = m;
if (i > 0) {
do {
-
+
if (X > posY) {
a01 += 2;
b += 2;
@@ -864,7 +864,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += 2;
b += 2;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/ztrmm_uncopy_1.c b/kernel/generic/ztrmm_uncopy_1.c
index 595f009..2782cdd 100644
--- a/kernel/generic/ztrmm_uncopy_1.c
+++ b/kernel/generic/ztrmm_uncopy_1.c
@@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m;
if (i > 0) {
do {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -74,7 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
b += 2;
- } else
+ } else
if (X > posY) {
ao1 += lda;
b += 2;
diff --git a/kernel/generic/ztrmm_uncopy_2.c b/kernel/generic/ztrmm_uncopy_2.c
index 6beddf5..c2521d3 100644
--- a/kernel/generic/ztrmm_uncopy_2.c
+++ b/kernel/generic/ztrmm_uncopy_2.c
@@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao2 + 1);
data07 = *(ao2 + 2);
data08 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data05;
@@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
b += 8;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -141,22 +141,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
ao1 += lda;
b += 4;
@@ -174,7 +174,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao2 + 0);
data04 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -191,17 +191,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1){
X = posX;
-
+
if (posX <= posY) {
ao1 = a + posX * 2 + (posY + 0) * lda;
} else {
ao1 = a + posY * 2 + (posX + 0) * lda;
}
-
+
i = m;
if (m > 0) {
do {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -228,7 +228,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 2;
ao1 += lda;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/ztrmm_uncopy_4.c b/kernel/generic/ztrmm_uncopy_4.c
index f885b0d..249faac 100644
--- a/kernel/generic/ztrmm_uncopy_4.c
+++ b/kernel/generic/ztrmm_uncopy_4.c
@@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data18;
b[ 6] = data25;
b[ 7] = data26;
-
+
b[ 8] = data03;
b[ 9] = data04;
b[10] = data11;
@@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data20;
b[14] = data27;
b[15] = data28;
-
+
b[16] = data05;
b[17] = data06;
b[18] = data13;
@@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data29;
b[23] = data30;
-
+
b[24] = data07;
b[25] = data08;
b[26] = data15;
@@ -150,7 +150,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao3 += 8;
ao4 += 8;
b += 32;
- } else
+ } else
if (X > posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -161,19 +161,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifdef UNIT
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
data29 = *(ao4 + 4);
data30 = *(ao4 + 5);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data09;
@@ -182,7 +182,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data18;
b[ 6] = data25;
b[ 7] = data26;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = ONE;
@@ -191,7 +191,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data20;
b[14] = data27;
b[15] = data28;
-
+
b[16] = ZERO;
b[17] = ZERO;
b[18] = ZERO;
@@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = ZERO;
b[22] = data29;
b[23] = data30;
-
+
b[24] = ZERO;
b[25] = ZERO;
b[26] = ZERO;
@@ -212,19 +212,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
data21 = *(ao3 + 4);
data22 = *(ao3 + 5);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -233,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -242,7 +242,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data18;
b[ 6] = data25;
b[ 7] = data26;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[10] = data11;
@@ -251,7 +251,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data20;
b[14] = data27;
b[15] = data28;
-
+
b[16] = ZERO;
b[17] = ZERO;
b[18] = ZERO;
@@ -260,7 +260,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data29;
b[23] = data30;
-
+
b[24] = ZERO;
b[25] = ZERO;
b[26] = ZERO;
@@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 8;
ao3 += 8;
ao4 += 8;
-
+
b += 32;
}
@@ -285,7 +285,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
if (m & 2) {
@@ -293,22 +293,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data18;
b[ 6] = data25;
b[ 7] = data26;
-
+
b[ 8] = data03;
b[ 9] = data04;
b[10] = data11;
@@ -326,25 +326,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data20;
b[14] = data27;
b[15] = data28;
-
+
ao1 += 4;
ao2 += 4;
ao3 += 4;
ao4 += 4;
b += 16;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -353,27 +353,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data18;
b[ 6] = data25;
b[ 7] = data26;
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
-
- } else
+
+ } else
if (X > posY) {
if (m & 2) {
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 16;
}
-
+
if (m & 1) {
ao1 += lda;
b += 8;
}
-
+
} else {
#ifdef UNIT
@@ -405,7 +405,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = data25;
b[ 7] = data26;
b += 8;
-
+
if (i >= 2) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -465,7 +465,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = data25;
b[ 7] = data26;
b += 8;
-
+
if (i >= 2) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -517,12 +517,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -536,7 +536,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
b += 8;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -558,12 +558,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -585,13 +585,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
@@ -599,7 +599,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -608,7 +608,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifdef UNIT
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = data09;
@@ -618,7 +618,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data02 = *(ao1 + 1);
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data09;
diff --git a/kernel/generic/ztrmm_uncopy_8.c b/kernel/generic/ztrmm_uncopy_8.c
index c02c1de..faadd21 100644
--- a/kernel/generic/ztrmm_uncopy_8.c
+++ b/kernel/generic/ztrmm_uncopy_8.c
@@ -72,14 +72,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao7 = a + posY * 2 + (posX + 6) * lda;
ao8 = a + posY * 2 + (posX + 7) * lda;
}
-
+
i = (m >> 3);
if (i > 0) {
do {
if (X < posY) {
for (ii = 0; ii < 8; ii++){
-
+
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
b[ 2] = *(ao2 + 0);
@@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
b[ 8] = *(ao5 + 0);
b[ 9] = *(ao5 + 1);
b[ 10] = *(ao6 + 0);
@@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(ao7 + 1);
b[ 14] = *(ao8 + 0);
b[ 15] = *(ao8 + 1);
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao8 += 2;
b += 16;
}
- } else
+ } else
if (X > posY) {
ao1 += 8 * lda;
ao2 += 8 * lda;
@@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao6 += 8 * lda;
ao7 += 8 * lda;
ao8 += 8 * lda;
-
+
b += 128;
} else {
@@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
b[ 8] = *(ao5 + 0);
b[ 9] = *(ao5 + 1);
b[ 10] = *(ao6 + 0);
@@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(ao7 + 1);
b[ 14] = *(ao8 + 0);
b[ 15] = *(ao8 + 1);
-
+
b[ 16] = ZERO;
b[ 17] = ZERO;
#ifdef UNIT
@@ -298,7 +298,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[126] = *(ao8 + 14);
b[127] = *(ao8 + 15);
#endif
-
+
ao1 += 8 * lda;
ao2 += 8 * lda;
ao3 += 8 * lda;
@@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X < posY) {
for (ii = 0; ii < i; ii++){
b[ 0] = *(ao1 + 0);
@@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
b[ 8] = *(ao5 + 0);
b[ 9] = *(ao5 + 1);
b[ 10] = *(ao6 + 0);
@@ -337,7 +337,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(ao7 + 1);
b[ 14] = *(ao8 + 0);
b[ 15] = *(ao8 + 1);
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
@@ -348,7 +348,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao8 += 2;
b += 16;
}
- } else
+ } else
if (X > posY) {
ao1 += i * lda;
ao2 += i * lda;
@@ -382,7 +382,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[14] = *(ao8 + 0);
b[15] = *(ao8 + 1);
b += 16;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -407,7 +407,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = *(ao8 + 3);
b += 16;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -432,8 +432,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = *(ao8 + 5);
b += 16;
}
-
- if (i >= 4) {
+
+ if (i >= 4) {
b[ 0] = ZERO;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -482,7 +482,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = *(ao8 + 9);
b += 16;
}
-
+
if (i >= 6) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -561,7 +561,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {
if (X < posY) {
for (ii = 0; ii < 4; ii++){
-
+
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
b[ 2] = *(ao2 + 0);
@@ -570,14 +570,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
- } else
+ } else
if (X > posY) {
ao1 += 4 * lda;
ao2 += 4 * lda;
@@ -598,7 +598,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
#ifdef UNIT
@@ -645,7 +645,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 32;
}
@@ -656,7 +656,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
for (ii = 0; ii < i; ii++){
@@ -668,14 +668,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(ao3 + 1);
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
-
+
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2;
b += 8;
}
- } else
+ } else
if (X > posY) {
ao1 += i * lda;
ao2 += i * lda;
@@ -697,7 +697,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = *(ao4 + 0);
b[ 7] = *(ao4 + 1);
b += 8;
-
+
if(i >= 2) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -714,7 +714,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = *(ao4 + 3);
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = ZERO;
b[ 1] = ZERO;
@@ -764,7 +764,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 4;
ao2 += 4;
b += 8;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -779,7 +779,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
b[ 2] = *(ao2 + 0);
b[ 3] = *(ao2 + 1);
-
+
b[ 4] = ZERO;
b[ 5] = ZERO;
#ifdef UNIT
@@ -801,7 +801,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
-
+
if (X < posY) {
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
@@ -810,7 +810,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
ao1 += 2 * lda;
ao2 += 2 * lda;
@@ -850,7 +850,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 1] = *(ao1 + 1);
ao1 += 2;
b += 2;
- } else
+ } else
if (X > posY) {
ao1 += lda;
b += 2;
diff --git a/kernel/generic/ztrmm_utcopy_1.c b/kernel/generic/ztrmm_utcopy_1.c
index d4406c9..2746c5f 100644
--- a/kernel/generic/ztrmm_utcopy_1.c
+++ b/kernel/generic/ztrmm_utcopy_1.c
@@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
ao1 += 2;
b += 2;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 2;
} else {
-#ifdef UNIT
+#ifdef UNIT
b[ 0] = ONE;
b[ 1] = ZERO;
#else
diff --git a/kernel/generic/ztrmm_utcopy_2.c b/kernel/generic/ztrmm_utcopy_2.c
index c71a55c..840821e 100644
--- a/kernel/generic/ztrmm_utcopy_2.c
+++ b/kernel/generic/ztrmm_utcopy_2.c
@@ -72,18 +72,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
b += 8;
- } else
+ } else
if (X > posY) {
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
-
+
data5 = *(ao2 + 0);
data6 = *(ao2 + 1);
data7 = *(ao2 + 2);
data8 = *(ao2 + 3);
-
+
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
@@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data6;
b[ 6] = data7;
b[ 7] = data8;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
@@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifdef UNIT
data5 = *(ao2 + 0);
data6 = *(ao2 + 1);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -113,12 +113,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
-
+
data5 = *(ao2 + 0);
data6 = *(ao2 + 1);
data7 = *(ao2 + 2);
data8 = *(ao2 + 3);
-
+
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = ZERO;
@@ -128,10 +128,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = data7;
b[ 7] = data8;
#endif
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 8;
}
@@ -145,21 +145,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao1 += 2;
ao2 += 2;
b += 4;
- } else
+ } else
if (X > posY) {
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
-
+
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
-
+
ao1 += lda;
b += 4;
-
+
} else {
#ifdef UNIT
data5 = *(ao2 + 0);
@@ -208,10 +208,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X > posY) {
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
-
+
b[ 0] = data1;
b[ 1] = data2;
-
+
ao1 += lda;
b += 2;
} else {
@@ -221,14 +221,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
-
+
b[ 0] = data1;
b[ 1] = data2;
-#endif
+#endif
ao1 += lda;
b += 2;
}
-
+
X += 1;
i --;
} while (i > 0);
diff --git a/kernel/generic/ztrmm_utcopy_4.c b/kernel/generic/ztrmm_utcopy_4.c
index cda62bc..9a5c8c3 100644
--- a/kernel/generic/ztrmm_utcopy_4.c
+++ b/kernel/generic/ztrmm_utcopy_4.c
@@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 8;
b += 32;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data22 = *(ao3 + 5);
data23 = *(ao3 + 6);
data24 = *(ao3 + 7);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
b[16] = data17;
b[17] = data18;
b[18] = data19;
@@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = data23;
b[23] = data24;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -153,7 +153,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[29] = data30;
b[30] = data31;
b[31] = data32;
-
+
ao1 += 4 * lda;
ao2 += 4 * lda;
ao3 += 4 * lda;
@@ -162,22 +162,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} else {
-#ifdef UNIT
+#ifdef UNIT
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
data28 = *(ao4 + 3);
data29 = *(ao4 + 4);
data30 = *(ao4 + 5);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = ONE;
@@ -195,7 +195,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = ZERO;
b[14] = ZERO;
b[15] = ZERO;
-
+
b[16] = data17;
b[17] = data18;
b[18] = data19;
@@ -204,7 +204,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = ZERO;
b[22] = ZERO;
b[23] = ZERO;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -221,14 +221,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
data19 = *(ao3 + 2);
data20 = *(ao3 + 3);
data21 = *(ao3 + 4);
data22 = *(ao3 + 5);
-
+
data25 = *(ao4 + 0);
data26 = *(ao4 + 1);
data27 = *(ao4 + 2);
@@ -237,7 +237,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data30 = *(ao4 + 5);
data31 = *(ao4 + 6);
data32 = *(ao4 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = ZERO;
@@ -246,7 +246,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -255,7 +255,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = ZERO;
b[14] = ZERO;
b[15] = ZERO;
-
+
b[16] = data17;
b[17] = data18;
b[18] = data19;
@@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[21] = data22;
b[22] = ZERO;
b[23] = ZERO;
-
+
b[24] = data25;
b[25] = data26;
b[26] = data27;
@@ -279,7 +279,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4 * lda;
ao3 += 4 * lda;
ao4 += 4 * lda;
-
+
b += 32;
}
@@ -290,7 +290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
if (m & 2) {
@@ -300,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 4;
b += 16;
}
-
+
if (m & 1) {
ao1 += 2;
ao2 += 2;
@@ -308,8 +308,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao4 += 2;
b += 8;
}
-
- } else
+
+ } else
if (X > posY) {
if (m & 2) {
data01 = *(ao1 + 0);
@@ -320,7 +320,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
@@ -329,7 +329,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data14 = *(ao2 + 5);
data15 = *(ao2 + 6);
data16 = *(ao2 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -338,7 +338,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
b[ 8] = data09;
b[ 9] = data10;
b[10] = data11;
@@ -347,12 +347,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[13] = data14;
b[14] = data15;
b[15] = data16;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 16;
}
-
+
if (m & 1) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
@@ -362,7 +362,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data06 = *(ao1 + 5);
data07 = *(ao1 + 6);
data08 = *(ao1 + 7);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -371,19 +371,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data06;
b[ 6] = data07;
b[ 7] = data08;
-
+
ao1 += lda;
b += 8;
}
-
+
} else {
-#ifdef UNIT
+#ifdef UNIT
if (i >= 2) {
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
}
-
+
if (i >= 3) {
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
@@ -434,7 +434,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
}
-
+
if (i >= 3) {
data17 = *(ao3 + 0);
data18 = *(ao3 + 1);
@@ -505,18 +505,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 4;
b += 8;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
@@ -525,7 +525,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = data10;
b[ 6] = data11;
b[ 7] = data12;
-
+
ao1 += 2 * lda;
ao2 += 2 * lda;
b += 8;
@@ -534,7 +534,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#ifdef UNIT
data09 = *(ao2 + 0);
data10 = *(ao2 + 1);
-
+
b[ 0] = ONE;
b[ 1] = ZERO;
b[ 2] = ZERO;
@@ -551,7 +551,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
data10 = *(ao2 + 1);
data11 = *(ao2 + 2);
data12 = *(ao2 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = ZERO;
@@ -563,7 +563,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
ao1 += 2 * lda;
ao2 += 2 * lda;
-
+
b += 8;
}
@@ -574,21 +574,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X < posY) {
b += 4;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
data03 = *(ao1 + 2);
data04 = *(ao1 + 3);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = data03;
b[ 3] = data04;
-
+
b += 4;
} else {
#ifdef UNIT
@@ -599,7 +599,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
-
+
b[ 0] = data01;
b[ 1] = data02;
b[ 2] = ZERO;
@@ -628,7 +628,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
b += 2;
ao1 += 2;
- } else
+ } else
if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
diff --git a/kernel/generic/ztrmm_utcopy_8.c b/kernel/generic/ztrmm_utcopy_8.c
index 08dd80c..6c04484 100644
--- a/kernel/generic/ztrmm_utcopy_8.c
+++ b/kernel/generic/ztrmm_utcopy_8.c
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 = a + posY * 2 + (posX + 6) * lda;
a08 = a + posY * 2 + (posX + 7) * lda;
}
-
+
i = (m >> 3);
if (i > 0) {
do {
@@ -87,11 +87,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += 16;
a08 += 16;
b += 128;
- } else
+ } else
if (X > posY) {
for (ii = 0; ii < 8; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
a01 += lda;
b += 16;
}
@@ -136,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = ZERO;
b[ 9] = ZERO;
b[ 10] = ZERO;
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = ZERO;
b[ 14] = ZERO;
b[ 15] = ZERO;
-
+
b[ 16] = *(a02 + 0);
b[ 17] = *(a02 + 1);
#ifdef UNIT
@@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i) {
-
+
if (X < posY) {
a01 += 2 * i;
@@ -329,11 +329,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a07 += 2 * i;
a08 += 2 * i;
b += 16 * i;
- } else
+ } else
if (X > posY) {
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -342,7 +342,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
b[ 8] = *(a01 + 8);
b[ 9] = *(a01 + 9);
b[ 10] = *(a01 + 10);
@@ -351,7 +351,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = *(a01 + 13);
b[ 14] = *(a01 + 14);
b[ 15] = *(a01 + 15);
-
+
a01 += lda;
a02 += lda;
a03 += lda;
@@ -386,7 +386,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[14] = ZERO;
b[15] = ZERO;
b += 16;
-
+
if(i >= 2) {
b[ 0] = *(a02 + 0);
b[ 1] = *(a02 + 1);
@@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = ZERO;
b += 16;
}
-
+
if (i >= 3) {
b[ 0] = *(a03 + 0);
b[ 1] = *(a03 + 1);
@@ -436,8 +436,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = ZERO;
b += 16;
}
-
- if (i >= 4) {
+
+ if (i >= 4) {
b[ 0] = *(a04 + 0);
b[ 1] = *(a04 + 1);
b[ 2] = *(a04 + 2);
@@ -486,7 +486,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[15] = ZERO;
b += 16;
}
-
+
if (i >= 6) {
b[ 0] = *(a06 + 0);
b[ 1] = *(a06 + 1);
@@ -547,7 +547,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 4){
X = posX;
-
+
if (posX <= posY) {
a01 = a + posX * 2 + (posY + 0) * lda;
a02 = a + posX * 2 + (posY + 1) * lda;
@@ -559,7 +559,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 = a + posY * 2 + (posX + 2) * lda;
a04 = a + posY * 2 + (posX + 3) * lda;
}
-
+
i = (m >> 2);
if (i > 0) {
do {
@@ -569,11 +569,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a03 += 8;
a04 += 8;
b += 32;
- } else
+ } else
if (X > posY) {
-
+
for (ii = 0; ii < 4; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -582,11 +582,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
a01 += lda;
b += 8;
}
-
+
a02 += 4 * lda;
a03 += 4 * lda;
a04 += 4 * lda;
@@ -605,7 +605,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = ZERO;
b[ 6] = ZERO;
b[ 7] = ZERO;
-
+
b[ 8] = *(a02 + 0);
b[ 9] = *(a02 + 1);
#ifdef UNIT
@@ -619,7 +619,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 13] = ZERO;
b[ 14] = ZERO;
b[ 15] = ZERO;
-
+
b[ 16] = *(a03 + 0);
b[ 17] = *(a03 + 1);
b[ 18] = *(a03 + 2);
@@ -633,7 +633,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
#endif
b[ 22] = ZERO;
b[ 23] = ZERO;
-
+
b[ 24] = *(a04 + 0);
b[ 25] = *(a04 + 1);
b[ 26] = *(a04 + 2);
@@ -647,14 +647,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 30] = *(a04 + 6);
b[ 31] = *(a04 + 7);
#endif
-
+
a01 += 4 * lda;
a02 += 4 * lda;
a03 += 4 * lda;
a04 += 4 * lda;
b += 32;
}
-
+
X += 4;
i --;
} while (i > 0);
@@ -662,18 +662,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i) {
-
+
if (X < posY) {
a01 += 2 * i;
a02 += 2 * i;
a03 += 2 * i;
a04 += 2 * i;
b += 8 * i;
- } else
+ } else
if (X > posY) {
-
+
for (ii = 0; ii < i; ii++){
-
+
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
b[ 2] = *(a01 + 2);
@@ -682,7 +682,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a01 + 5);
b[ 6] = *(a01 + 6);
b[ 7] = *(a01 + 7);
-
+
a01 += lda;
a02 += lda;
a03 += lda;
@@ -690,7 +690,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 8;
}
} else {
-
+
#ifdef UNIT
b[ 0] = ONE;
b[ 1] = ZERO;
@@ -705,7 +705,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = ZERO;
b[ 7] = ZERO;
b += 8;
-
+
if(i >= 2) {
b[ 0] = *(a02 + 0);
b[ 1] = *(a02 + 1);
@@ -722,7 +722,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = ZERO;
b += 8;
}
-
+
if (i >= 3) {
b[ 0] = *(a03 + 0);
b[ 1] = *(a03 + 1);
@@ -741,14 +741,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
}
}
-
+
posY += 4;
}
if (n & 2){
X = posX;
-
+
if (posX <= posY) {
a01 = a + posX * 2 + (posY + 0) * lda;
a02 = a + posX * 2 + (posY + 1) * lda;
@@ -756,7 +756,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 = a + posY * 2 + (posX + 0) * lda;
a02 = a + posY * 2 + (posX + 1) * lda;
}
-
+
i = (m >> 1);
if (i > 0) {
do {
@@ -764,7 +764,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += 4;
a02 += 4;
b += 8;
- } else
+ } else
if (X > posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -774,12 +774,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 5] = *(a02 + 1);
b[ 6] = *(a02 + 2);
b[ 7] = *(a02 + 3);
-
+
a01 += 2 * lda;
a02 += 2 * lda;
b += 8;
} else {
-
+
#ifdef UNIT
b[ 0] = ONE;
b[ 1] = ZERO;
@@ -799,12 +799,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 6] = *(a02 + 2);
b[ 7] = *(a02 + 3);
#endif
-
+
a01 += 2 * lda;
a02 += 2 * lda;
b += 8;
}
-
+
X += 2;
i --;
} while (i > 0);
@@ -812,10 +812,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 1);
if (i) {
-
+
if (X < posY) {
b += 4;
- } else
+ } else
if (X > posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -840,20 +840,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (n & 1){
X = posX;
-
+
if (posX <= posY) {
a01 = a + posX * 2 + (posY + 0) * lda;
} else {
a01 = a + posY * 2 + (posX + 0) * lda;
}
-
+
i = m;
if (m > 0) {
do {
if (X < posY) {
a01 += 2;
b += 2;
- } else
+ } else
if (X > posY) {
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
@@ -875,6 +875,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0);
}
}
-
+
return 0;
}
diff --git a/kernel/generic/ztrmmkernel_2x2.c b/kernel/generic/ztrmmkernel_2x2.c
index b7c6539..ecb2a97 100644
--- a/kernel/generic/ztrmmkernel_2x2.c
+++ b/kernel/generic/ztrmmkernel_2x2.c
@@ -16,7 +16,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
- for (j=0; j<bn/2; j+=1)
+ for (j=0; j<bn/2; j+=1)
{
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
@@ -24,7 +24,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
C0 = C;
C1 = C0+2*ldc;
ptrba = ba;
- for (i=0; i<bm/2; i+=1)
+ for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
@@ -44,10 +44,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
temp = bk - off;
#elif defined(LEFT)
temp = off + 2;
-#else
+#else
temp = off + 2;
#endif
- for (k=0; k<temp/4; k+=1)
+ for (k=0; k<temp/4; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
@@ -444,7 +444,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
ptrba = ptrba+16;
ptrbb = ptrbb+16;
}
- for (k=0; k<(temp&3); k+=1)
+ for (k=0; k<(temp&3); k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
@@ -590,11 +590,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
C0 = C0+4;
C1 = C1+4;
}
- for (i=0; i<(bm&1); i+=1)
+ for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
-#else
+#else
ptrba += off*2;
ptrbb = bb + off*2*2;
#endif
@@ -609,7 +609,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
#else
temp = off+2;
#endif
- for (k=0; k<temp; k+=1)
+ for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
@@ -692,7 +692,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
temp = bk - off;
#ifdef LEFT
temp -= 1;
-#else
+#else
temp -= 2;
#endif
ptrba += temp*2;
@@ -712,18 +712,18 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
i = (ldc<<2);
C = C+i;
}
- for (j=0; j<(bn&1); j+=1)
+ for (j=0; j<(bn&1); j+=1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
- for (i=0; i<bm/2; i+=1)
+ for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
-#else
+#else
ptrba += off*2*2;
ptrbb = bb+off*2;
#endif
@@ -738,7 +738,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
#else
temp = off + 1;
#endif
- for (k=0; k<temp; k+=1)
+ for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
@@ -832,11 +832,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
#endif
C0 = C0+4;
}
- for (i=0; i<(bm&1); i+=1)
+ for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
-#else
+#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
@@ -846,10 +846,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
temp = bk-off;
#elif defined(LEFT)
temp = off + 1;
-#else
+#else
temp = off + 1;
#endif
- for (k=0; k<temp; k+=1)
+ for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
@@ -903,7 +903,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
temp = bk - off;
#ifdef LEFT
temp -= 1;
-#else
+#else
temp -= 1;
#endif
ptrba += temp*2;
diff --git a/kernel/generic/ztrsm_lncopy_1.c b/kernel/generic/ztrsm_lncopy_1.c
index ec8ffbc..8dab451 100644
--- a/kernel/generic/ztrsm_lncopy_1.c
+++ b/kernel/generic/ztrsm_lncopy_1.c
@@ -74,7 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += 2;
b += 2;
diff --git a/kernel/generic/ztrsm_lncopy_2.c b/kernel/generic/ztrsm_lncopy_2.c
index 967b60c..1e76af7 100644
--- a/kernel/generic/ztrsm_lncopy_2.c
+++ b/kernel/generic/ztrsm_lncopy_2.c
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 4;
a2 += 4;
b += 8;
@@ -159,7 +159,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1+= 2;
b += 2;
i --;
diff --git a/kernel/generic/ztrsm_lncopy_4.c b/kernel/generic/ztrsm_lncopy_4.c
index e4a3fb9..2ad0540 100644
--- a/kernel/generic/ztrsm_lncopy_4.c
+++ b/kernel/generic/ztrsm_lncopy_4.c
@@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 30) = data31;
*(b + 31) = data32;
}
-
+
a1 += 8;
a2 += 8;
a3 += 8;
@@ -270,7 +270,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data27;
*(b + 15) = data28;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -312,7 +312,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data25;
*(b + 7) = data26;
}
-
+
a1 += 2;
a2 += 2;
a3 += 2;
@@ -378,7 +378,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data11;
*(b + 7) = data12;
}
-
+
a1 += 4;
a2 += 4;
b += 8;
@@ -409,7 +409,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data09;
*(b + 3) = data10;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -443,7 +443,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += 2;
b += 2;
diff --git a/kernel/generic/ztrsm_lncopy_8.c b/kernel/generic/ztrsm_lncopy_8.c
index 0176f91..f6700de 100644
--- a/kernel/generic/ztrsm_lncopy_8.c
+++ b/kernel/generic/ztrsm_lncopy_8.c
@@ -45,7 +45,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
FLOAT data1, data2;
-
+
lda *= 2;
jj = offset;
@@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 8)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k * 2 + 0) = *(a1 + k * lda + 0);
@@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj >= 8) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -122,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 4)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k * 2 + 0) = *(a1 + k * lda + 0);
@@ -133,7 +133,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj >= 4) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 2)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k * 2 + 0) = *(a1 + k * lda + 0);
@@ -175,7 +175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj >= 2) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 1)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k * 2 + 0) = *(a1 + k * lda + 0);
@@ -209,7 +209,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj >= 1) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
diff --git a/kernel/generic/ztrsm_ltcopy_1.c b/kernel/generic/ztrsm_ltcopy_1.c
index ef49532..af4ac12 100644
--- a/kernel/generic/ztrsm_ltcopy_1.c
+++ b/kernel/generic/ztrsm_ltcopy_1.c
@@ -74,7 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += lda;
b += 2;
diff --git a/kernel/generic/ztrsm_ltcopy_2.c b/kernel/generic/ztrsm_ltcopy_2.c
index bcc2bbc..21bd0fa 100644
--- a/kernel/generic/ztrsm_ltcopy_2.c
+++ b/kernel/generic/ztrsm_ltcopy_2.c
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += 1 * lda;
b += 2;
diff --git a/kernel/generic/ztrsm_ltcopy_4.c b/kernel/generic/ztrsm_ltcopy_4.c
index 8c4e66b..c115271 100644
--- a/kernel/generic/ztrsm_ltcopy_4.c
+++ b/kernel/generic/ztrsm_ltcopy_4.c
@@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 30) = data31;
*(b + 31) = data32;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data15;
*(b + 15) = data16;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 16;
@@ -334,7 +334,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += lda;
b += 8;
ii += 1;
@@ -394,7 +394,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data11;
*(b + 7) = data12;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -429,7 +429,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data03;
*(b + 3) = data04;
}
-
+
a1 += lda;
b += 4;
ii += 1;
@@ -463,7 +463,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += lda;
b += 2;
diff --git a/kernel/generic/ztrsm_ltcopy_8.c b/kernel/generic/ztrsm_ltcopy_8.c
index 899c9ab..83f2811 100644
--- a/kernel/generic/ztrsm_ltcopy_8.c
+++ b/kernel/generic/ztrsm_ltcopy_8.c
@@ -45,7 +45,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
FLOAT *a1;
FLOAT data1, data2;
-
+
lda *= 2;
jj = offset;
@@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 8)) {
data1 = *(a1 + (ii - jj) * 2 + 0);
@@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 4)) {
data1 = *(a1 + (ii - jj) * 2 + 0);
@@ -122,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -150,7 +150,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 2)) {
data1 = *(a1 + (ii - jj) * 2 + 0);
@@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -187,14 +187,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 1)) {
data1 = *(a1 + (ii - jj) * 2 + 0);
data2 = *(a1 + (ii - jj) * 2 + 1);
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
diff --git a/kernel/generic/ztrsm_uncopy_1.c b/kernel/generic/ztrsm_uncopy_1.c
index 0891300..dc9157b 100644
--- a/kernel/generic/ztrsm_uncopy_1.c
+++ b/kernel/generic/ztrsm_uncopy_1.c
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += 2;
b += 2;
diff --git a/kernel/generic/ztrsm_uncopy_2.c b/kernel/generic/ztrsm_uncopy_2.c
index 45c2093..fecab88 100644
--- a/kernel/generic/ztrsm_uncopy_2.c
+++ b/kernel/generic/ztrsm_uncopy_2.c
@@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 4;
a2 += 4;
b += 8;
@@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1+= 2;
b += 2;
i --;
diff --git a/kernel/generic/ztrsm_uncopy_4.c b/kernel/generic/ztrsm_uncopy_4.c
index 9cbc6c7..9d0e243 100644
--- a/kernel/generic/ztrsm_uncopy_4.c
+++ b/kernel/generic/ztrsm_uncopy_4.c
@@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 30) = data31;
*(b + 31) = data32;
}
-
+
a1 += 8;
a2 += 8;
a3 += 8;
@@ -287,7 +287,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data27;
*(b + 15) = data28;
}
-
+
a1 += 4;
a2 += 4;
a3 += 4;
@@ -343,7 +343,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data25;
*(b + 7) = data26;
}
-
+
a1 += 2;
a2 += 2;
a3 += 2;
@@ -407,7 +407,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data11;
*(b + 7) = data12;
}
-
+
a1 += 4;
a2 += 4;
b += 8;
@@ -443,7 +443,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data09;
*(b + 3) = data10;
}
-
+
a1 += 2;
a2 += 2;
b += 4;
@@ -480,7 +480,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += 2;
b += 2;
diff --git a/kernel/generic/ztrsm_uncopy_8.c b/kernel/generic/ztrsm_uncopy_8.c
index 2ce1c72..453a6c0 100644
--- a/kernel/generic/ztrsm_uncopy_8.c
+++ b/kernel/generic/ztrsm_uncopy_8.c
@@ -45,7 +45,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
FLOAT data1, data2;
-
+
lda *= 2;
jj = offset;
@@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 8)) {
data1 = *(a1 + (ii - jj) * lda + 0);
@@ -78,7 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + k * 2 + 1) = *(a1 + k * lda + 1);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 4)) {
data1 = *(a1 + (ii - jj) * lda + 0);
data2 = *(a1 + (ii - jj) * lda + 1);
@@ -136,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + k * 2 + 1) = *(a1 + k * lda + 1);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -167,7 +167,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 2)) {
data1 = *(a1 + (ii - jj) * lda + 0);
data2 = *(a1 + (ii - jj) * lda + 1);
@@ -178,7 +178,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + k * 2 + 1) = *(a1 + k * lda + 1);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 1)) {
data1 = *(a1 + (ii - jj) * lda + 0);
data2 = *(a1 + (ii - jj) * lda + 1);
@@ -212,7 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + k * 2 + 1) = *(a1 + k * lda + 1);
}
}
-
+
if (ii - jj < 0) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c
index 42ecc47..08f85e8 100644
--- a/kernel/generic/ztrsm_utcopy_1.c
+++ b/kernel/generic/ztrsm_utcopy_1.c
@@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += lda;
b += 2;
diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c
index fd7affb..387bb25 100644
--- a/kernel/generic/ztrsm_utcopy_2.c
+++ b/kernel/generic/ztrsm_utcopy_2.c
@@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += 1 * lda;
b += 2;
diff --git a/kernel/generic/ztrsm_utcopy_4.c b/kernel/generic/ztrsm_utcopy_4.c
index fd3483c..f19badd 100644
--- a/kernel/generic/ztrsm_utcopy_4.c
+++ b/kernel/generic/ztrsm_utcopy_4.c
@@ -196,7 +196,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 30) = data31;
*(b + 31) = data32;
}
-
+
a1 += 4 * lda;
a2 += 4 * lda;
a3 += 4 * lda;
@@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 14) = data15;
*(b + 15) = data16;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 16;
@@ -302,7 +302,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data07;
*(b + 7) = data08;
}
-
+
a1 += lda;
b += 8;
@@ -363,7 +363,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 6) = data11;
*(b + 7) = data12;
}
-
+
a1 += 2 * lda;
a2 += 2 * lda;
b += 8;
@@ -393,7 +393,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 2) = data03;
*(b + 3) = data04;
}
-
+
a1 += lda;
b += 4;
@@ -428,7 +428,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
*(b + 0) = data01;
*(b + 1) = data02;
}
-
+
a1 += lda;
b += 2;
diff --git a/kernel/generic/ztrsm_utcopy_8.c b/kernel/generic/ztrsm_utcopy_8.c
index 52c7ed5..be270f1 100644
--- a/kernel/generic/ztrsm_utcopy_8.c
+++ b/kernel/generic/ztrsm_utcopy_8.c
@@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 8)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k * 2 + 0) = *(a1 + k * 2 + 0);
@@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj >= 8) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 4)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k * 2 + 0) = *(a1 + k * 2 + 0);
@@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj >= 4) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 2)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k * 2 + 0) = *(a1 + k * 2 + 0);
@@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj >= 2) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
@@ -181,7 +181,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
ii = 0;
for (i = 0; i < m; i++) {
-
+
if ((ii >= jj ) && (ii - jj < 1)) {
for (k = 0; k < ii - jj; k ++) {
*(b + k * 2 + 0) = *(a1 + k * 2 + 0);
@@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
compinv(b + (ii - jj) * 2, data1, data2);
}
-
+
if (ii - jj >= 1) {
*(b + 0) = *(a1 + 0);
*(b + 1) = *(a1 + 1);
diff --git a/kernel/ia64/amax.S b/kernel/ia64/amax.S
index fae96f1..72eae44 100644
--- a/kernel/ia64/amax.S
+++ b/kernel/ia64/amax.S
@@ -314,18 +314,18 @@
;;
{ .mmf
(p13) LDFD f42 = [DX], INCX
- nop.m 0
+ nop.m 0
(p12) FMAX DMAX1 = f32, DMAX1
}
{ .mmf
(p15) LDFD f46 = [X3], INCX
- nop.m 0
+ nop.m 0
(p12) FMAX DMAX5 = f36, DMAX5
}
;;
{ .mmf
(p13) LDFD f43 = [DX], INCX
- nop.m 0
+ nop.m 0
(p12) FMAX DMAX2 = f33, DMAX2
}
(p12) FMAX DMAX6 = f37, DMAX6
@@ -343,7 +343,7 @@
(p13) FMAX DMAX4 = f43, DMAX4
;;
.align 32
-
+
.L99:
{ .mfi
nop.m 0
diff --git a/kernel/ia64/asum.S b/kernel/ia64/asum.S
index 6114f57..55c6892 100644
--- a/kernel/ia64/asum.S
+++ b/kernel/ia64/asum.S
@@ -54,7 +54,7 @@
#define COMPADD 1
#define STRIDE SIZE
#endif
-
+
#define PRE1 r2
#define I r17
@@ -128,7 +128,7 @@
mov f11 = f0
shl INCX = INCX, BASE_SHIFT + COMPADD
}
- ;;
+ ;;
{ .mmi
#ifdef XDOUBLE
shladd INCX16 = INCX, (3 - COMPADD), r0
@@ -260,7 +260,7 @@
}
;;
{ .mmf
- (p16) LDFD f71 = [X], INCX
+ (p16) LDFD f71 = [X], INCX
(p18) fabs f73 = f73
}
{ .mfb
@@ -365,7 +365,7 @@
#ifndef COMPLEX
(p15) FADD f10 = f10, f46
#endif
- ;;
+ ;;
.align 32
.L998:
diff --git a/kernel/ia64/caxpy.S b/kernel/ia64/caxpy.S
index 0a28ebe..1a994e7 100644
--- a/kernel/ia64/caxpy.S
+++ b/kernel/ia64/caxpy.S
@@ -504,10 +504,10 @@
}
;;
(p14) STFD [YY1] = f90, 1 * SIZE
- ;;
+ ;;
(p14) STFD [YY1] = f91
(p14) add YY1 = YY1, INCYM1
- ;;
+ ;;
(p15) STFD [YY1] = f92, 1 * SIZE
;;
{ .mmb
diff --git a/kernel/ia64/copy.S b/kernel/ia64/copy.S
index b5d7f48..9e7ef32 100644
--- a/kernel/ia64/copy.S
+++ b/kernel/ia64/copy.S
@@ -94,7 +94,7 @@
mov PR = pr
}
{ .mmi
- mov YY = Y1
+ mov YY = Y1
(p7) adds N = -1, N
(p7) add Y1 = Y1, INCY
}
@@ -600,7 +600,7 @@
;;
/* INCX != 1 */
-.L100:
+.L100:
{ .mmi
shladd INCX16 = INCX, 4, r0
shladd INCY16 = INCY, 4, r0
diff --git a/kernel/ia64/daxpy.S b/kernel/ia64/daxpy.S
index b971df6..72b9afa 100644
--- a/kernel/ia64/daxpy.S
+++ b/kernel/ia64/daxpy.S
@@ -62,7 +62,7 @@
#define YY r27
#define PR r30
#define ARLC r31
-
+
#define ALPHA f8
PROLOGUE
diff --git a/kernel/ia64/ddot.S b/kernel/ia64/ddot.S
index 082c303..6654f72 100644
--- a/kernel/ia64/ddot.S
+++ b/kernel/ia64/ddot.S
@@ -344,7 +344,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
@@ -516,7 +516,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
@@ -748,7 +748,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
@@ -920,7 +920,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
@@ -1156,7 +1156,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
diff --git a/kernel/ia64/gemm_beta.S b/kernel/ia64/gemm_beta.S
index ceeca4a..b4cf816 100644
--- a/kernel/ia64/gemm_beta.S
+++ b/kernel/ia64/gemm_beta.S
@@ -81,7 +81,7 @@
{ .mfb
cmp.ge p6, p0 = 0, N
fcmp.eq p0, p15 = BETA, f0
- (p6) br.ret.sptk.many b0
+ (p6) br.ret.sptk.many b0
}
;;
.body
@@ -197,7 +197,7 @@
{ .mmi
(p12) STFD [CO1] = f0, 1 * SIZE
(p12) STFD [CO2] = f0, 1 * SIZE
- (p12) adds CO3 = 8 * SIZE, CO3
+ (p12) adds CO3 = 8 * SIZE, CO3
}
;;
{ .mmi
@@ -397,7 +397,7 @@
{ .mmi
(p12) LDFD f34 = [CO1], 1 * SIZE
(p12) LDFD f38 = [CO2], 1 * SIZE
- (p12) adds CO3 = 8 * SIZE, CO3
+ (p12) adds CO3 = 8 * SIZE, CO3
}
;;
{ .mmi
@@ -446,7 +446,7 @@
(p13) FMPY f40 = BETA, f40
}
{ .mmf
- (p12) adds DO3 = 8 * SIZE, DO3
+ (p12) adds DO3 = 8 * SIZE, DO3
(p14) FMPY f44 = BETA, f44
}
;;
@@ -456,7 +456,7 @@
(p13) FMPY f41 = BETA, f41
}
{ .mmf
- (p13) adds DO3 = 4 * SIZE, DO3
+ (p13) adds DO3 = 4 * SIZE, DO3
(p14) FMPY f45 = BETA, f45
}
;;
diff --git a/kernel/ia64/gemm_kernel.S b/kernel/ia64/gemm_kernel.S
index d1d4731..c3277a4 100644
--- a/kernel/ia64/gemm_kernel.S
+++ b/kernel/ia64/gemm_kernel.S
@@ -119,11 +119,11 @@
stf.spill [r8] = f16, 32
stf.spill [r9] = f17, 32
mov PR = pr
- ;;
+ ;;
stf.spill [r8] = f18, 32
stf.spill [r9] = f19, 32
shladd LDC = LDC, BASE_SHIFT, r0
- ;;
+ ;;
stf.spill [r8] = f20, 32
stf.spill [r9] = f21, 32
shr J = N, 3
@@ -131,18 +131,18 @@
stf.spill [r8] = f22, 32
stf.spill [r9] = f23, 32
mov AOFFSET = A
- ;;
+ ;;
stf.spill [r8] = f24, 32
stf.spill [r9] = f25, 32
cmp.ge p6, p0 = 0, J
- ;;
+ ;;
stf.spill [r8] = f26, 32
stf.spill [r9] = f27, 32
shr BB = K, 3
- ;;
+ ;;
stf.spill [r8] = f28, 32
stf.spill [r9] = f29, 32
- ;;
+ ;;
stf.spill [r8] = f30
stf.spill [r9] = f31
#ifndef TRMMKERNEL
@@ -182,7 +182,7 @@
nop __LINE__
#endif
mov f80 = f0
- }
+ }
{ .mmf
add C2 = LDC, C // coffset2 = c + 1 * ldc
shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
@@ -546,7 +546,7 @@
FMA f106 = f34, f53, f106 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfi
FMA f114 = f34, f54, f114 // A3 * B7
@@ -4233,7 +4233,7 @@
#else
nop __LINE__
#endif
- }
+ }
{ .mfi
shladd C4 = LDC, 1, C2
mov f73 = f0
@@ -5862,7 +5862,7 @@
;;
{ .mfi
STFD [C4 ] = f89, 3 * SIZE
- mov f89 = f0
+ mov f89 = f0
#ifdef TRMMKERNEL
shladd KK8 = KK, BASE_SHIFT, r0
#else
@@ -8855,7 +8855,7 @@
;;
#endif
;;
- { .mii
+ { .mii
LDFD f32 = [AOFFSET], 1 * SIZE
tbit.z p12, p0 = L, 0
shr L = L, 1
@@ -8928,13 +8928,13 @@
;;
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9], 32
mov ar.lc = ARLC
diff --git a/kernel/ia64/gemm_ncopy.S b/kernel/ia64/gemm_ncopy.S
index ebb80bf..aa0d1cb 100644
--- a/kernel/ia64/gemm_ncopy.S
+++ b/kernel/ia64/gemm_ncopy.S
@@ -49,7 +49,7 @@
#define LD LDFD
#define ST STFD_NTA
#endif
-
+
#define J r15
#define PREB r17
#define PREA r18
@@ -82,7 +82,7 @@
.prologue
PROFCODE
- .body
+ .body
{ .mii
shladd LDA = LDA, BASE_SHIFT, r0
mov PR = pr
diff --git a/kernel/ia64/gemv_n.S b/kernel/ia64/gemv_n.S
index 4826bf5..972dd01 100644
--- a/kernel/ia64/gemv_n.S
+++ b/kernel/ia64/gemv_n.S
@@ -84,13 +84,13 @@
#define AO61 loc13
#define AO71 loc14
#define AO81 loc15
-
+
#define PREB r8
#define ARLC r29
#define PR r30
#define ARPFS r31
-
+
#ifdef DOUBLE
#define RPREFETCH (16 * 3 + 8)
#else
@@ -120,17 +120,17 @@
;;
stf.spill [r8] = f16, 32
stf.spill [r9] = f17, 32
- ;;
+ ;;
stf.spill [r8] = f18, 32
stf.spill [r9] = f19, 32
- ;;
+ ;;
stf.spill [r8] = f20, 32
stf.spill [r9] = f21, 32
;;
stf.spill [r8] = f22
stf.spill [r9] = f23
.body
- ;;
+ ;;
ld8 Y = [r14]
ld8 INCY = [r15]
@@ -3301,15 +3301,15 @@
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
mov ar.lc = ARLC
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
mov pr = PR, -1
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
mov ar.pfs = ARPFS
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9]
br.ret.sptk.many b0
diff --git a/kernel/ia64/gemv_t.S b/kernel/ia64/gemv_t.S
index 6bc579e..0dc4578 100644
--- a/kernel/ia64/gemv_t.S
+++ b/kernel/ia64/gemv_t.S
@@ -80,7 +80,7 @@
#define AO41 loc9
#define AO61 loc10
#define AO81 loc11
-
+
#define PREB r8
#define WPRE r9
#define OFFSET PREB
@@ -89,7 +89,7 @@
#define ARLC r29
#define PR r30
#define ARPFS r31
-
+
#ifdef DOUBLE
#define RPREFETCH (16 * 3 + 8)
#else
@@ -149,7 +149,7 @@
xmpy.l f10 = f10, f11
}
.body
- ;;
+ ;;
;;
{ .mmi
ld8 BUFFER = [r16]
@@ -3541,15 +3541,15 @@
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
mov ar.lc = ARLC
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
mov pr = PR, -1
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
mov ar.pfs = ARPFS
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9]
br.ret.sptk.many b0
diff --git a/kernel/ia64/iamax.S b/kernel/ia64/iamax.S
index a091675..57d34a3 100644
--- a/kernel/ia64/iamax.S
+++ b/kernel/ia64/iamax.S
@@ -636,4 +636,4 @@
}
;;
EPILOGUE
-
+
diff --git a/kernel/ia64/izamax.S b/kernel/ia64/izamax.S
index c43bcca..6d98ce4 100644
--- a/kernel/ia64/izamax.S
+++ b/kernel/ia64/izamax.S
@@ -60,7 +60,7 @@
#define N r32
#define DX r33
#define INCX r34
-
+
#define PRE1 r2
#define I r14
@@ -107,7 +107,7 @@
sxt4 N = N
sxt4 INCX = INCX
}
- ;;
+ ;;
#endif
#endif
@@ -288,7 +288,7 @@
(p16) LDFD f82 = [DX], SIZE
(p8 ) mov DMAX1 = DATA6
(p19) fabs f85 = f85
- }
+ }
{ .mmf
nop.m 0
nop.m 0
@@ -563,7 +563,7 @@
}
;;
.align 32
-
+
.L999:
{ .mmi
setf.d f8 = DMAX1
diff --git a/kernel/ia64/lsame.S b/kernel/ia64/lsame.S
index 3f2a7db..26da80e 100644
--- a/kernel/ia64/lsame.S
+++ b/kernel/ia64/lsame.S
@@ -58,7 +58,7 @@
;;
cmp4.eq p6, p7 = r15, r14
mov r8 = 1
- ;;
+ ;;
(p7) mov r8 = 0
br.ret.sptk.many b0
diff --git a/kernel/ia64/nrm2.S b/kernel/ia64/nrm2.S
index bb88cfb..52dc3d8 100644
--- a/kernel/ia64/nrm2.S
+++ b/kernel/ia64/nrm2.S
@@ -153,7 +153,7 @@
.align 32
.L51:
- (p16) LDFD f32 = [X], STRIDE
+ (p16) LDFD f32 = [X], STRIDE
(p16) lfetch.nt1 [PRE1], INCX16
(p18) fma.d.s1 f8 = f34, f34, f8
@@ -199,7 +199,7 @@
(p16) LDFD f68 = [X], STRIDE
(p18) fma.d.s1 f12 = f70, f70, f12
nop.b 0
- (p16) LDFD f71 = [X2], STRIDE
+ (p16) LDFD f71 = [X2], STRIDE
(p18) fma.d.s1 f13 = f73, f73, f13
nop.b 0
;;
@@ -271,7 +271,7 @@
;;
#ifndef COMPLEX
(p15) fma.d.s1 f14 = f46, f46, f14
- ;;
+ ;;
#endif
.align 32
diff --git a/kernel/ia64/qaxpy.S b/kernel/ia64/qaxpy.S
index 2acb86b..2cca492 100644
--- a/kernel/ia64/qaxpy.S
+++ b/kernel/ia64/qaxpy.S
@@ -74,7 +74,7 @@
#define PR r30
#define ARLC r31
-
+
#define ALPHA f8
#define SP r12
@@ -268,7 +268,7 @@
(p16) lfetch.nt1 [PRE1], INCX8
nop __LINE__
(p17) FMA f11 = ALPHA, f42, f90
- }
+ }
;;
{ .mmi
(p16) LDFD f56 = [X1], INCX4
diff --git a/kernel/ia64/qgemm_kernel.S b/kernel/ia64/qgemm_kernel.S
index 3c9fb69..0120952 100644
--- a/kernel/ia64/qgemm_kernel.S
+++ b/kernel/ia64/qgemm_kernel.S
@@ -115,10 +115,10 @@
stf.spill [r8] = f16, 32
stf.spill [r9] = f17, 32
mov PR = pr
- ;;
+ ;;
stf.spill [r8] = f18, 32
stf.spill [r9] = f19, 32
- ;;
+ ;;
stf.spill [r8] = f20, 32
stf.spill [r9] = f21, 32
shr J = N, 3
@@ -126,17 +126,17 @@
stf.spill [r8] = f22, 32
stf.spill [r9] = f23, 32
mov AOFFSET = A
- ;;
+ ;;
stf.spill [r8] = f24, 32
stf.spill [r9] = f25, 32
cmp.ge p6, p0 = 0, J
- ;;
+ ;;
stf.spill [r8] = f26, 32
stf.spill [r9] = f27, 32
- ;;
+ ;;
stf.spill [r8] = f28, 32
stf.spill [r9] = f29, 32
- ;;
+ ;;
stf.spill [r8] = f30
stf.spill [r9] = f31
ld8 C = [r14], 8
@@ -183,7 +183,7 @@
nop __LINE__
#endif
mov f80 = f0
- }
+ }
{ .mmf
add C2 = LDC, C // coffset2 = c + 1 * ldc
shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
@@ -568,7 +568,7 @@
FMA f106 = f34, f53, f106 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfb
nop __LINE__
@@ -4264,7 +4264,7 @@
#else
nop __LINE__
#endif
- }
+ }
{ .mfi
shladd C4 = LDC, 1, C2
mov f73 = f0
@@ -5893,7 +5893,7 @@
;;
{ .mfi
STFD [C4 ] = f89, 3 * SIZE
- mov f89 = f0
+ mov f89 = f0
#ifdef TRMMKERNEL
shladd KK8 = KK, BASE_SHIFT, r0
#else
@@ -8890,7 +8890,7 @@
;;
#endif
;;
- { .mii
+ { .mii
LDFD f32 = [AOFFSET], 1 * SIZE
tbit.z p12, p0 = L, 0
shr L = L, 1
@@ -8963,13 +8963,13 @@
;;
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9], 32
mov ar.lc = ARLC
diff --git a/kernel/ia64/qgemv_n.S b/kernel/ia64/qgemv_n.S
index 4eeac12..228a00c 100644
--- a/kernel/ia64/qgemv_n.S
+++ b/kernel/ia64/qgemv_n.S
@@ -80,7 +80,7 @@
#define ARLC r30
#define PR r31
-
+
#define LDA7M8 r8
#define PREA r9
#define PREB r10
@@ -114,7 +114,7 @@
adds r15 = 24, SP
adds r16 = 32, SP
.body
- ;;
+ ;;
#ifdef XDOUBLE
ld8 X = [r14], 16
@@ -179,10 +179,10 @@
.L11:
shladd LDA7M8 = LDA, 3, r0
;;
- sub LDA7M8 = LDA, LDA7M8
+ sub LDA7M8 = LDA, LDA7M8
;;
adds LDA7M8 = 8 * SIZE, LDA7M8
- ;;
+ ;;
mov YLD1 = YY
mov YST1 = YY
adds YLD2 = 1 * SIZE, YY
@@ -558,7 +558,7 @@
nop __LINE__
(p17) FMA f122 = ALPHA7, f88, f122
}
- ;;
+ ;;
{ .mmf
(p16) LDFD f84 = [AO5], LDA
(p16) LDFD f85 = [AO6], LDA
@@ -788,7 +788,7 @@
(p14) FMA f100 = ALPHA7, f84, f100
(p14) FMA f101 = ALPHA7, f85, f101
(p15) FMA f102 = ALPHA7, f86, f102
- ;;
+ ;;
(p13) FMA f16 = ALPHA8, f88, f96
(p13) FMA f17 = ALPHA8, f89, f97
(p13) FMA f18 = ALPHA8, f90, f98
@@ -832,10 +832,10 @@
shladd LDA7M8 = LDA, 2, r0
;;
- sub LDA7M8 = LDA, LDA7M8
+ sub LDA7M8 = LDA, LDA7M8
;;
adds LDA7M8 = 8 * SIZE, LDA7M8
- ;;
+ ;;
mov YLD1 = YY
mov YST1 = YY
adds YLD2 = 2 * SIZE, YY
@@ -1123,10 +1123,10 @@
shladd LDA7M8 = LDA, 1, r0
;;
- sub LDA7M8 = LDA, LDA7M8
+ sub LDA7M8 = LDA, LDA7M8
;;
adds LDA7M8 = 8 * SIZE, LDA7M8
- ;;
+ ;;
mov YLD1 = YY
mov YST1 = YY
adds YLD2 = 2 * SIZE, YY
@@ -1334,7 +1334,7 @@
(p6) br.cond.dpnt .L990
;;
mov LDA7M8 = 8 * SIZE
- ;;
+ ;;
mov YLD1 = YY
mov YST1 = YY
adds YLD2 = 2 * SIZE, YY
diff --git a/kernel/ia64/qgemv_t.S b/kernel/ia64/qgemv_t.S
index f3fc693..5b27e09 100644
--- a/kernel/ia64/qgemv_t.S
+++ b/kernel/ia64/qgemv_t.S
@@ -82,7 +82,7 @@
#define ARLC r30
#define PR r31
-
+
#ifdef DOUBLE
#define RPREFETCH (16 * 3 + 8)
#else
@@ -150,7 +150,7 @@
adds YY2 = 4 * SIZE, BUFFER
;;
shr I = M, 3
- ;;
+ ;;
{ .mmi
adds I = -1, I
cmp.eq p16, p0 = r0, r0
@@ -254,10 +254,10 @@
;;
shladd LDA7M8 = LDA, 3, r0
;;
- sub LDA7M8 = LDA, LDA7M8
+ sub LDA7M8 = LDA, LDA7M8
;;
adds LDA7M8 = 8 * SIZE, LDA7M8
- ;;
+ ;;
mov f8 = f0
mov f9 = f0
mov f10 = f0
@@ -386,8 +386,8 @@
(p16) FMA f14 = f96, f80, f14
(p16) FMA f15 = f96, f88, f15
;;
- (p16) FMA f8 = f97, f33, f8
- (p16) FMA f9 = f97, f41, f9
+ (p16) FMA f8 = f97, f33, f8
+ (p16) FMA f9 = f97, f41, f9
(p16) FMA f10 = f97, f49, f10
(p16) FMA f11 = f97, f57, f11
(p16) FMA f12 = f97, f65, f12
@@ -404,8 +404,8 @@
(p16) FMA f14 = f98, f82, f14
(p16) FMA f15 = f98, f90, f15
;;
- (p16) FMA f8 = f99, f35, f8
- (p16) FMA f9 = f99, f43, f9
+ (p16) FMA f8 = f99, f35, f8
+ (p16) FMA f9 = f99, f43, f9
(p16) FMA f10 = f99, f51, f10
(p16) FMA f11 = f99, f59, f11
(p16) FMA f12 = f99, f67, f12
@@ -422,8 +422,8 @@
(p16) FMA f14 = f100, f84, f14
(p16) FMA f15 = f100, f92, f15
;;
- (p16) FMA f8 = f101, f37, f8
- (p16) FMA f9 = f101, f45, f9
+ (p16) FMA f8 = f101, f37, f8
+ (p16) FMA f9 = f101, f45, f9
(p16) FMA f10 = f101, f53, f10
(p16) FMA f11 = f101, f61, f11
(p16) FMA f12 = f101, f69, f12
@@ -440,8 +440,8 @@
(p16) FMA f14 = f102, f86, f14
(p16) FMA f15 = f102, f94, f15
;;
- (p16) FMA f8 = f103, f39, f8
- (p16) FMA f9 = f103, f47, f9
+ (p16) FMA f8 = f103, f39, f8
+ (p16) FMA f9 = f103, f47, f9
(p16) FMA f10 = f103, f55, f10
(p16) FMA f11 = f103, f63, f11
(p16) FMA f12 = f103, f71, f12
@@ -563,8 +563,8 @@
(p13) FMA f14 = f96, f80, f14
(p13) FMA f15 = f96, f88, f15
;;
- (p13) FMA f8 = f97, f33, f8
- (p13) FMA f9 = f97, f41, f9
+ (p13) FMA f8 = f97, f33, f8
+ (p13) FMA f9 = f97, f41, f9
(p13) FMA f10 = f97, f49, f10
(p13) FMA f11 = f97, f57, f11
(p13) FMA f12 = f97, f65, f12
@@ -581,8 +581,8 @@
(p13) FMA f14 = f98, f82, f14
(p13) FMA f15 = f98, f90, f15
;;
- (p13) FMA f8 = f99, f35, f8
- (p13) FMA f9 = f99, f43, f9
+ (p13) FMA f8 = f99, f35, f8
+ (p13) FMA f9 = f99, f43, f9
(p13) FMA f10 = f99, f51, f10
(p13) FMA f11 = f99, f59, f11
(p13) FMA f12 = f99, f67, f12
@@ -599,8 +599,8 @@
(p14) FMA f14 = f100, f84, f14
(p14) FMA f15 = f100, f92, f15
;;
- (p14) FMA f8 = f101, f37, f8
- (p14) FMA f9 = f101, f45, f9
+ (p14) FMA f8 = f101, f37, f8
+ (p14) FMA f9 = f101, f45, f9
(p14) FMA f10 = f101, f53, f10
(p14) FMA f11 = f101, f61, f11
(p14) FMA f12 = f101, f69, f12
@@ -690,10 +690,10 @@
;;
shladd LDA7M8 = LDA, 2, r0
;;
- sub LDA7M8 = LDA, LDA7M8
+ sub LDA7M8 = LDA, LDA7M8
;;
adds LDA7M8 = 8 * SIZE, LDA7M8
- ;;
+ ;;
mov f8 = f0
mov f9 = f0
mov f10 = f0
@@ -778,8 +778,8 @@
(p16) FMA f10 = f96, f48, f10
(p16) FMA f11 = f96, f56, f11
;;
- (p16) FMA f8 = f97, f33, f8
- (p16) FMA f9 = f97, f41, f9
+ (p16) FMA f8 = f97, f33, f8
+ (p16) FMA f9 = f97, f41, f9
(p16) FMA f10 = f97, f49, f10
(p16) FMA f11 = f97, f57, f11
;;
@@ -788,8 +788,8 @@
(p16) FMA f10 = f98, f50, f10
(p16) FMA f11 = f98, f58, f11
;;
- (p16) FMA f8 = f99, f35, f8
- (p16) FMA f9 = f99, f43, f9
+ (p16) FMA f8 = f99, f35, f8
+ (p16) FMA f9 = f99, f43, f9
(p16) FMA f10 = f99, f51, f10
(p16) FMA f11 = f99, f59, f11
;;
@@ -799,8 +799,8 @@
(p16) FMA f11 = f100, f60, f11
;;
- (p16) FMA f8 = f101, f37, f8
- (p16) FMA f9 = f101, f45, f9
+ (p16) FMA f8 = f101, f37, f8
+ (p16) FMA f9 = f101, f45, f9
(p16) FMA f10 = f101, f53, f10
(p16) FMA f11 = f101, f61, f11
;;
@@ -809,8 +809,8 @@
(p16) FMA f10 = f102, f54, f10
(p16) FMA f11 = f102, f62, f11
;;
- (p16) FMA f8 = f103, f39, f8
- (p16) FMA f9 = f103, f47, f9
+ (p16) FMA f8 = f103, f39, f8
+ (p16) FMA f9 = f103, f47, f9
(p16) FMA f10 = f103, f55, f10
(p16) FMA f11 = f103, f63, f11
br.ctop.sptk.few .L22
@@ -888,8 +888,8 @@
(p13) FMA f10 = f96, f48, f10
(p13) FMA f11 = f96, f56, f11
;;
- (p13) FMA f8 = f97, f33, f8
- (p13) FMA f9 = f97, f41, f9
+ (p13) FMA f8 = f97, f33, f8
+ (p13) FMA f9 = f97, f41, f9
(p13) FMA f10 = f97, f49, f10
(p13) FMA f11 = f97, f57, f11
;;
@@ -898,8 +898,8 @@
(p13) FMA f10 = f98, f50, f10
(p13) FMA f11 = f98, f58, f11
;;
- (p13) FMA f8 = f99, f35, f8
- (p13) FMA f9 = f99, f43, f9
+ (p13) FMA f8 = f99, f35, f8
+ (p13) FMA f9 = f99, f43, f9
(p13) FMA f10 = f99, f51, f10
(p13) FMA f11 = f99, f59, f11
;;
@@ -908,8 +908,8 @@
(p14) FMA f10 = f100, f52, f10
(p14) FMA f11 = f100, f60, f11
;;
- (p14) FMA f8 = f101, f37, f8
- (p14) FMA f9 = f101, f45, f9
+ (p14) FMA f8 = f101, f37, f8
+ (p14) FMA f9 = f101, f45, f9
(p14) FMA f10 = f101, f53, f10
(p14) FMA f11 = f101, f61, f11
;;
@@ -962,10 +962,10 @@
;;
shladd LDA7M8 = LDA, 1, r0
;;
- sub LDA7M8 = LDA, LDA7M8
+ sub LDA7M8 = LDA, LDA7M8
;;
adds LDA7M8 = 8 * SIZE, LDA7M8
- ;;
+ ;;
mov f8 = f0
mov f9 = f0
mov f10 = f0
@@ -1028,26 +1028,26 @@
(p16) FMA f8 = f96, f32, f8
(p16) FMA f9 = f96, f40, f9
;;
- (p16) FMA f8 = f97, f33, f8
- (p16) FMA f9 = f97, f41, f9
+ (p16) FMA f8 = f97, f33, f8
+ (p16) FMA f9 = f97, f41, f9
;;
(p16) FMA f8 = f98, f34, f8
(p16) FMA f9 = f98, f42, f9
;;
- (p16) FMA f8 = f99, f35, f8
- (p16) FMA f9 = f99, f43, f9
+ (p16) FMA f8 = f99, f35, f8
+ (p16) FMA f9 = f99, f43, f9
;;
(p16) FMA f8 = f100, f36, f8
(p16) FMA f9 = f100, f44, f9
;;
- (p16) FMA f8 = f101, f37, f8
- (p16) FMA f9 = f101, f45, f9
+ (p16) FMA f8 = f101, f37, f8
+ (p16) FMA f9 = f101, f45, f9
;;
(p16) FMA f8 = f102, f38, f8
(p16) FMA f9 = f102, f46, f9
;;
- (p16) FMA f8 = f103, f39, f8
- (p16) FMA f9 = f103, f47, f9
+ (p16) FMA f8 = f103, f39, f8
+ (p16) FMA f9 = f103, f47, f9
br.ctop.sptk.few .L32
;;
.align 16
@@ -1103,20 +1103,20 @@
(p13) FMA f8 = f96, f32, f8
(p13) FMA f9 = f96, f40, f9
;;
- (p13) FMA f8 = f97, f33, f8
- (p13) FMA f9 = f97, f41, f9
+ (p13) FMA f8 = f97, f33, f8
+ (p13) FMA f9 = f97, f41, f9
;;
(p13) FMA f8 = f98, f34, f8
(p13) FMA f9 = f98, f42, f9
;;
- (p13) FMA f8 = f99, f35, f8
- (p13) FMA f9 = f99, f43, f9
+ (p13) FMA f8 = f99, f35, f8
+ (p13) FMA f9 = f99, f43, f9
;;
(p14) FMA f8 = f100, f36, f8
(p14) FMA f9 = f100, f44, f9
;;
- (p14) FMA f8 = f101, f37, f8
- (p14) FMA f9 = f101, f45, f9
+ (p14) FMA f8 = f101, f37, f8
+ (p14) FMA f9 = f101, f45, f9
;;
(p15) FMA f8 = f102, f38, f8
(p15) FMA f9 = f102, f46, f9
@@ -1202,19 +1202,19 @@
;;
(p16) FMA f8 = f96, f32, f8
;;
- (p16) FMA f8 = f97, f33, f8
+ (p16) FMA f8 = f97, f33, f8
;;
(p16) FMA f8 = f98, f34, f8
;;
- (p16) FMA f8 = f99, f35, f8
+ (p16) FMA f8 = f99, f35, f8
;;
(p16) FMA f8 = f100, f36, f8
;;
- (p16) FMA f8 = f101, f37, f8
+ (p16) FMA f8 = f101, f37, f8
;;
(p16) FMA f8 = f102, f38, f8
;;
- (p16) FMA f8 = f103, f39, f8
+ (p16) FMA f8 = f103, f39, f8
br.ctop.sptk.few .L42
;;
.align 16
@@ -1260,15 +1260,15 @@
;;
(p13) FMA f8 = f96, f32, f8
;;
- (p13) FMA f8 = f97, f33, f8
+ (p13) FMA f8 = f97, f33, f8
;;
(p13) FMA f8 = f98, f34, f8
;;
- (p13) FMA f8 = f99, f35, f8
+ (p13) FMA f8 = f99, f35, f8
;;
(p14) FMA f8 = f100, f36, f8
;;
- (p14) FMA f8 = f101, f37, f8
+ (p14) FMA f8 = f101, f37, f8
;;
(p15) FMA f8 = f102, f38, f8
;;
diff --git a/kernel/ia64/qscal.S b/kernel/ia64/qscal.S
index 3f978af..7a45d9a 100644
--- a/kernel/ia64/qscal.S
+++ b/kernel/ia64/qscal.S
@@ -75,7 +75,7 @@
}
;;
{ .mmi
- mov XX = X1
+ mov XX = X1
mov PR = pr
}
{ .mmi
diff --git a/kernel/ia64/saxpy.S b/kernel/ia64/saxpy.S
index c3b2c1b..fb8f9ff 100644
--- a/kernel/ia64/saxpy.S
+++ b/kernel/ia64/saxpy.S
@@ -64,7 +64,7 @@
#define XB r29
#define PR r30
#define ARLC r31
-
+
#define ALPHA f8
#define ALPHA_P f9
@@ -92,7 +92,7 @@
}
{ .mib
(p10) adds N = -1, N
- mov YYY = Y
+ mov YYY = Y
(p7) br.ret.sptk.many b0
}
;;
@@ -548,7 +548,7 @@
(p18) fpma f14 = ALPHA_P, f52, f100
}
{ .mmi
- (p17) ldf8 f66 = [X], 2 * SIZE
+ (p17) ldf8 f66 = [X], 2 * SIZE
(p16) ldf8 f86 = [Y], 2 * SIZE
}
;;
@@ -1485,7 +1485,7 @@
(p16) LDFD f71 = [X], INCX
(p16) LDFD f119 = [Y], INCY
(p17) FMA f13 = ALPHA, f48, f96
- }
+ }
;;
{ .mmi
(p18) STFD [Y1] = f14
diff --git a/kernel/ia64/scal.S b/kernel/ia64/scal.S
index e3d93dd..ad59b58 100644
--- a/kernel/ia64/scal.S
+++ b/kernel/ia64/scal.S
@@ -81,7 +81,7 @@
.body
;;
{ .mmi
- mov XX = X1
+ mov XX = X1
(p10) LDFD f32 = [X1], INCX
mov PR = pr
}
diff --git a/kernel/ia64/sdot.S b/kernel/ia64/sdot.S
index 5a058e7..c611c11 100644
--- a/kernel/ia64/sdot.S
+++ b/kernel/ia64/sdot.S
@@ -343,7 +343,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
@@ -514,7 +514,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
@@ -745,7 +745,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
@@ -916,7 +916,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
@@ -1148,7 +1148,7 @@
(p12) FMA f13 = f41, f43, f13
(p12) FMA f14 = f44, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f49, f51, f9
(p13) FMA f10 = f52, f54, f10
diff --git a/kernel/ia64/sgemv_n.S b/kernel/ia64/sgemv_n.S
index f5949e6..e44a8ec 100644
--- a/kernel/ia64/sgemv_n.S
+++ b/kernel/ia64/sgemv_n.S
@@ -84,13 +84,13 @@
#define AO61 loc13
#define AO71 loc14
#define AO81 loc15
-
+
#define PREB r8
#define ARLC r29
#define PR r30
#define ARPFS r31
-
+
#ifdef DOUBLE
#define RPREFETCH (16 * 3 + 8)
#else
@@ -120,17 +120,17 @@
;;
stf.spill [r8] = f16, 32
stf.spill [r9] = f17, 32
- ;;
+ ;;
stf.spill [r8] = f18, 32
stf.spill [r9] = f19, 32
- ;;
+ ;;
stf.spill [r8] = f20, 32
stf.spill [r9] = f21, 32
;;
stf.spill [r8] = f22
stf.spill [r9] = f23
.body
- ;;
+ ;;
ld8 Y = [r14]
ld8 INCY = [r15]
@@ -3225,15 +3225,15 @@
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
mov ar.lc = ARLC
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
mov pr = PR, -1
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
mov ar.pfs = ARPFS
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9]
br.ret.sptk.many b0
diff --git a/kernel/ia64/symv_U.S b/kernel/ia64/symv_U.S
index 4f6c451..aa125d5 100644
--- a/kernel/ia64/symv_U.S
+++ b/kernel/ia64/symv_U.S
@@ -73,14 +73,14 @@
#define A21 loc5
#define A31 loc6
#define A41 loc7
-
+
#define PREX r8
#define PREY r9
#define ARLC r29
#define PR r30
#define ARPFS r31
-
+
#ifdef DOUBLE
#define RPREFETCH (16 * 3 + 4)
#else
@@ -119,17 +119,17 @@
;;
stf.spill [r8] = f16, 32
stf.spill [r9] = f17, 32
- ;;
+ ;;
stf.spill [r8] = f18, 32
stf.spill [r9] = f19, 32
- ;;
+ ;;
stf.spill [r8] = f20, 32
stf.spill [r9] = f21, 32
;;
stf.spill [r8] = f22
stf.spill [r9] = f23
.body
- ;;
+ ;;
ld8 BUFFER = [r14]
;;
shladd LDA = LDA, BASE_SHIFT, r0
@@ -447,15 +447,15 @@
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
mov ar.lc = ARLC
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
mov pr = PR, -1
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
mov ar.pfs = ARPFS
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9]
br.ret.sptk.many b0
diff --git a/kernel/ia64/trsm_kernel_LN.S b/kernel/ia64/trsm_kernel_LN.S
index 9b1f2b2..6c18b72 100644
--- a/kernel/ia64/trsm_kernel_LN.S
+++ b/kernel/ia64/trsm_kernel_LN.S
@@ -365,7 +365,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 3, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -673,7 +673,7 @@
FNMA f112 = f104, f17, f112
;;
FNMA f120 = f104, f18, f120
- ;;
+ ;;
FMPY f112 = f112, f19
;;
FNMA f120 = f112, f20, f120
@@ -1207,7 +1207,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 3, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -1750,7 +1750,7 @@
;;
FNMA f120 = f104, f18, f120
FNMA f121 = f105, f18, f121
- ;;
+ ;;
FMPY f112 = f112, f19
FMPY f113 = f113, f19
;;
@@ -2584,7 +2584,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 3, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -3561,7 +3561,7 @@
FNMA f121 = f105, f18, f121
FNMA f122 = f106, f18, f122
FNMA f123 = f107, f18, f123
- ;;
+ ;;
FMPY f112 = f112, f19
FMPY f113 = f113, f19
FMPY f114 = f114, f19
@@ -4487,7 +4487,7 @@
FMA f106 = f34, f53, f106 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfb
FMA f114 = f34, f54, f114 // A3 * B7
@@ -5127,7 +5127,7 @@
;;
shladd AOFFSET = r2, 3, AORIG
shladd BOFFSET = r2, 3, B
- ;;
+ ;;
LDFPD f32, f33 = [BOFFSET], 2 * SIZE
;;
@@ -6628,7 +6628,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -7055,7 +7055,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -7699,7 +7699,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -8837,7 +8837,7 @@
;;
shladd AOFFSET = r2, 3, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -9228,7 +9228,7 @@
FMPY f80 = f80, f21
FMPY f88 = f88, f21
;;
-
+
adds BOFFSET = 24 * SIZE, BOFFSET
adds BOFFSET2 = 24 * SIZE, BOFFSET2
;;
@@ -9579,7 +9579,7 @@
;;
LDFPD f37, f38 = [BOFFSET]
adds BOFFSET = 4 * SIZE, BOFFSET
- ;;
+ ;;
LDFPD f39, f40 = [BOFFSET]
adds BOFFSET = 5 * SIZE, BOFFSET
;;
@@ -10276,7 +10276,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -10556,7 +10556,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -10954,7 +10954,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -11612,7 +11612,7 @@
;;
shladd AOFFSET = r2, 3, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -12525,7 +12525,7 @@
;;
add AOFFSET = r2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -12721,7 +12721,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -13019,7 +13019,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -13493,7 +13493,7 @@
;;
shladd AOFFSET = r2, 3, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -14011,13 +14011,13 @@
;;
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
- ;;
+ ;;
mov ar.lc = ARLC
;;
mov pr = PR, -1
diff --git a/kernel/ia64/trsm_kernel_LT.S b/kernel/ia64/trsm_kernel_LT.S
index eef4e00..c11167e 100644
--- a/kernel/ia64/trsm_kernel_LT.S
+++ b/kernel/ia64/trsm_kernel_LT.S
@@ -171,7 +171,7 @@
cmp.eq p6, p7 = 0, I
mov AOFFSET = A
mov f80 = f0
- }
+ }
{ .mmf
add C2 = LDC, C // coffset2 = c + 1 * ldc
shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
@@ -496,7 +496,7 @@
FMA f106 = f34, f53, f106 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfb
FMA f114 = f34, f54, f114 // A3 * B7
@@ -4472,7 +4472,7 @@
LDFD f21 = [BOFFSET]
adds BOFFSET = -63 * SIZE, BOFFSET
;;
-
+
FMPY f64 = f64, f32
FMPY f65 = f65, f32
@@ -5497,7 +5497,7 @@
;;
FNMA f112 = f104, f17, f112
FNMA f113 = f105, f17, f113
- ;;
+ ;;
{ .mfi
STFD [AOFFSET] = f64, SIZE
FNMA f120 = f104, f18, f120
@@ -6029,7 +6029,7 @@
FNMA f112 = f104, f17, f112
;;
FNMA f120 = f104, f18, f120
- ;;
+ ;;
FMPY f112 = f112, f19
;;
FNMA f120 = f112, f20, f120
@@ -6118,7 +6118,7 @@
setf.d f72 = r0
mov f80 = f0
shr I = M, 3
- }
+ }
{ .mfi
mov C1 = C // coffset1 = c + 0 * ldc
mov f88 = f0
@@ -7059,7 +7059,7 @@
;;
LDFPD f37, f38 = [BOFFSET]
adds BOFFSET = 4 * SIZE, BOFFSET
- ;;
+ ;;
LDFPD f39, f40 = [BOFFSET]
adds BOFFSET = 5 * SIZE, BOFFSET
;;
@@ -8548,7 +8548,7 @@
;;
{ .mfi
shr I = M, 3
- }
+ }
{ .mfi
mov C1 = C // coffset1 = c + 0 * ldc
#ifdef LT
@@ -9987,7 +9987,7 @@
{ .mfi
shr I = M, 3
- }
+ }
{ .mfi
mov C1 = C // coffset1 = c + 0 * ldc
#ifdef LT
@@ -11010,13 +11010,13 @@
;;
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
- ;;
+ ;;
mov ar.lc = ARLC
;;
mov pr = PR, -1
diff --git a/kernel/ia64/trsm_kernel_RT.S b/kernel/ia64/trsm_kernel_RT.S
index f3482ae..5e09112 100644
--- a/kernel/ia64/trsm_kernel_RT.S
+++ b/kernel/ia64/trsm_kernel_RT.S
@@ -234,7 +234,7 @@
{ .mfi
shr I = M, 3
- }
+ }
{ .mfi
mov C1 = C // coffset1 = c + 0 * ldc
#ifdef LN
@@ -467,7 +467,7 @@
;;
shladd AOFFSET = r2, 3, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -1103,7 +1103,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -1463,7 +1463,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -1711,7 +1711,7 @@
;;
add AOFFSET = r2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -1846,7 +1846,7 @@
;;
{ .mfi
shr I = M, 3
- }
+ }
{ .mfi
mov C1 = C // coffset1 = c + 0 * ldc
#ifdef LN
@@ -2184,7 +2184,7 @@
;;
shladd AOFFSET = r2, 3, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -3097,7 +3097,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -3585,7 +3585,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -3914,7 +3914,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -4109,7 +4109,7 @@
setf.d f72 = r0
mov f80 = f0
shr I = M, 3
- }
+ }
{ .mfi
mov C1 = C // coffset1 = c + 0 * ldc
mov f88 = f0
@@ -4656,7 +4656,7 @@
;;
shladd AOFFSET = r2, 3, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -5047,7 +5047,7 @@
FMPY f80 = f80, f21
FMPY f88 = f88, f21
;;
-
+
adds BOFFSET = 24 * SIZE, BOFFSET
adds BOFFSET2 = 24 * SIZE, BOFFSET2
;;
@@ -5398,7 +5398,7 @@
;;
LDFPD f37, f38 = [BOFFSET]
adds BOFFSET = 4 * SIZE, BOFFSET
- ;;
+ ;;
LDFPD f39, f40 = [BOFFSET]
adds BOFFSET = 5 * SIZE, BOFFSET
;;
@@ -6175,7 +6175,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -6981,7 +6981,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -7469,7 +7469,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -7786,7 +7786,7 @@
mov AOFFSET = A
#endif
mov f80 = f0
- }
+ }
{ .mmf
add C2 = LDC, C // coffset2 = c + 1 * ldc
shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
@@ -8153,7 +8153,7 @@
FMA f106 = f34, f53, f106 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfb
FMA f114 = f34, f54, f114 // A3 * B7
@@ -8798,7 +8798,7 @@
;;
shladd AOFFSET = r2, 3, AORIG
shladd BOFFSET = r2, 3, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -13126,7 +13126,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 3, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -13937,7 +13937,7 @@
LDFD f21 = [BOFFSET]
adds BOFFSET = -63 * SIZE, BOFFSET
;;
-
+
FMPY f64 = f64, f32
FMPY f65 = f65, f32
@@ -14103,7 +14103,7 @@
FNMA f121 = f105, f18, f121
FNMA f122 = f106, f18, f122
FNMA f123 = f107, f18, f123
- ;;
+ ;;
FMPY f112 = f112, f19
FMPY f113 = f113, f19
FMPY f114 = f114, f19
@@ -14972,7 +14972,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 3, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -15515,7 +15515,7 @@
;;
FNMA f120 = f104, f18, f120
FNMA f121 = f105, f18, f121
- ;;
+ ;;
FMPY f112 = f112, f19
FMPY f113 = f113, f19
;;
@@ -16061,7 +16061,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 3, B
- ;;
+ ;;
#endif
adds AOFFSET2 = 4 * SIZE, AOFFSET
adds BOFFSET2 = 4 * SIZE, BOFFSET
@@ -16369,7 +16369,7 @@
FNMA f112 = f104, f17, f112
;;
FNMA f120 = f104, f18, f120
- ;;
+ ;;
FMPY f112 = f112, f19
;;
FNMA f120 = f112, f20, f120
@@ -16671,13 +16671,13 @@
;;
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
- ;;
+ ;;
mov ar.lc = ARLC
;;
mov pr = PR, -1
diff --git a/kernel/ia64/xdot.S b/kernel/ia64/xdot.S
index 9322b4b..cdf85cc 100644
--- a/kernel/ia64/xdot.S
+++ b/kernel/ia64/xdot.S
@@ -112,7 +112,7 @@
;;
shl r26 = r26, ZBASE_SHIFT
shl r27 = r27, ZBASE_SHIFT
- ;;
+ ;;
(p6) add X1 = r26, X1
(p7) add Y1 = r27, Y1
;;
@@ -481,12 +481,12 @@
(p13) FMA f13 = f43, f58, f13
(p13) FMA f14 = f42, f59, f14
(p13) FMA f15 = f43, f59, f15
- ;;
+ ;;
(p14) FMA f8 = f44, f60, f8
(p14) FMA f9 = f45, f60, f9
(p14) FMA f10 = f44, f61, f10
(p14) FMA f11 = f45, f61, f11
- ;;
+ ;;
.align 32
.L999:
@@ -505,7 +505,7 @@
#endif
;;
.align 32
-
+
.L1000:
#ifdef F_INTERFACE
STFD [r32] = f8, SIZE
diff --git a/kernel/ia64/zcopy.S b/kernel/ia64/zcopy.S
index 91d90e0..90c09bb 100644
--- a/kernel/ia64/zcopy.S
+++ b/kernel/ia64/zcopy.S
@@ -75,7 +75,7 @@
PROLOGUE
.prologue
PROFCODE
-
+
{ .mmi
shladd INCX = INCX, ZBASE_SHIFT, r0
shladd INCY = INCY, ZBASE_SHIFT, r0
diff --git a/kernel/ia64/zdot.S b/kernel/ia64/zdot.S
index 5c77ce6..35032b7 100644
--- a/kernel/ia64/zdot.S
+++ b/kernel/ia64/zdot.S
@@ -98,7 +98,7 @@
LDINT N = [N]
LDINT INCX = [INCX]
LDINT INCY = [INCY]
- ;;
+ ;;
#ifndef USE64BITINT
sxt4 N = N
sxt4 INCX = INCX
@@ -442,7 +442,7 @@
(p12) FMA f13 = f44, f47, f13
(p12) FMA f14 = f45, f46, f14
(p12) FMA f15 = f45, f47, f15
- ;;
+ ;;
(p13) FMA f8 = f48, f50, f8
(p13) FMA f9 = f48, f51, f9
(p13) FMA f10 = f49, f50, f10
@@ -451,7 +451,7 @@
(p13) FMA f13 = f52, f55, f13
(p13) FMA f14 = f53, f54, f14
(p13) FMA f15 = f53, f55, f15
- ;;
+ ;;
(p14) FMA f8 = f56, f58, f8
(p14) FMA f9 = f56, f59, f9
(p14) FMA f10 = f57, f58, f10
@@ -474,7 +474,7 @@
#endif
;;
.align 32
-
+
.L1000:
#if defined(F_INTERFACE) && defined(RETURN_BY_STACK)
STFD [r32] = f8, SIZE
diff --git a/kernel/ia64/zgemm3m_kernel.S b/kernel/ia64/zgemm3m_kernel.S
index 5adb66a..dc6d252 100644
--- a/kernel/ia64/zgemm3m_kernel.S
+++ b/kernel/ia64/zgemm3m_kernel.S
@@ -117,11 +117,11 @@
nop __LINE__
nop __LINE__
}
- ;;
+ ;;
stf.spill [r8] = f18, 32
stf.spill [r9] = f19, 32
shr J = N, 3
- ;;
+ ;;
stf.spill [r8] = f20, 32
stf.spill [r9] = f21, 32
shladd LDC = LDC, ZBASE_SHIFT, r0
@@ -129,17 +129,17 @@
stf.spill [r8] = f22, 32
stf.spill [r9] = f23, 32
mov AOFFSET = A
- ;;
+ ;;
stf.spill [r8] = f24, 32
stf.spill [r9] = f25, 32
cmp.ge p6, p0 = 0, J
- ;;
+ ;;
stf.spill [r8] = f26, 32
stf.spill [r9] = f27, 32
- ;;
+ ;;
stf.spill [r8] = f28, 32
stf.spill [r9] = f29, 32
- ;;
+ ;;
stf.spill [r8] = f30
stf.spill [r9] = f31
(p6) br.cond.dpnt .L050
@@ -162,7 +162,7 @@
cmp.eq p6, p7 = 0, I
nop __LINE__
mov f80 = f0
- }
+ }
{ .mmf
add C2 = LDC, C // coffset2 = c + 1 * ldc
shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
@@ -492,7 +492,7 @@
FMA f106 = f34, f53, f106 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfi
FMA f114 = f34, f54, f114 // A3 * B7
@@ -3450,7 +3450,7 @@
cmp.eq p6, p7 = 0, I
mov f65 = f0
nop __LINE__
- }
+ }
{ .mfi
shladd C4 = LDC, 1, C2
mov f73 = f0
@@ -6705,7 +6705,7 @@
adds L = 1, K
}
;;
- { .mii
+ { .mii
LDFD f32 = [AOFFSET], 1 * SIZE
tbit.z p12, p0 = L, 0
shr L = L, 1
@@ -6774,13 +6774,13 @@
;;
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9], 32
mov ar.lc = ARLC
diff --git a/kernel/ia64/zgemm_beta.S b/kernel/ia64/zgemm_beta.S
index 00cf3e9..654cb86 100644
--- a/kernel/ia64/zgemm_beta.S
+++ b/kernel/ia64/zgemm_beta.S
@@ -77,7 +77,7 @@
{ .mfb
cmp.ge p6, p0 = 0, N
fcmp.eq p0, p14 = BETA_R, f0
- (p6) br.ret.sptk.many b0
+ (p6) br.ret.sptk.many b0
}
;;
.body
@@ -95,7 +95,7 @@
{ .mmb
cmp.ge p6, p0 = 0, M
adds I = -1, I
- (p6) br.ret.sptk.many b0
+ (p6) br.ret.sptk.many b0
}
;;
{ .mbb
@@ -199,7 +199,7 @@
{ .mmi
(p12) STFD [CO1] = f0, 1 * SIZE
(p12) STFD [CO2] = f0, 1 * SIZE
- (p12) adds CO3 = 8 * SIZE, CO3
+ (p12) adds CO3 = 8 * SIZE, CO3
}
;;
{ .mmi
@@ -397,7 +397,7 @@
{ .mmi
(p12) LDFD f34 = [CO1], 1 * SIZE
(p12) LDFD f38 = [CO2], 1 * SIZE
- (p12) adds CO3 = 8 * SIZE, CO3
+ (p12) adds CO3 = 8 * SIZE, CO3
}
;;
{ .mmi
@@ -462,7 +462,7 @@
(p12) STFD [DO2] = f36, 1 * SIZE
}
{ .mmf
- (p12) adds DO3 = 8 * SIZE, DO3
+ (p12) adds DO3 = 8 * SIZE, DO3
}
;;
{ .mmf
@@ -470,7 +470,7 @@
(p12) STFD [DO2] = f37, 1 * SIZE
}
{ .mmf
- (p13) adds DO3 = 4 * SIZE, DO3
+ (p13) adds DO3 = 4 * SIZE, DO3
}
;;
{ .mmf
diff --git a/kernel/ia64/zgemm_kernel.S b/kernel/ia64/zgemm_kernel.S
index bfdb92c..34207c5 100644
--- a/kernel/ia64/zgemm_kernel.S
+++ b/kernel/ia64/zgemm_kernel.S
@@ -462,7 +462,7 @@
FMA_B f99 = f34, f53, f99 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfb
FMA f114 = f34, f54, f114 // A3 * B7
@@ -2112,7 +2112,7 @@
mov f82 = f0
tbit.z p12, p0 = L, 0
}
- { .mfi
+ { .mfi
LDFPD f50, f51 = [BOFFSET], 2 * SIZE
mov f83 = f0
shr L = L, 1
@@ -2134,7 +2134,7 @@
mov f114 = f0
mov ar.lc = L
}
- { .mfi
+ { .mfi
adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
mov f115 = f0
nop __LINE__
@@ -2650,7 +2650,7 @@
FMA f120 = ALPHA_R, f112, f120
nop __LINE__
}
- ;;
+ ;;
{ .mfb
STFD [C1] = f73, SIZE
FCALC_C f105 = ALPHA_R, f97, f105
@@ -2661,7 +2661,7 @@
FCALC_C f121 = ALPHA_R, f113, f121
nop __LINE__
}
- ;;
+ ;;
{ .mfb
STFD [C1] = f74, SIZE
FMA f106 = ALPHA_R, f98, f106
@@ -2672,7 +2672,7 @@
FMA f122 = ALPHA_R, f114, f122
nop __LINE__
}
- ;;
+ ;;
{ .mfb
STFD [C1] = f75, SIZE
FCALC_C f107 = ALPHA_R, f99, f107
@@ -3108,7 +3108,7 @@
}
;;
{ .mfi
- LDFPD f54, f55 = [BOFFSET], 2 * SIZE
+ LDFPD f54, f55 = [BOFFSET], 2 * SIZE
mov f120 = f0
mov ar.lc = L
}
@@ -5490,7 +5490,7 @@
STFD [C2] = f88, SIZE
mov f80 = f0
}
- ;;
+ ;;
{ .mmi
STFD [C1] = f73, SIZE
STFD [C2] = f89, SIZE
@@ -5585,7 +5585,7 @@
nop __LINE__
#endif
}
- ;;
+ ;;
{ .mmi
STFD [C1] = f73, SIZE
STFD [C2] = f89, SIZE
@@ -6772,7 +6772,7 @@
setf.d f64 = r0
mov f80 = f0
}
- ;;
+ ;;
{ .mmf
STFD [C1] = f73, SIZE
setf.d f65 = r0
@@ -6807,7 +6807,7 @@
setf.d f64 = r0
mov f80 = f0
}
- ;;
+ ;;
{ .mmf
STFD [C1] = f73, SIZE
setf.d f65 = r0
diff --git a/kernel/ia64/zgemm_ncopy.S b/kernel/ia64/zgemm_ncopy.S
index e7950e9..e62a2d8 100644
--- a/kernel/ia64/zgemm_ncopy.S
+++ b/kernel/ia64/zgemm_ncopy.S
@@ -44,7 +44,7 @@
#define LD LDF8
#define ST STF8_NTA
-
+
#define TEMP r2
#define I r14
@@ -77,7 +77,7 @@
.prologue
PROFCODE
- .body
+ .body
{ .mii
shladd LDA= LDA, ZBASE_SHIFT, r0
mov PR = pr
diff --git a/kernel/ia64/zgemv_n.S b/kernel/ia64/zgemv_n.S
index b3027a6..92294eb 100644
--- a/kernel/ia64/zgemv_n.S
+++ b/kernel/ia64/zgemv_n.S
@@ -67,7 +67,7 @@
#define YST2 r27
#define YY r28
#define XX r9
-
+
#define RPRE1 loc0
#define RPRE2 loc1
#define RPRE3 loc2
@@ -94,7 +94,7 @@
#define ARLC r29
#define PR r30
#define ARPFS r31
-
+
#ifdef DOUBLE
#define RPREFETCH (16 * 2 + 8)
#else
@@ -148,16 +148,16 @@
;;
stf.spill [r8] = f16, 32
stf.spill [r9] = f17, 32
- ;;
+ ;;
stf.spill [r8] = f18, 32
stf.spill [r9] = f19, 32
- ;;
+ ;;
stf.spill [r8] = f20, 32
stf.spill [r9] = f21, 32
;;
stf.spill [r8] = f22
stf.spill [r9] = f23
- ;;
+ ;;
ld8 INCX = [r14]
ld8 Y = [r15]
ld8 INCY = [r16]
@@ -2277,15 +2277,15 @@
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
mov ar.lc = ARLC
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
mov pr = PR, -1
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
mov ar.pfs = ARPFS
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9]
br.ret.sptk.many b0
diff --git a/kernel/ia64/zgemv_t.S b/kernel/ia64/zgemv_t.S
index 73e6df0..831bc50 100644
--- a/kernel/ia64/zgemv_t.S
+++ b/kernel/ia64/zgemv_t.S
@@ -81,7 +81,7 @@
#define CLD2 loc13
#define CST1 loc14
#define CST2 loc15
-
+
#define PREB r8
#define WPRE r9
#define OFFSET PREB
@@ -91,7 +91,7 @@
#define ARLC r29
#define PR r30
#define ARPFS r31
-
+
#ifdef DOUBLE
#define RPREFETCH (16 * 2 + 8)
#else
@@ -145,7 +145,7 @@
stf.spill [r9] = f17, 32
mov PR = pr
}
- ;;
+ ;;
{ .mmi
stf.spill [r8] = f18, 32
stf.spill [r9] = f19, 32
@@ -164,7 +164,7 @@
adds r17 = 168, SP
}
.body
- ;;
+ ;;
{ .mmf
ld8 INCX = [r14]
ld8 Y = [r15]
@@ -2001,15 +2001,15 @@
ldf.fill f16 = [SP], 32
ldf.fill f17 = [r9], 32
mov ar.lc = ARLC
- ;;
+ ;;
ldf.fill f18 = [SP], 32
ldf.fill f19 = [r9], 32
mov pr = PR, -1
- ;;
+ ;;
ldf.fill f20 = [SP], 32
ldf.fill f21 = [r9], 32
mov ar.pfs = ARPFS
- ;;
+ ;;
ldf.fill f22 = [SP], 32
ldf.fill f23 = [r9]
br.ret.sptk.many b0
diff --git a/kernel/ia64/zscal.S b/kernel/ia64/zscal.S
index e97feda..1acc0ed 100644
--- a/kernel/ia64/zscal.S
+++ b/kernel/ia64/zscal.S
@@ -58,7 +58,7 @@
#define X1 r37
#define INCX r38
#endif
-
+
#define X2 r16
#define Y1 r17
#define INCX3 r18
diff --git a/kernel/ia64/zswap.S b/kernel/ia64/zswap.S
index 8251b14..165f387 100644
--- a/kernel/ia64/zswap.S
+++ b/kernel/ia64/zswap.S
@@ -93,7 +93,7 @@
cmp.gt p15, p0 = r0, N
(p15) br.ret.sptk.many b0
}
- ;;
+ ;;
#ifdef XDOUBLE
{ .mmi
ld8 X = [r14]
@@ -152,7 +152,7 @@
;;
{ .mmi
adds PRE1 = PREFETCH_SIZE * SIZE, X
- adds PRE2 = PREFETCH_SIZE * SIZE, Y
+ adds PRE2 = PREFETCH_SIZE * SIZE, Y
mov ar.lc = I
}
{ .mib
@@ -323,7 +323,7 @@
(p16) LDFD f125 = [Y], INCYM1
(p18) add YY = YY, INCYM1
}
- { .mmb
+ { .mmb
(p16) lfetch.excl.nt1 [PRE1], INCX8
(p16) lfetch.excl.nt1 [PRE2], INCY8
br.ctop.sptk.few .L52
diff --git a/kernel/ia64/ztrsm_kernel_LN.S b/kernel/ia64/ztrsm_kernel_LN.S
index ef903e3..c8461a2 100644
--- a/kernel/ia64/ztrsm_kernel_LN.S
+++ b/kernel/ia64/ztrsm_kernel_LN.S
@@ -362,7 +362,7 @@
}
;;
{ .mfi
- (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
+ (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
mov f120 = f0
mov ar.lc = L
}
@@ -566,7 +566,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -1009,7 +1009,7 @@
mov f82 = f0
tbit.z p12, p0 = L, 0
}
- { .mfi
+ { .mfi
(p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
mov f83 = f0
shr L = L, 1
@@ -1031,7 +1031,7 @@
mov f114 = f0
mov ar.lc = L
}
- { .mfi
+ { .mfi
adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
mov f115 = f0
nop __LINE__
@@ -1404,7 +1404,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -2427,7 +2427,7 @@
FMA_B f99 = f34, f53, f99 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfb
FMA f114 = f34, f54, f114 // A3 * B7
@@ -3072,7 +3072,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -5364,7 +5364,7 @@
nop __LINE__
}
;;
- { .mfi
+ { .mfi
STFD [C2 ] = f87, SIZE
mov f112 = f0
adds I = -1, I
@@ -7542,7 +7542,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -8003,7 +8003,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -8787,7 +8787,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -9622,7 +9622,7 @@
;;
add AOFFSET = r2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -9951,7 +9951,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -10432,7 +10432,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
diff --git a/kernel/ia64/ztrsm_kernel_LT.S b/kernel/ia64/ztrsm_kernel_LT.S
index 6c7a8ca..88d69e2 100644
--- a/kernel/ia64/ztrsm_kernel_LT.S
+++ b/kernel/ia64/ztrsm_kernel_LT.S
@@ -548,7 +548,7 @@
FMA_B f99 = f34, f53, f99 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfb
FMA f114 = f34, f54, f114 // A3 * B7
@@ -1193,7 +1193,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -3485,7 +3485,7 @@
nop __LINE__
}
;;
- { .mfi
+ { .mfi
STFD [C2 ] = f87, SIZE
mov f112 = f0
adds I = -1, I
@@ -5453,7 +5453,7 @@
mov f82 = f0
tbit.z p12, p0 = L, 0
}
- { .mfi
+ { .mfi
(p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
mov f83 = f0
shr L = L, 1
@@ -5475,7 +5475,7 @@
mov f114 = f0
mov ar.lc = L
}
- { .mfi
+ { .mfi
adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
mov f115 = f0
nop __LINE__
@@ -5848,7 +5848,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -6687,7 +6687,7 @@
}
;;
{ .mfi
- (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
+ (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
mov f120 = f0
mov ar.lc = L
}
@@ -6891,7 +6891,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -7819,7 +7819,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -8692,7 +8692,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -9199,7 +9199,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -9750,7 +9750,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -10312,7 +10312,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -10662,7 +10662,7 @@
;;
add AOFFSET = r2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
diff --git a/kernel/ia64/ztrsm_kernel_RT.S b/kernel/ia64/ztrsm_kernel_RT.S
index 582e2e5..c1c0ffc 100644
--- a/kernel/ia64/ztrsm_kernel_RT.S
+++ b/kernel/ia64/ztrsm_kernel_RT.S
@@ -550,7 +550,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -1112,7 +1112,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -1462,7 +1462,7 @@
;;
add AOFFSET = r2, AORIG
add BOFFSET = r2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -2133,7 +2133,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -3006,7 +3006,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -3513,7 +3513,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 1, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -4064,7 +4064,7 @@
FMA_B f99 = f34, f53, f99 // A3 * B6
nop __LINE__
}
- ;;
+ ;;
/* 12 */
{ .mfb
FMA f114 = f34, f54, f114 // A3 * B7
@@ -4709,7 +4709,7 @@
;;
shladd AOFFSET = r2, 2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -7001,7 +7001,7 @@
nop __LINE__
}
;;
- { .mfi
+ { .mfi
STFD [C2 ] = f87, SIZE
mov f112 = f0
adds I = -1, I
@@ -8969,7 +8969,7 @@
mov f82 = f0
tbit.z p12, p0 = L, 0
}
- { .mfi
+ { .mfi
(p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
mov f83 = f0
shr L = L, 1
@@ -8991,7 +8991,7 @@
mov f114 = f0
mov ar.lc = L
}
- { .mfi
+ { .mfi
adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
mov f115 = f0
nop __LINE__
@@ -9364,7 +9364,7 @@
;;
shladd AOFFSET = r2, 1, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
@@ -10203,7 +10203,7 @@
}
;;
{ .mfi
- (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
+ (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
mov f120 = f0
mov ar.lc = L
}
@@ -10407,7 +10407,7 @@
;;
add AOFFSET = r2, AORIG
shladd BOFFSET = r2, 2, B
- ;;
+ ;;
#endif
#if defined(LN) || defined(LT)
diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A
index fc247e4..2d03ad7 100644
--- a/kernel/mips64/KERNEL.LOONGSON3A
+++ b/kernel/mips64/KERNEL.LOONGSON3A
@@ -11,7 +11,7 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c
-SGEMMKERNEL = sgemm_kernel_8x4_ps.S
+SGEMMKERNEL = sgemm_kernel_8x4_ps.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B
index df4380d..e476c63 100644
--- a/kernel/mips64/KERNEL.LOONGSON3B
+++ b/kernel/mips64/KERNEL.LOONGSON3B
@@ -15,13 +15,13 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
-SGEMMKERNEL = ../generic/gemmkernel_2x2.c
+SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
-DGEMMKERNEL = ../generic/gemmkernel_2x2.c
+DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
diff --git a/kernel/mips64/amax.S b/kernel/mips64/amax.S
index 30c35ba..4467879 100644
--- a/kernel/mips64/amax.S
+++ b/kernel/mips64/amax.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -66,7 +66,7 @@
#define s4 $f3
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/amin.S b/kernel/mips64/amin.S
index 47108b1..c7d41a1 100644
--- a/kernel/mips64/amin.S
+++ b/kernel/mips64/amin.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -66,7 +66,7 @@
#define s4 $f3
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/asum.S b/kernel/mips64/asum.S
index 447c2f7..2bf95c6 100644
--- a/kernel/mips64/asum.S
+++ b/kernel/mips64/asum.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -64,7 +64,7 @@
#define s2 $f1
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/axpy.S b/kernel/mips64/axpy.S
index f7d8887..32694a9 100644
--- a/kernel/mips64/axpy.S
+++ b/kernel/mips64/axpy.S
@@ -78,7 +78,7 @@
#define t4 $f21
PROLOGUE
-
+
#ifndef __64BIT__
daddiu $sp, $sp, -16
sdc1 $f20, 0($sp)
diff --git a/kernel/mips64/axpy_loongson3a.S b/kernel/mips64/axpy_loongson3a.S
index 2e93612..801885e 100644
--- a/kernel/mips64/axpy_loongson3a.S
+++ b/kernel/mips64/axpy_loongson3a.S
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -71,9 +71,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER
#include "common.h"
-
+
#define PREFETCH_DISTANCE 48
-
+
#define N $4
#define X $8
@@ -113,7 +113,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define t4 $f21
PROLOGUE
-
+
#ifndef __64BIT__
daddiu $sp, $sp, -16
sdc1 $f20, 0($sp)
@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LD a7, 6 * SIZE(X)
LD a8, 7 * SIZE(X)
-
+
LD b1, 0 * SIZE(Y)
LD b2, 1 * SIZE(Y)
LD b3, 2 * SIZE(Y)
@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LD b6, 5 * SIZE(Y)
LD b7, 6 * SIZE(Y)
LD b8, 7 * SIZE(Y)
-
+
blez I, .L13
NOP
.align 5
@@ -160,17 +160,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L12:
PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
-
- MADD t1, b1, ALPHA, a1
+
+ MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
LD b1, 8 * SIZE(Y)
LD b2, 9 * SIZE(Y)
-
+
MADD t3, b3, ALPHA, a3
MADD t4, b4, ALPHA, a4
LD b3, 10 * SIZE(Y)
LD b4, 11 * SIZE(Y)
-
+
LD a1, 8 * SIZE(X)
LD a2, 9 * SIZE(X)
LD a3, 10 * SIZE(X)
@@ -190,12 +190,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MADD t2, b6, ALPHA, a6
LD b5, 12 * SIZE(Y)
LD b6, 13 * SIZE(Y)
-
+
MADD t3, b7, ALPHA, a7
MADD t4, b8, ALPHA, a8
LD b7, 14 * SIZE(Y)
- LD b8, 15 * SIZE(Y)
-
+ LD b8, 15 * SIZE(Y)
+
LD a5, 12 * SIZE(X)
LD a6, 13 * SIZE(X)
LD a7, 14 * SIZE(X)
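(For orientation only; this C fragment is not part of the patch.) The loop body touched in axpy_loongson3a.S implements the BLAS AXPY update y[i] = alpha*x[i] + y[i]: each MADD t, b, ALPHA, a line computes alpha*a + b, and the a1..a8 / b1..b8 register groups hold eight elements of X and Y per iteration. A rough unit-stride sketch, shown in double precision for concreteness, with hypothetical names and without the prefetching and software pipelining the assembly performs:

    static void axpy_ref(long n, double alpha, const double *x, double *y)
    {
        long i = 0;
        /* unrolled body, mirroring the a1..a8 / b1..b8 register groups */
        for (; i + 8 <= n; i += 8) {
            y[i + 0] += alpha * x[i + 0];
            y[i + 1] += alpha * x[i + 1];
            y[i + 2] += alpha * x[i + 2];
            y[i + 3] += alpha * x[i + 3];
            y[i + 4] += alpha * x[i + 4];
            y[i + 5] += alpha * x[i + 5];
            y[i + 6] += alpha * x[i + 6];
            y[i + 7] += alpha * x[i + 7];
        }
        for (; i < n; i++)        /* scalar remainder */
            y[i] += alpha * x[i];
    }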
diff --git a/kernel/mips64/cgemm_kernel_loongson3a_2x2.S b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S
index 5ded7ae..675cad0 100644
--- a/kernel/mips64/cgemm_kernel_loongson3a_2x2.S
+++ b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S
@@ -144,7 +144,7 @@
#endif
PROLOGUE
-
+
LDARG LDC, 0($sp)
daddiu $sp, $sp, -STACKSIZE
@@ -190,7 +190,7 @@
move KK, OFFSET
#endif
- daddiu J, J, -1
+ daddiu J, J, -1
dsra I, M, 1 # I=M/2
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
@@ -228,7 +228,7 @@
LD a3, 2 * SIZE(AO)
MOV c22, c11
LD a4, 3 * SIZE(AO)
-
+
MOV c23, c11
LD b3, 2 * SIZE(BO)
MOV c24, c11
@@ -241,7 +241,7 @@
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11
MOV c34, c11
-
+
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11
MOV c42, c11
@@ -264,7 +264,7 @@
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
MTC $0, c11 # Clear results regs
@@ -281,7 +281,7 @@
LD a3, 2 * SIZE(AO)
MOV c22, c11
LD a4, 3 * SIZE(AO)
-
+
MOV c23, c11
LD b3, 2 * SIZE(BO)
MOV c24, c11
@@ -294,7 +294,7 @@
MOV c33, c11
MOV c34, c11
FETCH $0, 0 * SIZE(CO1)
-
+
MOV c41, c11
MOV c42, c11
FETCH $0, 4 * SIZE(CO2)
@@ -313,7 +313,7 @@
.L12:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 4 * SIZE(BO)
@@ -346,7 +346,7 @@
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
LD b1, 8 * SIZE(BO)
@@ -355,7 +355,7 @@
MADD4 c14, c14, a6, b6 # bxd
LD a3, 10 * SIZE(AO)
- LD a4, 11 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
MADD1 c21, c21, a7, b5 # A2xB1
MADD3 c23, c23, a7, b6
@@ -379,7 +379,7 @@
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 12 * SIZE(BO)
@@ -418,7 +418,7 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
LD b1, 0 * SIZE(BO)
@@ -469,17 +469,17 @@
.L16:
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
- daddiu PREA, PREA, 4 * SIZE
- daddiu PREB, PREB, 4 * SIZE
+ daddiu PREA, PREA, 4 * SIZE
+ daddiu PREB, PREB, 4 * SIZE
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2
-
+
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
@@ -624,9 +624,9 @@
#endif
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
- daddiu CO1,CO1, 4 * SIZE
+ daddiu CO1,CO1, 4 * SIZE
bgtz I, .L11
- daddiu CO2,CO2, 4 * SIZE
+ daddiu CO2,CO2, 4 * SIZE
.align 5
.L30:
@@ -652,7 +652,7 @@
LD a2, 1 * SIZE(AO)
MTC $0, c11 # Clear results regs
MOV c12, c11
-
+
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MOV c13, c11
@@ -676,7 +676,7 @@
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # MR=1
-#else
+#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
@@ -687,14 +687,14 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MTC $0, c11 # Clear results regs
MOV c12, c11
-
+
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
MOV c13, c11
@@ -719,19 +719,19 @@
.L32:
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
-
+
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4
-
+
FETCH $0, 4 * SIZE(PREB)
MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4
@@ -739,14 +739,14 @@
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a3, b5 # axc A1xB1
+ MADD1 c11, c11, a3, b5 # axc A1xB1
MADD3 c13, c13, a3, b6 # axd
LD b1, 8 * SIZE(BO)
LD b2, 9 * SIZE(BO)
MADD2 c12, c12, a4, b5 # bxc
MADD4 c14, c14, a4, b6 # bxd
-
+
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
MADD1 c31, c31, a3, b7 # A1xB2
@@ -759,7 +759,7 @@
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
- MADD1 c11, c11, a5, b1 # axc A1xB1
+ MADD1 c11, c11, a5, b1 # axc A1xB1
MADD3 c13, c13, a5, b2 # axd
LD b5, 12 * SIZE(BO)
@@ -782,7 +782,7 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a7, b5 # axc A1xB1
+ MADD1 c11, c11, a7, b5 # axc A1xB1
MADD3 c13, c13, a7, b6 # axd
LD b1, 0 * SIZE(BO)
@@ -818,7 +818,7 @@
.L36:
daddiu L, L, -1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
@@ -828,8 +828,8 @@
daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx
MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4
-
- daddiu PREB, PREB, 4 * SIZE
+
+ daddiu PREB, PREB, 4 * SIZE
MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4
@@ -873,8 +873,8 @@
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
- daddiu CO1,CO1, 2 * SIZE
- daddiu CO2,CO2, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
#else
ADD c11, c14, c11
@@ -901,8 +901,8 @@
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
- daddiu CO1,CO1, 2 * SIZE
- daddiu CO2,CO2, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -935,7 +935,7 @@
move B, BO
.align 5
-
+
.L20:
andi J, N, 1
blez J, .L999
@@ -998,7 +998,7 @@
NOP
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
LD a1, 0 * SIZE(AO)
@@ -1032,7 +1032,7 @@
.L22:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b3, 2 * SIZE(BO)
@@ -1044,14 +1044,14 @@
LD a8, 7 * SIZE(AO)
MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2
-
+
FETCH $0, 4 * SIZE(PREA)
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
- MADD1 c11, c11, a5, b3 # axc A1xB1
+ MADD1 c11, c11, a5, b3 # axc A1xB1
MADD3 c13, c13, a5, b4 # axd
LD b5, 4 * SIZE(BO)
@@ -1071,7 +1071,7 @@
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
- MADD1 c11, c11, a1, b5 # axc A1xB1
+ MADD1 c11, c11, a1, b5 # axc A1xB1
MADD3 c13, c13, a1, b6 # axd
LD b7, 6 * SIZE(BO)
@@ -1090,11 +1090,11 @@
FETCH $0, 12 * SIZE(PREA)
MADD2 c22, c22, a4, b5
MADD4 c24, c24, a4, b6
- daddiu PREA, PREA, 16 * SIZE
+ daddiu PREA, PREA, 16 * SIZE
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a5, b7 # axc A1xB1
+ MADD1 c11, c11, a5, b7 # axc A1xB1
MADD3 c13, c13, a5, b8 # axd
LD b1, 0 * SIZE(BO)
@@ -1127,7 +1127,7 @@
.L26:
daddiu L, L, -1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx
@@ -1224,7 +1224,7 @@
daddiu KK, KK, 2
#endif
#endif
- daddiu CO1,CO1, 4 * SIZE
+ daddiu CO1,CO1, 4 * SIZE
bgtz I, .L21
NOP
@@ -1270,7 +1270,7 @@
NOP
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
@@ -1297,7 +1297,7 @@
# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
@@ -1306,27 +1306,27 @@
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
-# gsLQC1(R12, F9, F8, 2) # Unroll K=1
+# gsLQC1(R12, F9, F8, 2) # Unroll K=1
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a3, b3 # axc A1xB1
+ MADD1 c11, c11, a3, b3 # axc A1xB1
MADD3 c13, c13, a3, b4 # axd
-# gsLQC1(R13, F13, F12, 2)
+# gsLQC1(R13, F13, F12, 2)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
MADD2 c12, c12, a4, b3 # bxc
MADD4 c14, c14, a4, b4 # bxd
-# gsLQC1(R12, F11, F10, 3)
+# gsLQC1(R12, F11, F10, 3)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
daddiu L, L, -1
-# gsLQC1(R13, F16, F15, 3)
+# gsLQC1(R13, F16, F15, 3)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD2 c12, c12, a6, b5 # bxc
@@ -1338,7 +1338,7 @@
# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a7, b7 # axc A1xB1
+ MADD1 c11, c11, a7, b7 # axc A1xB1
MADD3 c13, c13, a7, b8 # axd
# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
@@ -1369,7 +1369,7 @@
daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx
daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
@@ -1432,7 +1432,7 @@
daddiu KK, KK, 1
#endif
- daddiu CO1,CO1, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
#endif
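(Side note, not part of the patch.) The recurring "# axc", "# axd", "# bxc" and "# bxd" comments in the cgemm hunks above name the four real products of one complex multiply (a+bi)(c+di); MADD1..MADD4 accumulate them into separate c1x registers, and the real and imaginary results (ac - bd and bc + ad in the plain case, with conjugation signs folded into the macro definitions elsewhere in the tree) are only combined at write-back. A minimal sketch of that split, with hypothetical names:

    /* One scalar step of the split complex multiply-accumulate used above.
     * acc_ac, acc_bc, acc_ad, acc_bd play the role of c11, c12, c13, c14. */
    static void cmla_step(float a_re, float a_im, float b_re, float b_im,
                          float *acc_ac, float *acc_bc, float *acc_ad, float *acc_bd)
    {
        *acc_ac += a_re * b_re;   /* axc */
        *acc_bc += a_im * b_re;   /* bxc */
        *acc_ad += a_re * b_im;   /* axd */
        *acc_bd += a_im * b_im;   /* bxd */
    }

    /* Plain (non-conjugated) write-back would then be:
     *   result_re = acc_ac - acc_bd;
     *   result_im = acc_bc + acc_ad;
     */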
diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S
index e78ad20..489b124 100644
--- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S
+++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S
@@ -1,4 +1,4 @@
-##define REALNAME gemm
+##define REALNAME gemm
#define ASSEMBLER
#include "common.h"
@@ -77,7 +77,7 @@
#define F27 27
#define F26 26
#define F25 25
-#define F24 24
+#define F24 24
#define F23 23
#define F22 22
#define F21 21
@@ -85,7 +85,7 @@
#define F19 19
#define F18 18
#define F17 17
-#define F16 16
+#define F16 16
#define F15 15
#define F14 14
#define F13 13
@@ -97,10 +97,10 @@
#define F7 7
#define F6 6
#define F5 5
-#define F4 4
-#define F3 3
-#define F2 2
-#define F1 1
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
#define F0 0
#define R12 12
@@ -195,12 +195,12 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
dsll PREB, K, ZBASE_SHIFT
MOV C21, C11
MOV C22, C11
-
+
gsLQC1(R13, F9, F8, 0) # B1 B2
MOV C31, C11
MOV C32, C11
@@ -218,7 +218,7 @@
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
MOV C44, C11
@@ -246,12 +246,12 @@
move BO, B # Reset B
dsra L, K, 2 # UnRoll K=64
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
dsll PREB, K, ZBASE_SHIFT
MOV C21, C11
MOV C22, C11
-
+
gsLQC1(R13, F9, F8, 0) # B1 B2
MOV C31, C11
MOV C32, C11
@@ -278,7 +278,7 @@
PLU B3, B1, B1
PLU B4, B2, B2
-
+
FETCH $0, 8 * SIZE(CO1)
blez L, .L242
FETCH $0, 8 * SIZE(CO2)
@@ -349,7 +349,7 @@
MADPS C34, C34, A7, B8
MADPS C44, C44, A8, B8
-
+
gsLQC1(R13, F13, F12, 3) # B3 B4
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
@@ -488,7 +488,7 @@
MADPS C34, C34, A7, B8
MADPS C44, C44, A8, B8
-
+
.align 4
.L247:
#ifndef TRMMKERNEL
@@ -644,7 +644,7 @@
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
-
+
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
@@ -748,7 +748,7 @@
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
-
+
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
@@ -853,7 +853,7 @@
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
-
+
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
@@ -1045,7 +1045,7 @@
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
-
+
ST B1, 0 * SIZE(CO1)
MUL C13, C12, A1
MUL C23, C22, A1
@@ -1073,7 +1073,7 @@
ST B6, 5 * SIZE(CO1)
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
-
+
ST B8, 7 * SIZE(CO1)
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
@@ -1391,7 +1391,7 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
MOV C21, C11
MOV C22, C11
@@ -1406,7 +1406,7 @@
FETCH $0, 8 * SIZE(CO1)
MOV C24, C11
-
+
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 8 * SIZE(CO2)
@@ -1416,7 +1416,7 @@
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2 # MR=2
-#else
+#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
@@ -1428,7 +1428,7 @@
dsra L, K, 2 # UnRoll K=64
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
MOV C21, C11
MOV C22, C11
@@ -1443,7 +1443,7 @@
FETCH $0, 8 * SIZE(CO1)
MOV C24, C11
-
+
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 8 * SIZE(CO2)
@@ -1665,7 +1665,7 @@
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
@@ -1723,7 +1723,7 @@
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
@@ -1745,7 +1745,7 @@
ADD C22, A6, C22
SUB C14, C14, A7
SUB C24, C24, A8
-
+
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
@@ -1782,7 +1782,7 @@
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
@@ -1910,7 +1910,7 @@
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
@@ -1958,7 +1958,7 @@
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
@@ -1980,7 +1980,7 @@
ADD C22, A6, C22
SUB C14, C14, A7
SUB C24, C24, A8
-
+
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
@@ -2007,7 +2007,7 @@
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
@@ -2109,7 +2109,7 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
@@ -2125,7 +2125,7 @@
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # MR=1
-#else
+#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
@@ -2137,7 +2137,7 @@
dsra L, K, 2 # UnRoll K=64
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
@@ -2290,7 +2290,7 @@
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
@@ -2324,7 +2324,7 @@
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
@@ -2340,7 +2340,7 @@
LD A2, 160($sp) # load alpha_i
ADD C12, A5, C12
SUB C14, C14, A7
-
+
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
@@ -2359,7 +2359,7 @@
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
@@ -2429,7 +2429,7 @@
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
@@ -2457,7 +2457,7 @@
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
@@ -2473,7 +2473,7 @@
LD A2, 160($sp) # load alpha_i
ADD C12, A5, C12
SUB C14, C14, A7
-
+
MUL B1, C11, A4 # A1 = alpha_r
MUL B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
@@ -2486,7 +2486,7 @@
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
-
+
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
@@ -2679,7 +2679,7 @@
PLU B7, B5, B5
PLU B8, B6, B6
-
+
MADPS C11, C11, A1, B5
MADPS C21, C21, A2, B5
gsLQC1(R12, F5, F4, 6) # A5 A6
@@ -2757,7 +2757,7 @@
MADPS C43, C43, A8, B4
PLU B3, B1, B1
-
+
.align 4
.L147:
#ifndef TRMMKERNEL
@@ -3274,7 +3274,7 @@
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO1)
-
+
PLU B3, B1, B1
PLU B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -3302,7 +3302,7 @@
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO1)
-
+
PLU B3, B1, B1
blez L, .L122
PLU B4, B2, B2
@@ -3483,7 +3483,7 @@
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
-
+
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
@@ -3609,7 +3609,7 @@
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
-
+
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
@@ -3854,7 +3854,7 @@
SUB C13, C13, A3 # ad'+'cb
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
-
+
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
diff --git a/kernel/mips64/cgemm_kernel_loongson3b_2x2.S b/kernel/mips64/cgemm_kernel_loongson3b_2x2.S
index 5ded7ae..675cad0 100644
--- a/kernel/mips64/cgemm_kernel_loongson3b_2x2.S
+++ b/kernel/mips64/cgemm_kernel_loongson3b_2x2.S
@@ -144,7 +144,7 @@
#endif
PROLOGUE
-
+
LDARG LDC, 0($sp)
daddiu $sp, $sp, -STACKSIZE
@@ -190,7 +190,7 @@
move KK, OFFSET
#endif
- daddiu J, J, -1
+ daddiu J, J, -1
dsra I, M, 1 # I=M/2
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
@@ -228,7 +228,7 @@
LD a3, 2 * SIZE(AO)
MOV c22, c11
LD a4, 3 * SIZE(AO)
-
+
MOV c23, c11
LD b3, 2 * SIZE(BO)
MOV c24, c11
@@ -241,7 +241,7 @@
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11
MOV c34, c11
-
+
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11
MOV c42, c11
@@ -264,7 +264,7 @@
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
MTC $0, c11 # Clear results regs
@@ -281,7 +281,7 @@
LD a3, 2 * SIZE(AO)
MOV c22, c11
LD a4, 3 * SIZE(AO)
-
+
MOV c23, c11
LD b3, 2 * SIZE(BO)
MOV c24, c11
@@ -294,7 +294,7 @@
MOV c33, c11
MOV c34, c11
FETCH $0, 0 * SIZE(CO1)
-
+
MOV c41, c11
MOV c42, c11
FETCH $0, 4 * SIZE(CO2)
@@ -313,7 +313,7 @@
.L12:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 4 * SIZE(BO)
@@ -346,7 +346,7 @@
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
LD b1, 8 * SIZE(BO)
@@ -355,7 +355,7 @@
MADD4 c14, c14, a6, b6 # bxd
LD a3, 10 * SIZE(AO)
- LD a4, 11 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
MADD1 c21, c21, a7, b5 # A2xB1
MADD3 c23, c23, a7, b6
@@ -379,7 +379,7 @@
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 12 * SIZE(BO)
@@ -418,7 +418,7 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
LD b1, 0 * SIZE(BO)
@@ -469,17 +469,17 @@
.L16:
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
- daddiu PREA, PREA, 4 * SIZE
- daddiu PREB, PREB, 4 * SIZE
+ daddiu PREA, PREA, 4 * SIZE
+ daddiu PREB, PREB, 4 * SIZE
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2
-
+
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
@@ -624,9 +624,9 @@
#endif
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
- daddiu CO1,CO1, 4 * SIZE
+ daddiu CO1,CO1, 4 * SIZE
bgtz I, .L11
- daddiu CO2,CO2, 4 * SIZE
+ daddiu CO2,CO2, 4 * SIZE
.align 5
.L30:
@@ -652,7 +652,7 @@
LD a2, 1 * SIZE(AO)
MTC $0, c11 # Clear results regs
MOV c12, c11
-
+
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MOV c13, c11
@@ -676,7 +676,7 @@
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # MR=1
-#else
+#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
@@ -687,14 +687,14 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MTC $0, c11 # Clear results regs
MOV c12, c11
-
+
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
MOV c13, c11
@@ -719,19 +719,19 @@
.L32:
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
-
+
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4
-
+
FETCH $0, 4 * SIZE(PREB)
MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4
@@ -739,14 +739,14 @@
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a3, b5 # axc A1xB1
+ MADD1 c11, c11, a3, b5 # axc A1xB1
MADD3 c13, c13, a3, b6 # axd
LD b1, 8 * SIZE(BO)
LD b2, 9 * SIZE(BO)
MADD2 c12, c12, a4, b5 # bxc
MADD4 c14, c14, a4, b6 # bxd
-
+
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
MADD1 c31, c31, a3, b7 # A1xB2
@@ -759,7 +759,7 @@
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
- MADD1 c11, c11, a5, b1 # axc A1xB1
+ MADD1 c11, c11, a5, b1 # axc A1xB1
MADD3 c13, c13, a5, b2 # axd
LD b5, 12 * SIZE(BO)
@@ -782,7 +782,7 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a7, b5 # axc A1xB1
+ MADD1 c11, c11, a7, b5 # axc A1xB1
MADD3 c13, c13, a7, b6 # axd
LD b1, 0 * SIZE(BO)
@@ -818,7 +818,7 @@
.L36:
daddiu L, L, -1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
@@ -828,8 +828,8 @@
daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx
MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4
-
- daddiu PREB, PREB, 4 * SIZE
+
+ daddiu PREB, PREB, 4 * SIZE
MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4
@@ -873,8 +873,8 @@
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
- daddiu CO1,CO1, 2 * SIZE
- daddiu CO2,CO2, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
#else
ADD c11, c14, c11
@@ -901,8 +901,8 @@
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
- daddiu CO1,CO1, 2 * SIZE
- daddiu CO2,CO2, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -935,7 +935,7 @@
move B, BO
.align 5
-
+
.L20:
andi J, N, 1
blez J, .L999
@@ -998,7 +998,7 @@
NOP
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
LD a1, 0 * SIZE(AO)
@@ -1032,7 +1032,7 @@
.L22:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b3, 2 * SIZE(BO)
@@ -1044,14 +1044,14 @@
LD a8, 7 * SIZE(AO)
MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2
-
+
FETCH $0, 4 * SIZE(PREA)
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
- MADD1 c11, c11, a5, b3 # axc A1xB1
+ MADD1 c11, c11, a5, b3 # axc A1xB1
MADD3 c13, c13, a5, b4 # axd
LD b5, 4 * SIZE(BO)
@@ -1071,7 +1071,7 @@
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
- MADD1 c11, c11, a1, b5 # axc A1xB1
+ MADD1 c11, c11, a1, b5 # axc A1xB1
MADD3 c13, c13, a1, b6 # axd
LD b7, 6 * SIZE(BO)
@@ -1090,11 +1090,11 @@
FETCH $0, 12 * SIZE(PREA)
MADD2 c22, c22, a4, b5
MADD4 c24, c24, a4, b6
- daddiu PREA, PREA, 16 * SIZE
+ daddiu PREA, PREA, 16 * SIZE
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a5, b7 # axc A1xB1
+ MADD1 c11, c11, a5, b7 # axc A1xB1
MADD3 c13, c13, a5, b8 # axd
LD b1, 0 * SIZE(BO)
@@ -1127,7 +1127,7 @@
.L26:
daddiu L, L, -1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx
@@ -1224,7 +1224,7 @@
daddiu KK, KK, 2
#endif
#endif
- daddiu CO1,CO1, 4 * SIZE
+ daddiu CO1,CO1, 4 * SIZE
bgtz I, .L21
NOP
@@ -1270,7 +1270,7 @@
NOP
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
@@ -1297,7 +1297,7 @@
# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
@@ -1306,27 +1306,27 @@
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
-# gsLQC1(R12, F9, F8, 2) # Unroll K=1
+# gsLQC1(R12, F9, F8, 2) # Unroll K=1
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a3, b3 # axc A1xB1
+ MADD1 c11, c11, a3, b3 # axc A1xB1
MADD3 c13, c13, a3, b4 # axd
-# gsLQC1(R13, F13, F12, 2)
+# gsLQC1(R13, F13, F12, 2)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
MADD2 c12, c12, a4, b3 # bxc
MADD4 c14, c14, a4, b4 # bxd
-# gsLQC1(R12, F11, F10, 3)
+# gsLQC1(R12, F11, F10, 3)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
daddiu L, L, -1
-# gsLQC1(R13, F16, F15, 3)
+# gsLQC1(R13, F16, F15, 3)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD2 c12, c12, a6, b5 # bxc
@@ -1338,7 +1338,7 @@
# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a7, b7 # axc A1xB1
+ MADD1 c11, c11, a7, b7 # axc A1xB1
MADD3 c13, c13, a7, b8 # axd
# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
@@ -1369,7 +1369,7 @@
daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx
daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
@@ -1432,7 +1432,7 @@
daddiu KK, KK, 1
#endif
- daddiu CO1,CO1, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
#endif
diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S
index dd8c210..76fa9c2 100644
--- a/kernel/mips64/cnrm2.S
+++ b/kernel/mips64/cnrm2.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -65,7 +65,7 @@
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
@@ -118,7 +118,7 @@
cvt.d.s t1, a5
NOP
-
+
madd.d s2, s2, t2, t2
LD a2, 1 * SIZE(X)
@@ -195,7 +195,7 @@
cvt.d.s t1, a1
cvt.d.s t2, a2
-
+
madd.d s1, s1, t1, t1
daddu X, X, INCX
@@ -210,5 +210,5 @@
j $31
cvt.s.d s1, s1
-
+
EPILOGUE
diff --git a/kernel/mips64/copy.S b/kernel/mips64/copy.S
index 7942b18..bf7f7c7 100644
--- a/kernel/mips64/copy.S
+++ b/kernel/mips64/copy.S
@@ -44,7 +44,7 @@
#define INCX $6
#define Y $7
#define INCY $8
-
+
#define I $2
#define TEMP $3
@@ -58,7 +58,7 @@
#define a8 $f7
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S
index 8f53441..880a67f 100644
--- a/kernel/mips64/daxpy_loongson3a_simd.S
+++ b/kernel/mips64/daxpy_loongson3a_simd.S
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -71,9 +71,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER
#include "common.h"
-
+
#define PREFETCH_DISTANCE 2016
-
+
#define N $4
#define X $8
@@ -158,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define X_BASE 8
#define Y_BASE 10
-
+
#define gsLQC1_(base,fq,ft,offset) .word (0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsLQC1(base,fq,ft,offset) gsLQC1_((base), (fq), (ft), (offset))
@@ -166,7 +166,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define gsSQC1(base,fq,ft,offset) gsSQC1_((base), (fq), (ft), (offset))
PROLOGUE
-
+
#ifndef __64BIT__
daddiu $sp, $sp, -40
sdc1 $f20, 0($sp)
@@ -185,7 +185,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-
+
li TEMP, SIZE
blez N, .L999
@@ -196,9 +196,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bne INCY, TEMP, .L20
- //Dose the address of Y algin 16 bytes?
+ //Dose the address of Y algin 16 bytes?
andi TEMP, Y, 8
- beq TEMP, $0, .L10
+ beq TEMP, $0, .L10
//Y unalgin. Compute this unalgined element.
LD a1, 0 * SIZE(X)
LD b1, 0 * SIZE(Y)
@@ -208,20 +208,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MADD t1, b1, ALPHA, a1
daddiu N, N, -1
-
+
ST t1, -1 * SIZE(Y)
blez N, .L999
.align 5
-
+
.L10:
dsra I, N, 4
blez I, .L15
daddiu I, I, -1
-
+
//Y algin. We need test X address
- //Dose the address of X algin 16 bytes?
+ //Dose the address of X algin 16 bytes?
andi TEMP, X, 8
bne TEMP, $0, .L30 ///
.align 5
@@ -242,16 +242,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
gsLQC1(Y_BASE,B4,B3,1)
gsLQC1(Y_BASE,B6,B5,2)
gsLQC1(Y_BASE,B8,B7,3)
-
+
blez I, .L13
NOP
.align 5
.L12:
-
- MADD t1, b1, ALPHA, a1
+
+ MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
- gsSQC1(Y_BASE, T2, T1, 0)
+ gsSQC1(Y_BASE, T2, T1, 0)
gsLQC1(Y_BASE,B2,B1,4)
MADD t3, b3, ALPHA, a3
@@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MADD t1, b5, ALPHA, a5
MADD t2, b6, ALPHA, a6
- gsSQC1(Y_BASE, T2, T1, 2)
+ gsSQC1(Y_BASE, T2, T1, 2)
gsLQC1(Y_BASE,B6,B5,6)
MADD t3, b7, ALPHA, a7
@@ -275,9 +275,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
- MADD t1, b1, ALPHA, a9
+ MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10
- gsSQC1(Y_BASE, T2, T1, 4)
+ gsSQC1(Y_BASE, T2, T1, 4)
gsLQC1(Y_BASE,B2,B1,8)
MADD t3, b3, ALPHA, a11
@@ -288,16 +288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
- MADD t1, b5, ALPHA, a13
+ MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14
- gsSQC1(Y_BASE, T2, T1, 6)
+ gsSQC1(Y_BASE, T2, T1, 6)
gsLQC1(Y_BASE,B6,B5,10)
MADD t3, b7, ALPHA, a15
MADD t4, b8, ALPHA, a16
gsSQC1(Y_BASE, T4, T3, 7)
gsLQC1(Y_BASE,B8,B7,11)
-
+
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
@@ -314,7 +314,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
daddiu I, I, -1
daddiu Y, Y, 16 * SIZE
-
+
daddiu X, X, 16 * SIZE
bgtz I, .L12
@@ -322,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L13:
- MADD t1, b1, ALPHA, a1
+ MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
gsSQC1(Y_BASE, T2, T1, 0)
gsLQC1(Y_BASE,B2,B1,4)
@@ -344,7 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
gsLQC1(Y_BASE,B8,B7,7)
- MADD t1, b1, ALPHA, a9
+ MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10
gsSQC1(Y_BASE, T2, T1, 4)
@@ -354,7 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
gsSQC1(Y_BASE, T4, T3, 5)
- MADD t1, b5, ALPHA, a13
+ MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14
gsSQC1(Y_BASE, T2, T1, 6)
@@ -413,7 +413,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L30:
//Y align, X unalign, INCX==INCY==1
//unloop 16
-
+
LD a1, 0 * SIZE(X)
daddiu X, X, SIZE
gsLQC1(X_BASE,A3,A2,0)
@@ -426,18 +426,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
gsLQC1(X_BASE,A15,A14,6)
LD a16, 14 * SIZE(X)
-
+
gsLQC1(Y_BASE,B2,B1,0)
gsLQC1(Y_BASE,B4,B3,1)
gsLQC1(Y_BASE,B6,B5,2)
gsLQC1(Y_BASE,B8,B7,3)
-
+
blez I, .L32
NOP
.align 5
-
+
.L31:
- MADD t1, b1, ALPHA, a1
+ MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
gsSQC1(Y_BASE, T2, T1, 0)
gsLQC1(Y_BASE,B2,B1,4)
@@ -463,7 +463,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
- MADD t1, b1, ALPHA, a9
+ MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10
gsSQC1(Y_BASE, T2, T1, 4)
gsLQC1(Y_BASE,B2,B1,8)
@@ -476,7 +476,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
- MADD t1, b5, ALPHA, a13
+ MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14
gsSQC1(Y_BASE, T2, T1, 6)
gsLQC1(Y_BASE,B6,B5,10)
@@ -485,7 +485,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MADD t4, b8, ALPHA, a16
gsSQC1(Y_BASE, T4, T3, 7)
gsLQC1(Y_BASE,B8,B7,11)
-
+
PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
@@ -502,15 +502,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
daddiu I, I, -1
daddiu Y, Y, 16 * SIZE
-
+
daddiu X, X, 16 * SIZE
bgtz I, .L31
-
+
.align 5
//Loop end:
.L32:
-
- MADD t1, b1, ALPHA, a1
+
+ MADD t1, b1, ALPHA, a1
MADD t2, b2, ALPHA, a2
gsSQC1(Y_BASE, T2, T1, 0)
gsLQC1(Y_BASE,B2,B1,4)
@@ -532,7 +532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
gsLQC1(Y_BASE,B8,B7,7)
- MADD t1, b1, ALPHA, a9
+ MADD t1, b1, ALPHA, a9
MADD t2, b2, ALPHA, a10
gsSQC1(Y_BASE, T2, T1, 4)
@@ -542,7 +542,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
gsSQC1(Y_BASE, T4, T3, 5)
- MADD t1, b5, ALPHA, a13
+ MADD t1, b5, ALPHA, a13
MADD t2, b6, ALPHA, a14
gsSQC1(Y_BASE, T2, T1, 6)
@@ -558,8 +558,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//jump back to the remain process.
b .L15
.align 5
-
-//INCX!=1 or INCY != 1
+
+//INCX!=1 or INCY != 1
.L20:
dsra I, N, 3
move YY, Y
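(For readability only; not part of the patch.) The "Dose the address of Y algin 16 bytes?" block touched in daxpy_loongson3a_simd.S peels a single element when Y sits on an 8-byte but not 16-byte boundary, so that the quad-word gsLQC1/gsSQC1 accesses which follow operate on an aligned Y; X is then only tested to choose between the aligned and misaligned load paths. A rough unit-stride sketch of that peeling step, using hypothetical names:

    #include <stdint.h>

    /* Returns the element count left for the 16-byte-aligned main loop. */
    static long peel_unaligned_y(long n, double alpha, const double **x, double **y)
    {
        if (n > 0 && ((uintptr_t)(*y) & 8) != 0) {  /* Y aligned to 8 but not 16 bytes */
            (*y)[0] += alpha * (*x)[0];             /* handle the one unaligned element */
            ++*x;
            ++*y;
            --n;
        }
        return n;
    }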
diff --git a/kernel/mips64/dgemm_kernel_loongson3a_4x4.S b/kernel/mips64/dgemm_kernel_loongson3a_4x4.S
index 3e95a3e..025f256 100644
--- a/kernel/mips64/dgemm_kernel_loongson3a_4x4.S
+++ b/kernel/mips64/dgemm_kernel_loongson3a_4x4.S
@@ -109,7 +109,7 @@
#define F27 27
#define F26 26
#define F25 25
-#define F24 24
+#define F24 24
#define F23 23
#define F22 22
#define F21 21
@@ -117,7 +117,7 @@
#define F19 19
#define F18 18
#define F17 17
-#define F16 16
+#define F16 16
#define F15 15
#define F14 14
#define F13 13
@@ -129,14 +129,14 @@
#define F7 7
#define F6 6
#define F5 5
-#define F4 4
-#define F3 3
-#define F2 2
-#define F1 1
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
#define F0 0
PROLOGUE
-
+
daddiu $sp, $sp, -160
sd $16, 0($sp)
sd $17, 8($sp)
@@ -159,7 +159,7 @@
ST $f23,144($sp)
- .align 5
+ .align 5
.L0_N4: # Loop N
ST ALPHA,152($sp) # Backup ALPHA
move MCO,M # Backup M
@@ -169,26 +169,26 @@
move AO,A # Backup A_addr
dsra N,NCO,2 # N=NCO/2
-
+
dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
-
+
#if defined(TRMMKERNEL)
- LDARG OFFSET,160($sp) # OFFSET is relate to the data part
+ LDARG OFFSET,160($sp) # OFFSET is relate to the data part
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
- neg KK,OFFSET
+ neg KK,OFFSET
#endif
-
+
move BO,B # Backup B_addr
beq N,$0,.L0_N2 # N=0,NCO<4
dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte
.L0_N4_Lb: # mr=4,nr=4
- move CO1,C
+ move CO1,C
dsra M,MCO,2 # M=MCO/2
-
+
move A,AO # Reset A
daddu CO2,C,LDC
@@ -199,7 +199,7 @@
daddu CO4,CO3,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
- move KK,OFFSET
+ move KK,OFFSET
#endif
beqz M,.L14_M2
daddu C,CO4,LDC # move C to next panel Cj
@@ -227,25 +227,25 @@
MOV t12,t11
MOV t22,t11
gsLQC1(R8,F3,F2,1) # a2,a3
-
+
MOV t32,t11
MOV t42,t11
gsLQC1(R9,F11,F10,1) # b2,b3
MOV t13,t11
MOV t23,t11
-
+
MOV t33,t11
MOV t43,t11
MOV t14,t11
MOV t24,t11
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK # temp is the length of the data part
#elif defined(LEFT)
- daddiu TEMP, KK, 4 # S=L,U=L
+ daddiu TEMP, KK, 4 # S=L,U=L
#else
daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part
#endif
@@ -254,7 +254,7 @@
beqz K,.L15
MOV t44,t11
-#else
+#else
move B,BO # Reset B
MTC $0,t11 # GEMM part NR=4,MR=4
gsLQC1(R8,F1,F0,0) # a0,a1
@@ -266,42 +266,42 @@
MOV t41,t11
MOV t12,t11
gsLQC1(R8,F3,F2,1) # a2,a3
-
+
MOV t22,t11
MOV t32,t11
gsLQC1(R9,F11,F10,1) # b2,b3
MOV t42,t11
dsra K,KCO,2 # K=KCO/2
-
+
MOV t13,t11
MOV t23,t11
-
+
MOV t33,t11
MOV t43,t11
MOV t14,t11
MOV t24,t11
-
+
MOV t34,t11
beqz K,.L15
MOV t44,t11 # clear 16 results registers
#endif
-
+
.align 5
.L11: # kr=4
- gsLQC1(R8,F5,F4,2)
+ gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,2)
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
-
+
gsLQC1(R9,F15,F14,3)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
@@ -309,17 +309,17 @@
FETCH $0,(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
-
+
FETCH $0,(PREA)
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
-
+
.L12:
gsLQC1(R8,F1,F0,4)
MADD t11,t11,a4,b4
@@ -347,12 +347,12 @@
FETCH $0,4*SIZE(PREA)
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
-
+
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
.L13:
- gsLQC1(R8,F5,F4,6)
+ gsLQC1(R8,F5,F4,6)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -383,9 +383,9 @@
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
-
+
.L14:
- gsLQC1(R8,F1,F0,0)
+ gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
@@ -413,7 +413,7 @@
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
daddu PREB,PREB,16*SIZE
-
+
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
bnez K,.L11
@@ -421,19 +421,19 @@
.L15: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP, 2
#endif
beqz K,.L18
nop
-.L16:
- gsLQC1(R8,F5,F4,2)
+.L16:
+ gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,2)
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
@@ -460,7 +460,7 @@
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
-
+
.L17:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
@@ -490,19 +490,19 @@
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
daddu PREA,PREA,8*SIZE
-
+
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
-
+
.L18: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L19
+ beqz K,.L19
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREB)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -534,8 +534,8 @@
MADD t44,t44,a3,b3
.L19: # Write Back to C
-#ifndef TRMMKERNEL
- LD c11,0(CO1) # GEMM write part
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write part
LD c21,1*SIZE(CO1) # get 16 C
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -605,11 +605,11 @@
daddu CO3,CO3,4*SIZE
ST t44,3*SIZE(CO4)
daddu PREB,BO,SPANB
-
- bnez M,.L10
+
+ bnez M,.L10
daddu CO4,CO4,4*SIZE
-#else
+#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
@@ -650,7 +650,7 @@
daddiu CO1,CO1, 4 * SIZE
daddiu CO2,CO2, 4 * SIZE
daddiu CO3,CO3, 4 * SIZE
- daddiu CO4,CO4, 4 * SIZE
+ daddiu CO4,CO4, 4 * SIZE
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
@@ -663,7 +663,7 @@
FETCH $0,0(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- dsubu TEMP,KCO,KK
+ dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP, -4
#else
@@ -675,10 +675,10 @@
daddu B,B,TEMP # mov B to the end of panel Bj
#endif
-#ifdef LEFT
+#ifdef LEFT
daddiu KK, KK,4
#endif
- bnez M,.L10
+ bnez M,.L10
nop
#endif
@@ -686,7 +686,7 @@
.align 3
.L14_M2:
andi M, MCO, 2 # nr=4,mr=2
- beqz M,.L14_M1
+ beqz M,.L14_M1
nop
.L20:
@@ -694,7 +694,7 @@
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
- dsll K,KK,1 + BASE_SHIFT # mr=2
+ dsll K,KK,1 + BASE_SHIFT # mr=2
dsll TEMP,KK,2 + BASE_SHIFT # nr=4
daddu A,A,K
daddu B,BO,TEMP
@@ -707,11 +707,11 @@
MOV t12,t11
MOV t22,t11
gsLQC1(R9,F9,F8,0) # b0,b1
-
+
MOV t13,t11
MOV t23,t11
gsLQC1(R9,F11,F10,1) # b2,b3
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
@@ -726,42 +726,42 @@
MOV t24,t11 # clear 2*4=8 results registers
#else
- move B,BO # Reset B
+ move B,BO # Reset B
MTC $0,t11
- gsLQC1(R8,F1,F0,0)
-
+ gsLQC1(R8,F1,F0,0)
+
MOV t21,t11
MOV t12,t11
- gsLQC1(R9,F9,F8,0)
+ gsLQC1(R9,F9,F8,0)
MOV t22,t11
- dsra K,KCO,2
- gsLQC1(R9,F11,F10,1)
-
+ dsra K,KCO,2
+ gsLQC1(R9,F11,F10,1)
+
MOV t13,t11
MOV t23,t11
-
+
MOV t14,t11
beqz K,.L25
MOV t24,t11
#endif
.L21: # nr=4,mr=2,kr=4
- gsLQC1(R8,F5,F4,1)
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,2)
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
-
+
gsLQC1(R8,F3,F2,2)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
@@ -778,7 +778,7 @@
MADD t24,t24,a5,b7
daddiu K,K,-1
- gsLQC1(R8,F7,F6,3)
+ gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b0
MADD t21,t21,a3,b0
@@ -811,7 +811,7 @@
bnez K,.L21
MADD t24,t24,a7,b7
-.L25:
+.L25:
#ifndef TRMMKERNEL
andi K,KCO,2 # kr=2
#else
@@ -820,12 +820,12 @@
beqz K,.L28
nop
-.L26:
- gsLQC1(R8,F5,F4,1)
+.L26:
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,2)
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
@@ -833,7 +833,7 @@
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
daddu A,A,4*SIZE # 2mr*2kr
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
daddu B,B,8*SIZE # 4nr*2kr
@@ -853,16 +853,16 @@
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
-
-.L28: # kr=1
+
+.L28: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L29
+ beqz K,.L29
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # 2mr*kr
@@ -880,11 +880,11 @@
.L29: # Write Back to C
#ifndef TRMMKERNEL
LD c11,0(CO1) # GEMM write back part
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
-
+
LD c13,0(CO3)
MADD t11,c11,t11,ALPHA
LD c23,1*SIZE(CO3)
@@ -923,25 +923,25 @@
#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
-
+
ST t11, 0 * SIZE(CO1)
MUL t12, ALPHA, t12
ST t21, 1 * SIZE(CO1)
MUL t22, ALPHA, t22
-
+
ST t12, 0 * SIZE(CO2)
MUL t13, ALPHA, t13
ST t22, 1 * SIZE(CO2)
MUL t23, ALPHA, t23
-
+
ST t13, 0 * SIZE(CO3)
MUL t14, ALPHA, t14
ST t23, 1 * SIZE(CO3)
MUL t24, ALPHA, t24
-
+
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
-
+
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
daddiu CO3,CO3, 2 * SIZE
@@ -974,7 +974,7 @@
.align 3
.L14_M1:
- andi M,MCO,1 # mr=1
+ andi M,MCO,1 # mr=1
beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
nop
@@ -1010,8 +1010,8 @@
nop
beqz K,.L35
nop
-
-#else
+
+#else
move B,BO # Reset B, GEMM part
dsra K,KCO,2 # K=KCO/2
LD a0, 0 * SIZE(A) # a0
@@ -1023,28 +1023,28 @@
MOV t13,t11
MOV t14,t11
gsLQC1(R9,F11,F10,1) # b2,b3
-
+
beqz K,.L35
nop
#endif
-.L31: # nr=4,mr=1,kr=4
+.L31: # nr=4,mr=1,kr=4
LD a1, 1*SIZE(A) # load a1
MADD t11,t11,a0,b0
-
+
gsLQC1(R9,F13,F12,2) # b4,b5
MADD t12,t12,a0,b1
-
+
gsLQC1(R9,F15,F14,3) # b6,b7
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
LD a2, 2*SIZE(A) # a2
MADD t11,t11,a1,b4
-
+
gsLQC1(R9,F9,F8,4)
MADD t12,t12,a1,b5
-
+
gsLQC1(R9,F11,F10,5)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
@@ -1052,11 +1052,11 @@
LD a3, 3*SIZE(A) # a3
MADD t11,t11,a2,b0
-
+
gsLQC1(R9,F13,F12,6)
MADD t12,t12,a2,b1
daddu A,A,4*SIZE # 1mr*4kr
-
+
gsLQC1(R9,F15,F14,7)
MADD t13,t13,a2,b2
MADD t14,t14,a2,b3
@@ -1064,10 +1064,10 @@
LD a0, 0*SIZE(A) # a0
MADD t11,t11,a3,b4
-
+
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a3,b5
-
+
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a3,b6
bnez K,.L31
@@ -1075,21 +1075,21 @@
.L35: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L38
nop
-.L36:
+.L36:
LD a1,1*SIZE(A) # load a1
MADD t11,t11,a0,b0
-
- gsLQC1(R9,F13,F12,2)
+
+ gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
daddu A,A,2*SIZE # mr*2kr
-
+
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
@@ -1099,38 +1099,38 @@
.L37:
LD a0,0(A)
MADD t11,t11,a1,b4
-
+
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a1,b5
-
+
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
-
+
.L38: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L39
+ beqz K,.L39
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
- daddu A,A,1*SIZE
+ daddu A,A,1*SIZE
daddu B,B,4*SIZE
-
+
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
.L39: # Write Back
#ifndef TRMMKERNEL
- LD c11,0(CO1)
+ LD c11,0(CO1)
LD c12,0(CO2)
LD c13,0(CO3)
LD c14,0(CO4)
-
+
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
MADD t13,c13,t13,ALPHA
@@ -1176,22 +1176,22 @@
.L0_N4_Loop: # mc finished
daddiu N,N,-1 # N--
#if defined(TRMMKERNEL) && !defined(LEFT)
- daddiu KK, KK,4
+ daddiu KK, KK,4
#endif
- bnez N,.L0_N4_Lb
+ bnez N,.L0_N4_Lb
move BO,B # Set BO point to next panel Bj
- .align 5
+ .align 5
.L0_N2:
andi N,NCO,2 # nr = 2
- beqz N,.L0_N1
+ beqz N,.L0_N1
nop
.L0_N2_Lb:
- move CO1,C
+ move CO1,C
daddu CO2,C,LDC
- dsra M,MCO,2
+ dsra M,MCO,2
move A,AO # Reset A
daddu PREA,AO,SPANA
@@ -1203,13 +1203,13 @@
beqz M,.L12_M2
nop
-.L40:
+.L40:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
dsll K,KK, 2 + BASE_SHIFT
- dsll TEMP, KK,1 + BASE_SHIFT
+ dsll TEMP, KK,1 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
@@ -1225,7 +1225,7 @@
MOV t12,t11
MOV t22,t11
gsLQC1(R8,F3,F2,1) # a2,a3
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
@@ -1233,7 +1233,7 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t32,t11
beqz K,.L45
MOV t42,t11
@@ -1250,28 +1250,28 @@
MOV t41,t11
dsra K,KCO,2 # K=KCO/2
gsLQC1(R8,F3,F2,1) # a2,a3
-
+
MOV t12,t11
MOV t22,t11
-
+
MOV t32,t11
beqz K,.L45
MOV t42,t11
#endif
.L41: # nr=2,mr=kr=4
- gsLQC1(R8,F5,F4,2)
+ gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,1)
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
-
+
FETCH $0,(PREA)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
@@ -1294,7 +1294,7 @@
MADD t42,t42,a7,b5
.L43:
- gsLQC1(R8,F5,F4,6)
+ gsLQC1(R8,F5,F4,6)
MADD t11,t11,a0,b2
MADD t21,t21,a1,b2
@@ -1305,7 +1305,7 @@
gsLQC1(R8,F7,F6,7)
MADD t31,t31,a2,b2
MADD t41,t41,a3,b2
- daddu B,B,8*SIZE # 2nr*4kr
+ daddu B,B,8*SIZE # 2nr*4kr
FETCH $0,8*SIZE(PREA)
MADD t32,t32,a2,b3
@@ -1335,19 +1335,19 @@
.L45: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L48
nop
-.L46:
- gsLQC1(R8,F5,F4,2)
+.L46:
+ gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,1)
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
@@ -1379,16 +1379,16 @@
MADD t42,t42,a7,b5
daddu PREA,PREA,8*SIZE
-
+
.L48: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L49
+ beqz K,.L49
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -1408,7 +1408,7 @@
.L49: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # gemm write back part Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -1429,7 +1429,7 @@
MADD t32,c32,t32,ALPHA
ST t41,3*SIZE(CO1)
MADD t42,c42,t42,ALPHA
- daddiu M,M,-1
+ daddiu M,M,-1
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
@@ -1441,8 +1441,8 @@
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
- daddu CO1,CO1,4*SIZE
- bnez M,.L40
+ daddu CO1,CO1,4*SIZE
+ bnez M,.L40
daddu CO2,CO2,4*SIZE
#else
@@ -1450,7 +1450,7 @@
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
-
+
MUL t12, ALPHA, t12
ST t11, 0 * SIZE(CO1)
MUL t22, ALPHA, t22
@@ -1459,13 +1459,13 @@
ST t31, 2 * SIZE(CO1)
MUL t42, ALPHA, t42
ST t41, 3 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
daddiu M,M,-1
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
-
+
daddiu CO1,CO1, 4*SIZE
daddiu CO2,CO2, 4*SIZE
@@ -1499,7 +1499,7 @@
.align 3
.L12_M2:
andi M,MCO,2 # mr = 2
- beqz M,.L12_M1
+ beqz M,.L12_M1
nop
.L50:
@@ -1525,7 +1525,7 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t12,t11
beqz K,.L55
MOV t22,t11
@@ -1538,18 +1538,18 @@
MTC $0,t11
MOV t21,t11
gsLQC1(R9,F9,F8,0) #b0,b1
-
+
MOV t12,t11
beqz K,.L55
MOV t22,t11
#endif
.L51: # nr=2 mr=2,kr=4
- gsLQC1(R8,F5,F4,1)
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
- gsLQC1(R9,F13,F12,1)
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
@@ -1562,7 +1562,7 @@
MADD t22,t22,a5,b5
daddiu K,K,-1
- gsLQC1(R8,F7,F6,3)
+ gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b2
MADD t21,t21,a3,b2
daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
@@ -1583,20 +1583,20 @@
.L55: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L58
nop
-.L56:
- gsLQC1(R8,F5,F4,1)
+.L56:
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
- gsLQC1(R9,F13,F12,1)
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
daddu B,B,4*SIZE # 2nr*2kr
@@ -1610,16 +1610,16 @@
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
-
+
.L58: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP, 1
#endif
- beqz K,.L59
+ beqz K,.L59
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
@@ -1632,10 +1632,10 @@
.L59: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # write gemm part back Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
-
+
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
MADD t12,c12,t12,ALPHA
@@ -1646,7 +1646,7 @@
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
- daddu CO1,CO1,2*SIZE
+ daddu CO1,CO1,2*SIZE
daddu CO2,CO2,2*SIZE
FETCH $0,0(CO1)
@@ -1692,7 +1692,7 @@
.align 3
.L12_M1:
andi M,MCO,1 # mr = 1
- beqz M,.L0_N2_Loop
+ beqz M,.L0_N2_Loop
nop
.L60:
@@ -1708,10 +1708,10 @@
#endif
MTC $0,t11
LD a0, 0*SIZE(A) # a0
-
+
MOV t21,t11
gsLQC1(R9,F9,F8,0) # b0,b1
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
@@ -1719,42 +1719,42 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t12,t11
beqz K,.L65
MOV t22,t11
#else
- dsra K,KCO,2
+ dsra K,KCO,2
move B,BO # Reset B
LD a0,0*SIZE(A)
-
+
MTC $0,t11
MOV t21,t11
- gsLQC1(R9,F9,F8,0)
+ gsLQC1(R9,F9,F8,0)
MOV t12,t11
beqz K,.L65
MOV t22,t11
#endif
-.L61: # nr=2,mr=1,kr=4
+.L61: # nr=2,mr=1,kr=4
LD a4, 1*SIZE(A) # a2
MADD t11,t11,a0,b0
-
- gsLQC1(R9,F13,F12,1)
+
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
LD a2, 2*SIZE(A) # a3
MADD t11,t11,a4,b4
-
+
gsLQC1(R9,F11,F10,2)
MADD t12,t12,a4,b5
LD a6, 3*SIZE(A) # a4
MADD t11,t11,a2,b2
daddiu K,K,-1
-
+
gsLQC1(R9,F15,F14,3)
MADD t12,t12,a2,b3
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
@@ -1762,46 +1762,46 @@
LD a0, 0*SIZE(A)
MADD t11,t11,a6,b6
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
-
+
gsLQC1(R9,F9,F8,0) # a0
bnez K,.L61
MADD t12,t12,a6,b7
.L65: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L68
nop
-.L66:
+.L66:
LD a4, 1*SIZE(A) # a1
MADD t11,t11,a0,b0
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
-
- gsLQC1(R9,F13,F12,1)
+
+ gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
daddu B,B,4*SIZE
.L67:
LD a0,0(A) # a0
MADD t11,t11,a4,b4
-
+
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
-
+
.L68: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L69
+ beqz K,.L69
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
@@ -1812,14 +1812,14 @@
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
LD c12,0(CO2)
-
+
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
- daddu CO1,CO1,1*SIZE
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#else
@@ -1829,7 +1829,7 @@
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
- daddu CO1,CO1,1*SIZE
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1859,15 +1859,15 @@
move BO, B
- .align 5
+ .align 5
.L0_N1:
andi N,NCO,1 # nr = 1
- beqz N,.L999
+ beqz N,.L999
nop
- move CO1,C
- dsra M,MCO,2
-
+ move CO1,C
+ dsra M,MCO,2
+
move A,AO # Reset A
daddu PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
@@ -1877,7 +1877,7 @@
beqz M,.L11_M2
daddu C,CO1,LDC
-.L70:
+.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO # Reset B
@@ -1891,7 +1891,7 @@
MTC $0,t11
LD b0, 0*SIZE(B)
-
+
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
@@ -1904,23 +1904,23 @@
#else
daddiu TEMP, KK, 1
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t41,t11
beqz K,.L75
nop
#else
move B, BO # Reset B
- dsra K,KCO,2
+ dsra K,KCO,2
LD b0, 0*SIZE(B)
-
+
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
-
+
MOV t31,t11
MOV t41,t11
gsLQC1(R8,F3,F2,1) #a2,a3
-
+
beqz K,.L75
nop
#endif
@@ -1928,8 +1928,8 @@
.L71: # nr=1,mr=kr=4
LD b4, 1*SIZE(B) # b1
MADD t11,t11,a0,b0
-
- gsLQC1(R8,F5,F4,2)
+
+ gsLQC1(R8,F5,F4,2)
MADD t21,t21,a1,b0
gsLQC1(R8,F7,F6,3)
@@ -1952,8 +1952,8 @@
LD b6, 3*SIZE(B)
MADD t11,t11,a0,b2
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
-
- gsLQC1(R8,F5,F4,6)
+
+ gsLQC1(R8,F5,F4,6)
MADD t21,t21,a1,b2
FETCH $0,8*SIZE(PREA)
@@ -1966,7 +1966,7 @@
LD b0, 0*SIZE(B)
MADD t11,t11,a4,b6
daddu PREA,PREA,16*SIZE
-
+
gsLQC1(R8,F1,F0,0)
MADD t21,t21,a5,b6
daddiu K,K,-1
@@ -1980,19 +1980,19 @@
.L75: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L78
nop
-.L76:
+.L76:
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
-
- gsLQC1(R8,F5,F4,2)
+
+ gsLQC1(R8,F5,F4,2)
MADD t21,t21,a1,b0
FETCH $0,0(PREA)
@@ -2004,7 +2004,7 @@
.L77:
LD b0,0(B)
MADD t11,t11,a4,b4
-
+
gsLQC1(R8,F1,F0,0)
MADD t21,t21,a5,b4
FETCH $0,4*SIZE(PREA)
@@ -2014,16 +2014,16 @@
MADD t41,t41,a7,b4
daddu PREA,PREA,8*SIZE
-
+
.L78: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L79
+ beqz K,.L79
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -2038,7 +2038,7 @@
.L79: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -2073,7 +2073,7 @@
FETCH $0,4*SIZE(CO1)
FETCH $0,8*SIZE(CO1)
- daddu CO1,CO1,4*SIZE
+ daddu CO1,CO1,4*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
@@ -2092,7 +2092,7 @@
#ifdef LEFT
daddiu KK, KK, 4
#endif
- bnez M,.L70
+ bnez M,.L70
nop
#endif
@@ -2100,10 +2100,10 @@
.align 3
.L11_M2:
andi M,MCO,2 # mr = 2
- beqz M,.L11_M1
+ beqz M,.L11_M1
nop
-.L80:
+.L80:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
@@ -2117,7 +2117,7 @@
LD b0, 0*SIZE(B)
MTC $0,t11
-
+
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -2132,20 +2132,20 @@
nop
#else
move B, BO
- dsra K,KCO,2
+ dsra K,KCO,2
LD b0, 0*SIZE(B)
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
-
+
beqz K,.L85
nop
#endif
.L81: # nr=1,mr=2,kr=4
LD b4, 1*SIZE(B)
- gsLQC1(R8,F5,F4,1)
+ gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -2153,7 +2153,7 @@
gsLQC1(R8,F3,F2,2)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
-
+
LD b6, 3*SIZE(B)
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b2
@@ -2166,44 +2166,44 @@
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a6,b6
MADD t21,t21,a7,b6
-
+
daddiu K,K,-1
bnez K,.L81
nop
.L85: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L88
nop
-.L86:
- gsLQC1(R8,F5,F4,1)
+.L86:
+ gsLQC1(R8,F5,F4,1)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
-
+
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
-
+
gsLQC1(R8,F1,F0,0)
LD b0,0(B)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
-
+
.L88: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L89
+ beqz K,.L89
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
@@ -2213,7 +2213,7 @@
.L89: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
@@ -2222,7 +2222,7 @@
ST t21,1*SIZE(CO1)
FETCH $0,2*SIZE(CO1)
-
+
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
#else
@@ -2257,10 +2257,10 @@
.align 3
.L11_M1:
andi M,MCO,1 # mr = 1
- beqz M,.L999
+ beqz M,.L999
nop
-.L90:
+.L90:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
@@ -2289,7 +2289,7 @@
move B, BO
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
- dsra K,KCO,2
+ dsra K,KCO,2
beqz K,.L95
MTC $0,t11
#endif
@@ -2298,7 +2298,7 @@
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD a2, 2*SIZE(A)
LD b2, 2*SIZE(B)
MADD t11,t11,a4,b4
@@ -2306,28 +2306,28 @@
LD a6, 3*SIZE(A)
LD b6, 3*SIZE(B)
MADD t11,t11,a2,b2
-
+
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
MADD t11,t11,a6,b6
-
+
daddiu K,K,-1
bnez K,.L91
nop
.L95: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L98
nop
-.L96:
+.L96:
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
@@ -2337,14 +2337,14 @@
LD b0,0(B)
LD a0,0(A)
MADD t11,t11,a4,b4
-
+
.L98: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L99
+ beqz K,.L99
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
diff --git a/kernel/mips64/dgemm_kernel_loongson3b_4x4.S b/kernel/mips64/dgemm_kernel_loongson3b_4x4.S
index 4a8c9b0..10c5f47 100644
--- a/kernel/mips64/dgemm_kernel_loongson3b_4x4.S
+++ b/kernel/mips64/dgemm_kernel_loongson3b_4x4.S
@@ -110,7 +110,7 @@
#define F27 27
#define F26 26
#define F25 25
-#define F24 24
+#define F24 24
#define F23 23
#define F22 22
#define F21 21
@@ -118,7 +118,7 @@
#define F19 19
#define F18 18
#define F17 17
-#define F16 16
+#define F16 16
#define F15 15
#define F14 14
#define F13 13
@@ -130,14 +130,14 @@
#define F7 7
#define F6 6
#define F5 5
-#define F4 4
-#define F3 3
-#define F2 2
-#define F1 1
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
#define F0 0
PROLOGUE
-
+
daddiu $sp, $sp, -160
sd $16, 0($sp)
sd $17, 8($sp)
@@ -160,7 +160,7 @@
ST $f23,144($sp)
- .align 5
+ .align 5
.L0_N4: # Loop N
ST ALPHA,152($sp) # Backup ALPHA
move MCO,M # Backup M
@@ -170,26 +170,26 @@
move AO,A # Backup A_addr
dsra N,NCO,2 # N=NCO/2
-
+
dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
-
+
#if defined(TRMMKERNEL)
- LDARG OFFSET,160($sp) # OFFSET is relate to the data part
+ LDARG OFFSET,160($sp) # OFFSET is relate to the data part
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
- neg KK,OFFSET
+ neg KK,OFFSET
#endif
-
+
move BO,B # Backup B_addr
beq N,$0,.L0_N2 # N=0,NCO<4
dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte
.L0_N4_Lb: # mr=4,nr=4
- move CO1,C
+ move CO1,C
dsra M,MCO,2 # M=MCO/2
-
+
move A,AO # Reset A
daddu CO2,C,LDC
@@ -200,7 +200,7 @@
daddu CO4,CO3,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
- move KK,OFFSET
+ move KK,OFFSET
#endif
beqz M,.L14_M2
daddu C,CO4,LDC # move C to next panel Cj
@@ -227,18 +227,18 @@
MOV t41,t11
MOV t12,t11
LD b0,0(B)
-
+
MOV t22,t11
MOV t32,t11
LD b1,1*SIZE(B)
MOV t42,t11
LD a2,2*SIZE(A)
-
+
MOV t13,t11
MOV t23,t11
LD b2,2*SIZE(B)
-
+
MOV t33,t11
MOV t43,t11
LD a3,3*SIZE(A)
@@ -250,7 +250,7 @@
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK # temp is the length of the data part
#elif defined(LEFT)
- daddiu TEMP, KK, 4 # S=L,U=L
+ daddiu TEMP, KK, 4 # S=L,U=L
#else
daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part
#endif
@@ -259,7 +259,7 @@
beqz K,.L15
MOV t44,t11
-#else
+#else
move B,BO # Reset B
MTC $0,t11 # GEMM part NR=4,MR=4
LD a0,0(A)
@@ -271,7 +271,7 @@
MOV t41,t11
MOV t12,t11
LD b0,0(B)
-
+
MOV t22,t11
MOV t32,t11
LD b1,1*SIZE(B)
@@ -279,11 +279,11 @@
MOV t42,t11
dsra K,KCO,2 # K=KCO/2
LD a2,2*SIZE(A)
-
+
MOV t13,t11
MOV t23,t11
LD b2,2*SIZE(B)
-
+
MOV t33,t11
MOV t43,t11
LD a3,3*SIZE(A)
@@ -296,7 +296,7 @@
beqz K,.L15
MOV t44,t11 # clear 16 results registers
#endif
-
+
.align 5
.L11: # kr=4
MADD t11,t11,a0,b0
@@ -306,29 +306,29 @@
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
LD a5,5*SIZE(A)
-
+
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
LD b4,4*SIZE(B)
-
+
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
LD b5,5*SIZE(B)
FETCH $0,(PREB)
-
+
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
LD a6,6*SIZE(A)
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
LD b6,6*SIZE(B)
FETCH $0,(PREA)
-
+
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
LD a7,7*SIZE(A)
-
+
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
LD b7,7*SIZE(B)
@@ -447,14 +447,14 @@
.L15: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP, 2
#endif
beqz K,.L18
nop
-.L16:
+.L16:
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
LD a4,4*SIZE(A)
@@ -528,16 +528,16 @@
daddu PREB,PREB,8*SIZE
LD b3,3*SIZE(B)
-
+
.L18: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L19
+ beqz K,.L19
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREB)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -569,8 +569,8 @@
MADD t44,t44,a3,b3
.L19: # Write Back to C
-#ifndef TRMMKERNEL
- LD c11,0(CO1) # GEMM write part
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write part
LD c21,1*SIZE(CO1) # get 16 C
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -640,11 +640,11 @@
daddu CO3,CO3,4*SIZE
ST t44,3*SIZE(CO4)
daddu PREB,BO,SPANB
-
- bnez M,.L10
+
+ bnez M,.L10
daddu CO4,CO4,4*SIZE
-#else
+#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
@@ -685,7 +685,7 @@
daddiu CO1,CO1, 4 * SIZE
daddiu CO2,CO2, 4 * SIZE
daddiu CO3,CO3, 4 * SIZE
- daddiu CO4,CO4, 4 * SIZE
+ daddiu CO4,CO4, 4 * SIZE
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
@@ -698,7 +698,7 @@
FETCH $0,0(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- dsubu TEMP,KCO,KK
+ dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP, -4
#else
@@ -710,10 +710,10 @@
daddu B,B,TEMP # mov B to the end of panel Bj
#endif
-#ifdef LEFT
+#ifdef LEFT
daddiu KK, KK,4
#endif
- bnez M,.L10
+ bnez M,.L10
nop
#endif
@@ -721,7 +721,7 @@
.align 3
.L14_M2:
andi M, MCO, 2 # nr=4,mr=2
- beqz M,.L14_M1
+ beqz M,.L14_M1
nop
.L20:
@@ -729,7 +729,7 @@
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
- dsll K,KK,1 + BASE_SHIFT # mr=2
+ dsll K,KK,1 + BASE_SHIFT # mr=2
dsll TEMP,KK,2 + BASE_SHIFT # nr=4
daddu A,A,K
daddu B,BO,TEMP
@@ -738,7 +738,7 @@
LD a0,0*SIZE(A)
MTC $0,t11
LD a1,1*SIZE(A)
-
+
MOV t21,t11
LD b0,0*SIZE(B)
MOV t12,t11
@@ -764,18 +764,18 @@
MOV t24,t11 # clear 2*4=8 results registers
#else
- move B,BO # Reset B
+ move B,BO # Reset B
LD a0,0*SIZE(A)
MTC $0,t11
LD a1,1*SIZE(A)
-
+
MOV t21,t11
LD b0,0*SIZE(B)
MOV t12,t11
LD b1,1*SIZE(B)
MOV t22,t11
- dsra K,KCO,2
+ dsra K,KCO,2
LD b2,2*SIZE(B)
MOV t13,t11
@@ -806,7 +806,7 @@
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
-
+
MADD t11,t11,a4,b4
LD a2,4*SIZE(A)
MADD t21,t21,a5,b4
@@ -866,7 +866,7 @@
MADD t24,t24,a7,b7
-.L25:
+.L25:
#ifndef TRMMKERNEL
andi K,KCO,2 # kr=2
#else
@@ -875,7 +875,7 @@
beqz K,.L28
nop
-.L26:
+.L26:
MADD t11,t11,a0,b0
LD a4,2*SIZE(A)
MADD t21,t21,a1,b0
@@ -890,7 +890,7 @@
LD b6,6*SIZE(B)
MADD t23,t23,a1,b2
LD b7,7*SIZE(B)
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
daddu A,A,4*SIZE # 2mr*2kr
@@ -915,16 +915,16 @@
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
-
-.L28: # kr=1
+
+.L28: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L29
+ beqz K,.L29
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # 2mr*kr
@@ -942,11 +942,11 @@
.L29: # Write Back to C
#ifndef TRMMKERNEL
LD c11,0(CO1) # GEMM write back part
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
-
+
LD c13,0(CO3)
MADD t11,c11,t11,ALPHA
LD c23,1*SIZE(CO3)
@@ -985,25 +985,25 @@
#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
-
+
ST t11, 0 * SIZE(CO1)
MUL t12, ALPHA, t12
ST t21, 1 * SIZE(CO1)
MUL t22, ALPHA, t22
-
+
ST t12, 0 * SIZE(CO2)
MUL t13, ALPHA, t13
ST t22, 1 * SIZE(CO2)
MUL t23, ALPHA, t23
-
+
ST t13, 0 * SIZE(CO3)
MUL t14, ALPHA, t14
ST t23, 1 * SIZE(CO3)
MUL t24, ALPHA, t24
-
+
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
-
+
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
daddiu CO3,CO3, 2 * SIZE
@@ -1036,7 +1036,7 @@
.align 3
.L14_M1:
- andi M,MCO,1 # mr=1
+ andi M,MCO,1 # mr=1
beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
nop
@@ -1056,13 +1056,13 @@
MTC $0,t11
LD b0,0*SIZE(B)
-
+
MOV t12,t11
LD b1,1*SIZE(B)
MOV t13,t11
LD b2,2*SIZE(B)
-
+
MOV t14,t11
LD b3,3*SIZE(B)
@@ -1077,35 +1077,35 @@
nop
beqz K,.L35
nop
-
-#else
+
+#else
move B,BO # Reset B, GEMM part
dsra K,KCO,2 # K=KCO/2
LD a0, 0 * SIZE(A) # a0
MTC $0,t11
LD b0,0*SIZE(B)
-
+
MOV t12,t11
LD b1,1*SIZE(B)
MOV t13,t11
LD b2,2*SIZE(B)
-
+
MOV t14,t11
beqz K,.L35
LD b3,3*SIZE(B)
#endif
-.L31: # nr=4,mr=1,kr=4
+.L31: # nr=4,mr=1,kr=4
LD a1, 1*SIZE(A) # load a1
MADD t11,t11,a0,b0
-
+
LD b4,4*SIZE(B)
LD b5,5*SIZE(B)
MADD t12,t12,a0,b1
-
+
LD b6,6*SIZE(B)
LD b7,7*SIZE(B)
MADD t13,t13,a0,b2
@@ -1113,11 +1113,11 @@
LD a2, 2*SIZE(A) # a2
MADD t11,t11,a1,b4
-
+
LD b0,8*SIZE(B)
LD b1,9*SIZE(B)
MADD t12,t12,a1,b5
-
+
LD b2,10*SIZE(B)
LD b3,11*SIZE(B)
MADD t13,t13,a1,b6
@@ -1126,12 +1126,12 @@
LD a3, 3*SIZE(A) # a3
MADD t11,t11,a2,b0
daddiu K,K,-1
-
+
LD b4,12*SIZE(B)
LD b5,13*SIZE(B)
MADD t12,t12,a2,b1
daddu A,A,4*SIZE # 1mr*4kr
-
+
LD b6,14*SIZE(B)
LD b7,15*SIZE(B)
MADD t13,t13,a2,b2
@@ -1140,7 +1140,7 @@
LD a0, 0*SIZE(A) # a0
daddu B,B,16*SIZE # 4nr*4kr
MADD t11,t11,a3,b4
-
+
LD b0,0*SIZE(B)
MADD t12,t12,a3,b5
LD b1,1*SIZE(B)
@@ -1154,14 +1154,14 @@
.L35: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L38
nop
-.L36:
+.L36:
LD a1,1*SIZE(A) # load a1
MADD t11,t11,a0,b0
@@ -1169,10 +1169,10 @@
LD b5,5*SIZE(B)
MADD t12,t12,a0,b1
daddu A,A,2*SIZE # mr*2kr
-
+
LD b6,6*SIZE(B)
MADD t13,t13,a0,b2
-
+
LD b7,7*SIZE(B)
MADD t14,t14,a0,b3
daddu B,B,8*SIZE # 4nr*2kr
@@ -1181,41 +1181,41 @@
.L37:
LD a0,0(A)
MADD t11,t11,a1,b4
-
+
LD b0,0*SIZE(B)
LD b1,1*SIZE(B)
MADD t12,t12,a1,b5
-
+
LD b2,2*SIZE(B)
LD b3,3*SIZE(B)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
-
-
+
+
.L38: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L39
+ beqz K,.L39
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
- daddu A,A,1*SIZE
+ daddu A,A,1*SIZE
daddu B,B,4*SIZE
-
+
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
.L39: # Write Back
#ifndef TRMMKERNEL
- LD c11,0(CO1)
+ LD c11,0(CO1)
LD c12,0(CO2)
LD c13,0(CO3)
LD c14,0(CO4)
-
+
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
MADD t13,c13,t13,ALPHA
@@ -1261,22 +1261,22 @@
.L0_N4_Loop: # mc finished
daddiu N,N,-1 # N--
#if defined(TRMMKERNEL) && !defined(LEFT)
- daddiu KK, KK,4
+ daddiu KK, KK,4
#endif
- bnez N,.L0_N4_Lb
+ bnez N,.L0_N4_Lb
move BO,B # Set BO point to next panel Bj
- .align 5
+ .align 5
.L0_N2:
andi N,NCO,2 # nr = 2
- beqz N,.L0_N1
+ beqz N,.L0_N1
nop
.L0_N2_Lb:
- move CO1,C
+ move CO1,C
daddu CO2,C,LDC
- dsra M,MCO,2
+ dsra M,MCO,2
move A,AO # Reset A
daddu PREA,AO,SPANA
@@ -1288,13 +1288,13 @@
beqz M,.L12_M2
nop
-.L40:
+.L40:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
dsll K,KK, 2 + BASE_SHIFT
- dsll TEMP, KK,1 + BASE_SHIFT
+ dsll TEMP, KK,1 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
@@ -1311,10 +1311,10 @@
MOV t41,t11
LD a2,2*SIZE(A)
LD a3,3*SIZE(A)
-
+
MOV t12,t11
MOV t22,t11
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
@@ -1322,7 +1322,7 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t32,t11
beqz K,.L45
MOV t42,t11
@@ -1342,10 +1342,10 @@
LD a2,2*SIZE(A)
dsra K,KCO,2 # K=KCO/2
LD a3,3*SIZE(A)
-
+
MOV t12,t11
MOV t22,t11
-
+
MOV t32,t11
beqz K,.L45
MOV t42,t11
@@ -1411,9 +1411,9 @@
FETCH $0,8*SIZE(PREA)
MADD t32,t32,a2,b3
MADD t42,t42,a3,b3
-
+
daddu A,A,16*SIZE # 4mr*4kr
- daddu B,B,8*SIZE # 2nr*4kr
+ daddu B,B,8*SIZE # 2nr*4kr
.L44:
MADD t11,t11,a4,b6
@@ -1443,14 +1443,14 @@
.L45: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L48
nop
-.L46:
+.L46:
MADD t11,t11,a0,b0
LD a4,4*SIZE(A)
MADD t21,t21,a1,b0
@@ -1469,7 +1469,7 @@
FETCH $0,0(PREA)
MADD t32,t32,a2,b1
daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
-
+
MADD t42,t42,a3,b1
daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
@@ -1495,16 +1495,16 @@
daddu PREA,PREA,8*SIZE
-
+
.L48: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L49
+ beqz K,.L49
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -1524,7 +1524,7 @@
.L49: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # gemm write back part Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -1545,7 +1545,7 @@
MADD t32,c32,t32,ALPHA
ST t41,3*SIZE(CO1)
MADD t42,c42,t42,ALPHA
- daddiu M,M,-1
+ daddiu M,M,-1
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
@@ -1557,8 +1557,8 @@
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
- daddu CO1,CO1,4*SIZE
- bnez M,.L40
+ daddu CO1,CO1,4*SIZE
+ bnez M,.L40
daddu CO2,CO2,4*SIZE
#else
@@ -1566,7 +1566,7 @@
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
-
+
MUL t12, ALPHA, t12
ST t11, 0 * SIZE(CO1)
MUL t22, ALPHA, t22
@@ -1575,13 +1575,13 @@
ST t31, 2 * SIZE(CO1)
MUL t42, ALPHA, t42
ST t41, 3 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
daddiu M,M,-1
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
-
+
daddiu CO1,CO1, 4*SIZE
daddiu CO2,CO2, 4*SIZE
@@ -1615,7 +1615,7 @@
.align 3
.L12_M2:
andi M,MCO,2 # mr = 2
- beqz M,.L12_M1
+ beqz M,.L12_M1
nop
.L50:
@@ -1636,7 +1636,7 @@
LD b0,0*SIZE(B)
MOV t21,t11
LD b1,1*SIZE(B)
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
@@ -1644,7 +1644,7 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t12,t11
beqz K,.L55
MOV t22,t11
@@ -1659,7 +1659,7 @@
LD b0,0*SIZE(B)
MOV t21,t11
LD b1,1*SIZE(B)
-
+
MOV t12,t11
beqz K,.L55
MOV t22,t11
@@ -1715,14 +1715,14 @@
.L55: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L58
nop
-.L56:
+.L56:
MADD t11,t11,a0,b0
LD a4,2*SIZE(A)
MADD t21,t21,a1,b0
@@ -1752,9 +1752,9 @@
#else
andi K,TEMP, 1
#endif
- beqz K,.L59
+ beqz K,.L59
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
@@ -1767,10 +1767,10 @@
.L59: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # write gemm part back Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
-
+
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
MADD t12,c12,t12,ALPHA
@@ -1781,7 +1781,7 @@
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
- daddu CO1,CO1,2*SIZE
+ daddu CO1,CO1,2*SIZE
daddu CO2,CO2,2*SIZE
FETCH $0,0(CO1)
@@ -1827,7 +1827,7 @@
.align 3
.L12_M1:
andi M,MCO,1 # mr = 1
- beqz M,.L0_N2_Loop
+ beqz M,.L0_N2_Loop
nop
.L60:
@@ -1842,7 +1842,7 @@
daddu B, BO, TEMP
#endif
LD a0,0*SIZE(A)
-
+
MTC $0,t11
MOV t21,t11
LD b0,0*SIZE(B)
@@ -1857,16 +1857,16 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t22,t11
beqz K,.L65
nop
#else
- dsra K,KCO,2
+ dsra K,KCO,2
move B,BO # Reset B
LD a0,0*SIZE(A)
-
+
MTC $0,t11
MOV t21,t11
LD b0,0*SIZE(B)
@@ -1878,18 +1878,18 @@
#endif
-.L61: # nr=2,mr=1,kr=4
+.L61: # nr=2,mr=1,kr=4
LD a4, 1*SIZE(A) # a2
LD b4, 2*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD b5,3*SIZE(B)
MADD t12,t12,a0,b1
LD a2, 2*SIZE(A) # a3
LD b2,4*SIZE(B)
MADD t11,t11,a4,b4
-
+
LD b3,5*SIZE(B)
MADD t12,t12,a4,b5
@@ -1897,17 +1897,17 @@
daddiu K,K,-1
LD b6,6*SIZE(B)
MADD t11,t11,a2,b2
-
+
LD b7,7*SIZE(B)
MADD t12,t12,a2,b3
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
-
- LD b0,0*SIZE(B)
+
+ LD b0,0*SIZE(B)
MADD t11,t11,a6,b6
-
+
LD b1,1*SIZE(B)
bnez K,.L61
MADD t12,t12,a6,b7
@@ -1916,19 +1916,19 @@
.L65: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L68
nop
-.L66:
+.L66:
LD a4, 1*SIZE(A) # a1
MADD t11,t11,a0,b0
LD b4,2*SIZE(B)
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
-
+
LD b5,3*SIZE(B)
MADD t12,t12,a0,b1
daddu B,B,4*SIZE
@@ -1937,7 +1937,7 @@
LD a0,0(A) # a0
LD b0,0*SIZE(B)
MADD t11,t11,a4,b4
-
+
LD b1,1*SIZE(B)
MADD t12,t12,a4,b5
@@ -1948,9 +1948,9 @@
#else
andi K,TEMP,1
#endif
- beqz K,.L69
+ beqz K,.L69
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
@@ -1961,14 +1961,14 @@
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
LD c12,0(CO2)
-
+
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
- daddu CO1,CO1,1*SIZE
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#else
@@ -1978,7 +1978,7 @@
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
- daddu CO1,CO1,1*SIZE
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -2008,15 +2008,15 @@
move BO, B
- .align 5
+ .align 5
.L0_N1:
andi N,NCO,1 # nr = 1
- beqz N,.L999
+ beqz N,.L999
nop
- move CO1,C
- dsra M,MCO,2
-
+ move CO1,C
+ dsra M,MCO,2
+
move A,AO # Reset A
daddu PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
@@ -2026,7 +2026,7 @@
beqz M,.L11_M2
daddu C,CO1,LDC
-.L70:
+.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO # Reset B
@@ -2038,12 +2038,12 @@
daddu B, BO, TEMP
#endif
LD b0, 0*SIZE(B)
-
+
MTC $0,t11
LD a0,0*SIZE(A)
MOV t21,t11
LD a1,1*SIZE(A)
-
+
MOV t31,t11
LD a2,2*SIZE(A)
MOV t41,t11
@@ -2057,19 +2057,19 @@
#else
daddiu TEMP, KK, 1
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
beqz K,.L75
nop
#else
move B, BO # Reset B
- dsra K,KCO,2
+ dsra K,KCO,2
LD b0, 0*SIZE(B)
-
+
MTC $0,t11
LD a0,0*SIZE(A)
MOV t21,t11
LD a1,1*SIZE(A)
-
+
MOV t31,t11
LD a2,2*SIZE(A)
MOV t41,t11
@@ -2081,7 +2081,7 @@
.L71: # nr=1,mr=kr=4
LD b4, 1*SIZE(B) # b1
MADD t11,t11,a0,b0
-
+
LD a4, 4*SIZE(A)
MADD t21,t21,a1,b0
@@ -2097,7 +2097,7 @@
.L72:
LD b2, 2*SIZE(B) # b2
MADD t11,t11,a4,b4
-
+
LD a0,8*SIZE(A)
MADD t21,t21,a5,b4
@@ -2106,17 +2106,17 @@
LD a2,10*SIZE(A)
MADD t31,t31,a6,b4
-
+
LD a3,11*SIZE(A)
MADD t41,t41,a7,b4
.L73:
LD b6, 3*SIZE(B)
MADD t11,t11,a0,b2
-
+
LD a4,12*SIZE(A)
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
-
+
LD a5,13*SIZE(A)
MADD t21,t21,a1,b2
@@ -2131,7 +2131,7 @@
.L74:
LD b0, 0*SIZE(B)
MADD t11,t11,a4,b6
-
+
LD a0,0*SIZE(A)
daddu PREA,PREA,16*SIZE
@@ -2150,20 +2150,20 @@
.L75: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L78
nop
-.L76:
+.L76:
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD a4,4*SIZE(A)
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
-
+
LD a5,5*SIZE(A)
MADD t21,t21,a1,b0
FETCH $0,0(PREA)
@@ -2193,16 +2193,16 @@
daddu PREA,PREA,8*SIZE
-
+
.L78: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L79
+ beqz K,.L79
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -2217,7 +2217,7 @@
.L79: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -2252,7 +2252,7 @@
FETCH $0,4*SIZE(CO1)
FETCH $0,8*SIZE(CO1)
- daddu CO1,CO1,4*SIZE
+ daddu CO1,CO1,4*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
@@ -2271,7 +2271,7 @@
#ifdef LEFT
daddiu KK, KK, 4
#endif
- bnez M,.L70
+ bnez M,.L70
nop
#endif
@@ -2279,10 +2279,10 @@
.align 3
.L11_M2:
andi M,MCO,2 # mr = 2
- beqz M,.L11_M1
+ beqz M,.L11_M1
nop
-.L80:
+.L80:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
@@ -2312,13 +2312,13 @@
nop
#else
move B, BO
- dsra K,KCO,2
+ dsra K,KCO,2
LD b0, 0*SIZE(B)
MTC $0,t11
MOV t21,t11
LD a0,0*SIZE(A)
-
+
beqz K,.L85
LD a1,1*SIZE(A)
@@ -2336,7 +2336,7 @@
MADD t11,t11,a4,b4
LD a3,5*SIZE(A)
MADD t21,t21,a5,b4
-
+
LD b6, 3*SIZE(B)
LD a6,6*SIZE(A)
MADD t11,t11,a2,b2
@@ -2358,23 +2358,23 @@
.L85: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L88
nop
-.L86:
+.L86:
LD b4, 1*SIZE(B)
LD a4,2*SIZE(A)
MADD t11,t11,a0,b0
LD a5,3*SIZE(A)
MADD t21,t21,a1,b0
-
+
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
-
+
LD b0,0(B)
LD a0,0*SIZE(A)
MADD t11,t11,a4,b4
@@ -2382,16 +2382,16 @@
MADD t21,t21,a5,b4
-
+
.L88: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L89
+ beqz K,.L89
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
@@ -2401,7 +2401,7 @@
.L89: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
@@ -2410,7 +2410,7 @@
ST t21,1*SIZE(CO1)
FETCH $0,2*SIZE(CO1)
-
+
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
#else
@@ -2445,10 +2445,10 @@
.align 3
.L11_M1:
andi M,MCO,1 # mr = 1
- beqz M,.L999
+ beqz M,.L999
nop
-.L90:
+.L90:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
@@ -2478,7 +2478,7 @@
move B, BO
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
- dsra K,KCO,2
+ dsra K,KCO,2
beqz K,.L95
MTC $0,t11
#endif
@@ -2487,7 +2487,7 @@
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD a2, 2*SIZE(A)
LD b2, 2*SIZE(B)
MADD t11,t11,a4,b4
@@ -2495,28 +2495,28 @@
LD a6, 3*SIZE(A)
LD b6, 3*SIZE(B)
MADD t11,t11,a2,b2
-
+
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
MADD t11,t11,a6,b6
-
+
daddiu K,K,-1
bnez K,.L91
nop
.L95: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L98
nop
-.L96:
+.L96:
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
@@ -2526,14 +2526,14 @@
LD b0,0(B)
LD a0,0(A)
MADD t11,t11,a4,b4
-
+
.L98: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L99
+ beqz K,.L99
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S
index 595eb96..a095e05 100644
--- a/kernel/mips64/dnrm2.S
+++ b/kernel/mips64/dnrm2.S
@@ -43,7 +43,7 @@
#define X $5
#define INCX $6
#define XX $7
-
+
#define I $2
#define TEMP $3
@@ -71,7 +71,7 @@
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S
index 6220b6a..cb6fbe9 100644
--- a/kernel/mips64/dot.S
+++ b/kernel/mips64/dot.S
@@ -44,7 +44,7 @@
#define INCX $6
#define Y $7
#define INCY $8
-
+
#define I $2
#define TEMP $3
@@ -61,7 +61,7 @@
#define s2 $f1
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
@@ -306,5 +306,5 @@
#endif
j $31
NOP
-
+
EPILOGUE
diff --git a/kernel/mips64/gemm_beta.S b/kernel/mips64/gemm_beta.S
index 2e0b241..648d1b8 100644
--- a/kernel/mips64/gemm_beta.S
+++ b/kernel/mips64/gemm_beta.S
@@ -62,7 +62,7 @@
#define ALPHA $f15
PROLOGUE
-
+
LDARG C, 0($sp)
MTC $0, FZERO
LDARG LDC, 8($sp)
diff --git a/kernel/mips64/gemm_kernel.S b/kernel/mips64/gemm_kernel.S
index 8ee32d5..1405711 100644
--- a/kernel/mips64/gemm_kernel.S
+++ b/kernel/mips64/gemm_kernel.S
@@ -55,7 +55,7 @@
#define L $7
#define PREFETCHSIZE (4 * 10)
-
+
#define CO1 $14
#define CO2 $15
#define CO3 $16
@@ -109,7 +109,7 @@
#define ALPHA $f15
PROLOGUE
-
+
daddiu $sp, $sp, -160
SDARG $16, 0($sp)
@@ -1028,7 +1028,7 @@
bgtz J, .L10
move B, BO
.align 3
-
+
.L30:
andi J, N, 4
blez J, .L50
diff --git a/kernel/mips64/gemv_n.S b/kernel/mips64/gemv_n.S
index 908f973..dd0b606 100644
--- a/kernel/mips64/gemv_n.S
+++ b/kernel/mips64/gemv_n.S
@@ -89,7 +89,7 @@
PROLOGUE
-
+
LDARG Y, 0($sp)
LDARG INCY, 8($sp)
LDARG BUFFER, 16($sp)
@@ -109,7 +109,7 @@
sdc1 $f21, 24($sp)
sdc1 $f22, 32($sp)
#endif
-
+
blez M, .L999
dsll INCX, INCX, BASE_SHIFT
diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c
index 7db5954..d06b58f 100644
--- a/kernel/mips64/gemv_n_loongson3a.c
+++ b/kernel/mips64/gemv_n_loongson3a.c
@@ -1,6 +1,6 @@
#include "common.h"
-//These are auto-tuning codes on Loongson-3A platform.
+//These are auto-tuning codes on Loongson-3A platform.
//#define prefetch(x) __builtin_prefetch(x)
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
@@ -13,7 +13,7 @@
#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
-int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
BLASLONG kx=0, ky=0;
diff --git a/kernel/mips64/gemv_t.S b/kernel/mips64/gemv_t.S
index 2808756..ae00fea 100644
--- a/kernel/mips64/gemv_t.S
+++ b/kernel/mips64/gemv_t.S
@@ -85,7 +85,7 @@
#define x8 $f20
PROLOGUE
-
+
LDARG Y, 0($sp)
LDARG INCY, 8($sp)
LDARG BUFFER, 16($sp)
@@ -104,7 +104,7 @@
#ifndef __64BIT__
sdc1 $f20, 16($sp)
#endif
-
+
blez M, .L999
dsll INCX, INCX, BASE_SHIFT
@@ -353,9 +353,9 @@
.L19:
LD a1, 0 * SIZE(Y)
- daddu Y, Y, INCY
+ daddu Y, Y, INCY
LD a2, 0 * SIZE(Y)
- daddu Y, Y, INCY
+ daddu Y, Y, INCY
MADD a1, a1, ALPHA, y1
daddiu J, J, -1
@@ -363,11 +363,11 @@
MTC $0, y1
ST a1, 0 * SIZE(YY)
- daddu YY, YY, INCY
+ daddu YY, YY, INCY
ST a2, 0 * SIZE(YY)
bgtz J, .L11
- daddu YY, YY, INCY
+ daddu YY, YY, INCY
.align 3
.L20:
@@ -504,13 +504,13 @@
.L29:
LD a1, 0 * SIZE(Y)
- daddu Y, Y, INCY
+ daddu Y, Y, INCY
MADD a1, a1, ALPHA, y1
NOP
ST a1, 0 * SIZE(YY)
- daddu YY, YY, INCY
+ daddu YY, YY, INCY
.align 3
.L999:
diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c
index 51f035d..a6b4154 100644
--- a/kernel/mips64/gemv_t_loongson3a.c
+++ b/kernel/mips64/gemv_t_loongson3a.c
@@ -1,6 +1,6 @@
#include "common.h"
-//These are auto-tuning codes on Loongson-3A platform.
+//These are auto-tuning codes on Loongson-3A platform.
//#define prefetch(x) __builtin_prefetch(x)
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
diff --git a/kernel/mips64/iamax.S b/kernel/mips64/iamax.S
index ff6c215..61e3514 100644
--- a/kernel/mips64/iamax.S
+++ b/kernel/mips64/iamax.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $3
#define TEMP $7
@@ -69,9 +69,9 @@
#define x2 $8
#define x3 $9
#define x4 $10
-
+
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/iamin.S b/kernel/mips64/iamin.S
index 131aa88..ff05b99 100644
--- a/kernel/mips64/iamin.S
+++ b/kernel/mips64/iamin.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $3
#define TEMP $7
@@ -69,9 +69,9 @@
#define x2 $8
#define x3 $9
#define x4 $10
-
+
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/imax.S b/kernel/mips64/imax.S
index ec9d3fc..e0d358b 100644
--- a/kernel/mips64/imax.S
+++ b/kernel/mips64/imax.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $3
#define TEMP $7
@@ -69,9 +69,9 @@
#define x2 $8
#define x3 $9
#define x4 $10
-
+
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/imin.S b/kernel/mips64/imin.S
index a247c83..b41f766 100644
--- a/kernel/mips64/imin.S
+++ b/kernel/mips64/imin.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $3
#define TEMP $7
@@ -69,9 +69,9 @@
#define x2 $8
#define x3 $9
#define x4 $10
-
+
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/izamax.S b/kernel/mips64/izamax.S
index 12e26c9..c7c8a5b 100644
--- a/kernel/mips64/izamax.S
+++ b/kernel/mips64/izamax.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $3
#define TEMP $7
@@ -73,9 +73,9 @@
#define x2 $8
#define x3 $9
#define x4 $10
-
+
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
@@ -92,7 +92,7 @@
FABS t1, a1
FABS t2, a2
-
+
ADD s1, t1, t2
ADD s2, t1, t2
ADD s3, t1, t2
diff --git a/kernel/mips64/izamin.S b/kernel/mips64/izamin.S
index af3d750..e65ac85 100644
--- a/kernel/mips64/izamin.S
+++ b/kernel/mips64/izamin.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $3
#define TEMP $7
@@ -73,9 +73,9 @@
#define x2 $8
#define x3 $9
#define x4 $10
-
+
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
@@ -92,7 +92,7 @@
FABS t1, a1
FABS t2, a2
-
+
ADD s1, t1, t2
ADD s2, t1, t2
ADD s3, t1, t2
diff --git a/kernel/mips64/max.S b/kernel/mips64/max.S
index a432f12..0616c92 100644
--- a/kernel/mips64/max.S
+++ b/kernel/mips64/max.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -61,7 +61,7 @@
#define s4 $f3
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/min.S b/kernel/mips64/min.S
index 33cfc81..cf2e24b 100644
--- a/kernel/mips64/min.S
+++ b/kernel/mips64/min.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -61,7 +61,7 @@
#define s4 $f3
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/rot.S b/kernel/mips64/rot.S
index b94a59c..c72d381 100644
--- a/kernel/mips64/rot.S
+++ b/kernel/mips64/rot.S
@@ -44,7 +44,7 @@
#define INCX $6
#define Y $7
#define INCY $8
-
+
#define XX $9
#define YY $10
@@ -70,7 +70,7 @@
#define t4 $f3
PROLOGUE
-
+
dsll INCX, INCX, BASE_SHIFT
li TEMP, SIZE
diff --git a/kernel/mips64/scal.S b/kernel/mips64/scal.S
index f544914..b28b8a3 100644
--- a/kernel/mips64/scal.S
+++ b/kernel/mips64/scal.S
@@ -66,7 +66,7 @@
#define t4 $f11
PROLOGUE
-
+
li TEMP, SIZE
MTC $0, a1
@@ -166,7 +166,7 @@
NOP
.align 3
-.L50:
+.L50:
bne INCX, TEMP, .L60
dsra I, N, 3
@@ -397,7 +397,7 @@
LD a1, 0 * SIZE(X)
MUL t1, ALPHA, a1
-
+
daddiu I, I, -1
ST t1, 0 * SIZE(X)
diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S
index bc81d0e..37b20a8 100644
--- a/kernel/mips64/sgemm_kernel_8x4_ps.S
+++ b/kernel/mips64/sgemm_kernel_8x4_ps.S
@@ -80,7 +80,7 @@
#define F27 27
#define F26 26
#define F25 25
-#define F24 24
+#define F24 24
#define F23 23
#define F22 22
#define F21 21
@@ -88,7 +88,7 @@
#define F19 19
#define F18 18
#define F17 17
-#define F16 16
+#define F16 16
#define F15 15
#define F14 14
#define F13 13
@@ -100,10 +100,10 @@
#define F7 7
#define F6 6
#define F5 5
-#define F4 4
-#define F3 3
-#define F2 2
-#define F1 1
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
#define F0 0
#define R12 12
@@ -132,7 +132,7 @@
# .fmask 0x00000000,0
# .set noreorder
# .set nomacro
-
+
PROLOGUE
@@ -213,12 +213,12 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
dsll PREB, K, BASE_SHIFT
MOV C21, C11
MOV C22, C11
-
+
MOV C31, C11
MOV C32, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
@@ -235,13 +235,13 @@
FETCH $0, 0 * SIZE(CO1)
MOV C24, C11
FETCH $0, 4 * SIZE(CO1)
-
+
MOV C33, C11
FETCH $0, 0 * SIZE(CO2)
MOV C34, C11
FETCH $0, 4 * SIZE(CO2)
-
- daddu PREB, B, PREB
+
+ daddu PREB, B, PREB
MOV C43, C11
FETCH $0, 0 * SIZE(CO3)
@@ -271,12 +271,12 @@
dsra L, K, 6 # UnRoll K=64
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
dsll PREB, K, BASE_SHIFT
MOV C21, C11
MOV C22, C11
-
+
MOV C31, C11
MOV C32, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
@@ -293,13 +293,13 @@
FETCH $0, 0 * SIZE(CO1)
MOV C24, C11
FETCH $0, 4 * SIZE(CO1)
-
+
MOV C33, C11
FETCH $0, 0 * SIZE(CO2)
MOV C34, C11
FETCH $0, 4 * SIZE(CO2)
-
- daddu PREB, B, PREB
+
+ daddu PREB, B, PREB
MOV C43, C11
FETCH $0, 0 * SIZE(CO3)
@@ -435,7 +435,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -571,7 +571,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -707,7 +707,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -843,7 +843,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -979,7 +979,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -1115,7 +1115,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -1251,7 +1251,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -1387,7 +1387,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -1523,7 +1523,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -1659,7 +1659,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -1795,7 +1795,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -1931,7 +1931,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -2067,7 +2067,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -2203,7 +2203,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -2339,7 +2339,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -2475,7 +2475,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -2622,7 +2622,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -2758,7 +2758,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -2894,7 +2894,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -3030,7 +3030,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -3166,7 +3166,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -3302,7 +3302,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -3438,7 +3438,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -3574,7 +3574,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -3721,7 +3721,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -3857,7 +3857,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -3993,7 +3993,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -4129,7 +4129,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -4148,7 +4148,7 @@
.align 4
-.L484:
+.L484:
#ifndef TRMMKERNEL
andi L, K, 8
#else
@@ -4276,7 +4276,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -4412,7 +4412,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -4559,7 +4559,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -4640,7 +4640,7 @@
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
- daddiu PREB, PREB, 8 * SIZE
+ daddiu PREB, PREB, 8 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
@@ -4721,7 +4721,7 @@
CVTU A8, C41 # A8=C41.upper=c28
LD B6, 5 * SIZE(CO2)
- MADD A1, B1, A1, ALPHA # c12
+ MADD A1, B1, A1, ALPHA # c12
LD B7, 7 * SIZE(CO1)
MADD A2, B2, A2, ALPHA # c22
@@ -4732,7 +4732,7 @@
MADD A4, B4, A4, ALPHA # c24
LD B3, 0 * SIZE(CO2)
-
+
MADD A5, B5, A5, ALPHA # c16
LD B4, 2 * SIZE(CO1)
@@ -4759,7 +4759,7 @@
MADD C31, B6, C31, ALPHA # c16
LD A2, 6 * SIZE(CO2)
-
+
MADD C33, B7, C33, ALPHA # c26
ST A4, 3 * SIZE(CO2)
@@ -4773,7 +4773,7 @@
MADD C43, A2, C43, ALPHA # c28
ST C13, 0 * SIZE(CO2)
-
+
ST C21, 2 * SIZE(CO1)
ST C23, 2 * SIZE(CO2)
ST C31, 4 * SIZE(CO1)
@@ -4801,58 +4801,58 @@
CVTU A7, C44 # B7=C42.upper=c48
LD B6, 5 * SIZE(CO4)
- CVTU A8, C42 # A1=C44.upper=c38
+ CVTU A8, C42 # A1=C44.upper=c38
LD B7, 7 * SIZE(CO3)
MADD A1, B1, A1, ALPHA # c31
LD C11, 7 * SIZE(CO4)
- MADD A2, B2, A2, ALPHA
+ MADD A2, B2, A2, ALPHA
LD C13, 0 * SIZE(CO3)
-
- MADD A3, B3, A3, ALPHA
+
+ MADD A3, B3, A3, ALPHA
LD C21, 0 * SIZE(CO4)
-
- MADD A4, B4, A4, ALPHA
+
+ MADD A4, B4, A4, ALPHA
LD C23, 2 * SIZE(CO3)
- MADD A5, B5, A5, ALPHA
+ MADD A5, B5, A5, ALPHA
LD C31, 2 * SIZE(CO4)
-
- MADD A6, B6, A6, ALPHA
+
+ MADD A6, B6, A6, ALPHA
LD C33, 4 * SIZE(CO3)
-
- MADD A7, B7, A7, ALPHA
+
+ MADD A7, B7, A7, ALPHA
LD C41, 4 * SIZE(CO4)
-
- MADD A8, C11, A8, ALPHA
+
+ MADD A8, C11, A8, ALPHA
ST A1, 1 * SIZE(CO3)
- MADD C12, C13, C12, ALPHA
+ MADD C12, C13, C12, ALPHA
LD C43, 6 * SIZE(CO3)
-
- MADD C14, C21, C14, ALPHA
+
+ MADD C14, C21, C14, ALPHA
ST A2, 1 * SIZE(CO4)
- MADD C22, C23, C22, ALPHA
+ MADD C22, C23, C22, ALPHA
LD B1, 6 * SIZE(CO4)
-
- MADD C24, C31, C24, ALPHA
+
+ MADD C24, C31, C24, ALPHA
ST A3, 3 * SIZE(CO3)
- MADD C32, C33, C32, ALPHA
+ MADD C32, C33, C32, ALPHA
ST A4, 3 * SIZE(CO4)
- MADD C34, C41, C34, ALPHA
+ MADD C34, C41, C34, ALPHA
ST A5, 5 * SIZE(CO3)
- MADD C42, C43, C42, ALPHA
+ MADD C42, C43, C42, ALPHA
ST A6, 5 * SIZE(CO4)
ST A7, 7 * SIZE(CO3)
NOP
- MADD C44, B1, C44, ALPHA
+ MADD C44, B1, C44, ALPHA
ST A8, 7 * SIZE(CO4)
ST C12, 0 * SIZE(CO3)
@@ -4880,7 +4880,7 @@
CVTU A7, C43 # A7=C43.upper=c18
CVTU A8, C41 # A8=C41.upper=c28
- MUL A1, A1, ALPHA # c12
+ MUL A1, A1, ALPHA # c12
MUL A2, A2, ALPHA # c22
MUL A3, A3, ALPHA # c14
MUL A4, A4, ALPHA # c24
@@ -4903,7 +4903,7 @@
MUL C31, C31, ALPHA # c16
ST A5, 5 * SIZE(CO1)
-
+
MUL C33, C33, ALPHA # c26
ST A6, 5 * SIZE(CO2)
@@ -4921,7 +4921,7 @@
CVTU A3, C24 # B3=C22.upper=c44
ST C21, 2 * SIZE(CO1)
-
+
CVTU A4, C22 # B4=C24.upper=c34
ST C23, 2 * SIZE(CO2)
@@ -4934,40 +4934,40 @@
CVTU A7, C44 # B7=C42.upper=c48
ST C41, 6 * SIZE(CO1)
- CVTU A8, C42 # A1=C44.upper=c38
+ CVTU A8, C42 # A1=C44.upper=c38
ST C43, 6 * SIZE(CO2)
MUL A1, A1, ALPHA # c31
- MUL A2, A2, ALPHA
- MUL A3, A3, ALPHA
- MUL A4, A4, ALPHA
- MUL A5, A5, ALPHA
- MUL A6, A6, ALPHA
- MUL A7, A7, ALPHA
- MUL A8, A8, ALPHA
-
- MUL C12, C12, ALPHA
+ MUL A2, A2, ALPHA
+ MUL A3, A3, ALPHA
+ MUL A4, A4, ALPHA
+ MUL A5, A5, ALPHA
+ MUL A6, A6, ALPHA
+ MUL A7, A7, ALPHA
+ MUL A8, A8, ALPHA
+
+ MUL C12, C12, ALPHA
ST A1, 1 * SIZE(CO3)
- MUL C14, C14, ALPHA
+ MUL C14, C14, ALPHA
ST A2, 1 * SIZE(CO4)
- MUL C22, C22, ALPHA
+ MUL C22, C22, ALPHA
ST A3, 3 * SIZE(CO3)
- MUL C24, C24, ALPHA
+ MUL C24, C24, ALPHA
ST A4, 3 * SIZE(CO4)
- MUL C32, C32, ALPHA
+ MUL C32, C32, ALPHA
ST A5, 5 * SIZE(CO3)
- MUL C34, C34, ALPHA
+ MUL C34, C34, ALPHA
ST A6, 5 * SIZE(CO4)
- MUL C42, C42, ALPHA
+ MUL C42, C42, ALPHA
ST A7, 7 * SIZE(CO3)
- MUL C44, C44, ALPHA
+ MUL C44, C44, ALPHA
ST A8, 7 * SIZE(CO4)
ST C12, 0 * SIZE(CO3)
@@ -5025,12 +5025,12 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
dsll PREB, K, BASE_SHIFT
MOV C21, C11
MOV C22, C11
-
+
MOV C31, C11
MOV C32, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
@@ -5045,12 +5045,12 @@
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
MOV C24, C11
-
+
MOV C33, C11
FETCH $0, 0 * SIZE(CO2)
MOV C34, C11
-
- daddu PREB, B, PREB
+
+ daddu PREB, B, PREB
MOV C43, C11
FETCH $0, 0 * SIZE(CO3)
@@ -5077,12 +5077,12 @@
dsra L, K, 2 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
dsll PREB, K, BASE_SHIFT
MOV C21, C11
MOV C22, C11
-
+
MOV C31, C11
MOV C32, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
@@ -5097,12 +5097,12 @@
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
MOV C24, C11
-
+
MOV C33, C11
FETCH $0, 0 * SIZE(CO2)
MOV C34, C11
-
- daddu PREB, B, PREB
+
+ daddu PREB, B, PREB
MOV C43, C11
FETCH $0, 0 * SIZE(CO3)
@@ -5114,7 +5114,7 @@
PLU B4, B2, B2
#endif
-.L4410: #
+.L4410: #
daddiu L, L, -1
MADPS C11, C11, A1, B1
gsLQC1(R13, F13, F12, 1) # B3 B4
@@ -5196,7 +5196,7 @@
MADPS C13, C13, A7, B7
daddiu PREA, PREA, 16 * SIZE
MADPS C23, C23, A8, B7
- daddiu PREB, PREB, 16 * SIZE
+ daddiu PREB, PREB, 16 * SIZE
MADPS C14, C14, A7, B8
MADPS C24, C24, A8, B8
@@ -5303,7 +5303,7 @@
LD B4, 3 * SIZE(CO2)
- MADD A1, B1, A1, ALPHA # c12
+ MADD A1, B1, A1, ALPHA # c12
LD B5, 0 * SIZE(CO1)
MADD A2, B2, A2, ALPHA # c22
@@ -5314,7 +5314,7 @@
MADD A4, B4, A4, ALPHA # c24
LD B1, 2 * SIZE(CO2)
-
+
MADD C11, B5, C11, ALPHA # c12
ST A1, 1 * SIZE(CO1)
@@ -5347,25 +5347,25 @@
MADD A1, B1, A1, ALPHA # c31
LD A5, 0 * SIZE(CO3)
- MADD A2, B2, A2, ALPHA
+ MADD A2, B2, A2, ALPHA
LD A6, 0 * SIZE(CO4)
-
- MADD A3, B3, A3, ALPHA
+
+ MADD A3, B3, A3, ALPHA
LD A7, 2 * SIZE(CO3)
-
- MADD A4, B4, A4, ALPHA
+
+ MADD A4, B4, A4, ALPHA
LD A8, 2 * SIZE(CO4)
- MADD C12, A5, C12, ALPHA
+ MADD C12, A5, C12, ALPHA
ST A1, 1 * SIZE(CO3)
- MADD C14, A6, C14, ALPHA
+ MADD C14, A6, C14, ALPHA
ST A2, 1 * SIZE(CO4)
- MADD C22, A7, C22, ALPHA
+ MADD C22, A7, C22, ALPHA
ST A3, 3 * SIZE(CO3)
-
- MADD C24, A8, C24, ALPHA
+
+ MADD C24, A8, C24, ALPHA
ST A4, 3 * SIZE(CO4)
ST C12, 0 * SIZE(CO3)
@@ -5384,11 +5384,11 @@
CVTU A3, C23 # A3=C23.upper=c14
CVTU A4, C21 # A4=C21.upper=c24
- MUL A1, A1, ALPHA # c12
+ MUL A1, A1, ALPHA # c12
MUL A2, A2, ALPHA # c22
MUL A3, A3, ALPHA # c14
MUL A4, A4, ALPHA # c24
-
+
MUL C11, C11, ALPHA # c12
ST A1, 1 * SIZE(CO1)
@@ -5409,25 +5409,25 @@
CVTU A7, C24 # B3=C22.upper=c44
ST C21, 2 * SIZE(CO1)
-
+
CVTU A8, C22 # B4=C24.upper=c34
ST C23, 2 * SIZE(CO2)
MUL A5, A5, ALPHA # c31
- MUL A6, A6, ALPHA
- MUL A7, A7, ALPHA
- MUL A8, A8, ALPHA
+ MUL A6, A6, ALPHA
+ MUL A7, A7, ALPHA
+ MUL A8, A8, ALPHA
- MUL C12, C12, ALPHA
+ MUL C12, C12, ALPHA
ST A5, 1 * SIZE(CO3)
- MUL C14, C14, ALPHA
+ MUL C14, C14, ALPHA
ST A6, 1 * SIZE(CO4)
- MUL C22, C22, ALPHA
+ MUL C22, C22, ALPHA
ST A7, 3 * SIZE(CO3)
-
- MUL C24, C24, ALPHA
+
+ MUL C24, C24, ALPHA
ST A8, 3 * SIZE(CO4)
ST C12, 0 * SIZE(CO3)
@@ -5478,11 +5478,11 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
MOV C21, C11
MOV C22, C11
-
+
MOV C31, C11
MOV C32, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
@@ -5497,11 +5497,11 @@
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
MOV C24, C11
-
+
MOV C33, C11
FETCH $0, 0 * SIZE(CO2)
MOV C34, C11
-
+
MOV C43, C11
FETCH $0, 0 * SIZE(CO3)
@@ -5527,11 +5527,11 @@
dsra L, K, 2 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
MOV C21, C11
MOV C22, C11
-
+
MOV C31, C11
MOV C32, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
@@ -5546,11 +5546,11 @@
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
MOV C24, C11
-
+
MOV C33, C11
FETCH $0, 0 * SIZE(CO2)
MOV C34, C11
-
+
MOV C43, C11
FETCH $0, 0 * SIZE(CO3)
@@ -5669,7 +5669,7 @@
CVTU A2, C11 # A2=C11.upper=c22
LD B2, 1 * SIZE(CO2)
- MADD A1, B1, A1, ALPHA # c12
+ MADD A1, B1, A1, ALPHA # c12
LD B5, 0 * SIZE(CO1)
MADD A2, B2, A2, ALPHA # c22
@@ -5693,13 +5693,13 @@
MADD A1, B1, A1, ALPHA # c31
LD A5, 0 * SIZE(CO3)
- MADD A2, B2, A2, ALPHA
+ MADD A2, B2, A2, ALPHA
LD A6, 0 * SIZE(CO4)
-
- MADD C12, A5, C12, ALPHA
+
+ MADD C12, A5, C12, ALPHA
ST A1, 1 * SIZE(CO3)
- MADD C14, A6, C14, ALPHA
+ MADD C14, A6, C14, ALPHA
ST A2, 1 * SIZE(CO4)
ST C12, 0 * SIZE(CO3)
@@ -5713,7 +5713,7 @@
CVTU A1, C13 # A1=C13.upper=c12
CVTU A2, C11 # A2=C11.upper=c22
- MUL A1, A1, ALPHA # c12
+ MUL A1, A1, ALPHA # c12
MUL A2, A2, ALPHA # c22
MUL C11, C11, ALPHA # c12
@@ -5725,13 +5725,13 @@
MUL A3, A3, ALPHA # c31
ST A1, 1 * SIZE(CO1)
- MUL A4, A4, ALPHA
+ MUL A4, A4, ALPHA
ST A2, 1 * SIZE(CO2)
- MUL C12, C12, ALPHA
+ MUL C12, C12, ALPHA
ST C11, 0 * SIZE(CO1)
-
- MUL C14, C14, ALPHA
+
+ MUL C14, C14, ALPHA
ST C13, 0 * SIZE(CO2)
ST A3, 1 * SIZE(CO3)
@@ -5784,7 +5784,7 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD B1, 0 * SIZE(BO)
MOV C21, C11
@@ -5805,10 +5805,10 @@
MOV C23, C11
MOV C24, C11
-
+
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
MOV C44, C11
#if (defined(LEFT) && !defined(TRANSA))||\
@@ -5827,7 +5827,7 @@
dsra L, K, 2 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD B1, 0 * SIZE(BO)
MOV C21, C11
@@ -5848,10 +5848,10 @@
MOV C23, C11
MOV C24, C11
-
+
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
blez L, .L412
MOV C44, C11
@@ -5860,7 +5860,7 @@
.L4110:
daddiu L, L, -1
LD A2, 1 * SIZE(AO)
-
+
MADD C11, C11, A1, B1
LD B5, 4 * SIZE(BO)
@@ -5875,7 +5875,7 @@
LD A3, 2 * SIZE(AO)
NOP
-
+
MADD C11, C11, A2, B5
LD B1, 8 * SIZE(BO)
@@ -5890,7 +5890,7 @@
LD A4, 3 * SIZE(AO)
daddiu AO, AO, 4 * SIZE
-
+
MADD C11, C11, A3, B1
LD B5, 12 * SIZE(BO)
@@ -5930,7 +5930,7 @@
LD A2, 1 * SIZE(AO)
daddiu AO, AO, 2 * SIZE
-
+
MADD C11, C11, A1, B1
LD B5, 4 * SIZE(BO)
@@ -5945,7 +5945,7 @@
LD A1, 0 * SIZE(AO)
daddiu BO, BO, 8 * SIZE
-
+
MADD C11, C11, A2, B5
LD B1, 0 * SIZE(BO)
@@ -6046,7 +6046,7 @@
.align 4
.L2: # Nr=2
- andi J, N, 2
+ andi J, N, 2
blez J, .L1
NOP
@@ -6078,7 +6078,7 @@
MTC $0, C11 # CLEAR REAULTS REGISTERS
LD A1, 0 * SIZE(AO)
- MOV C12, C11
+ MOV C12, C11
LD A2, 1 * SIZE(AO)
MOV C21, C11
@@ -6107,10 +6107,10 @@
MOV C23, C11
MOV C24, C11
-
+
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
MOV C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -6131,7 +6131,7 @@
MTC $0, C11 # CLEAR REAULTS REGISTERS
LD A1, 0 * SIZE(AO)
- MOV C12, C11
+ MOV C12, C11
LD A2, 1 * SIZE(AO)
MOV C21, C11
@@ -6160,10 +6160,10 @@
MOV C23, C11
MOV C24, C11
-
+
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
blez L, .L282
MOV C44, C11
@@ -6293,8 +6293,8 @@
LD A8, 7 * SIZE(CO1)
MADD A1, A1, C11, ALPHA
- LD B1, 0 * SIZE(CO2)
-
+ LD B1, 0 * SIZE(CO2)
+
MADD A2, A2, C21, ALPHA
LD B2, 1 * SIZE(CO2)
@@ -6439,7 +6439,7 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -6464,7 +6464,7 @@
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
MOV C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -6483,7 +6483,7 @@
dsra L, K, 1 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -6508,7 +6508,7 @@
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
blez L, .L242
MOV C44, C11
@@ -6593,8 +6593,8 @@
LD A4, 3 * SIZE(CO1)
MADD A1, A1, C11, ALPHA
- LD B1, 0 * SIZE(CO2)
-
+ LD B1, 0 * SIZE(CO2)
+
MADD A2, A2, C21, ALPHA
LD B2, 1 * SIZE(CO2)
@@ -6687,7 +6687,7 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -6720,7 +6720,7 @@
dsra L, K, 1 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -6797,8 +6797,8 @@
LD A2, 1 * SIZE(CO1)
MADD A1, A1, C11, ALPHA
- LD B1, 0 * SIZE(CO2)
-
+ LD B1, 0 * SIZE(CO2)
+
MADD A2, A2, C21, ALPHA
LD B2, 1 * SIZE(CO2)
@@ -6867,7 +6867,7 @@
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -6899,7 +6899,7 @@
dsra L, K, 1 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -6963,8 +6963,8 @@
LD A1, 0 * SIZE(CO1)
MADD A1, A1, C11, ALPHA
- LD B1, 0 * SIZE(CO2)
-
+ LD B1, 0 * SIZE(CO2)
+
MADD B1, B1, C12, ALPHA
ST A1, 0 * SIZE(CO1)
@@ -7044,7 +7044,7 @@
MTC $0, C11 # CLEAR REAULTS REGISTERS
LD A1, 0 * SIZE(AO)
- MOV C12, C11
+ MOV C12, C11
LD A2, 1 * SIZE(AO)
MOV C21, C11
@@ -7072,10 +7072,10 @@
MOV C23, C11
MOV C24, C11
-
+
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
MOV C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -7096,7 +7096,7 @@
MTC $0, C11 # CLEAR REAULTS REGISTERS
LD A1, 0 * SIZE(AO)
- MOV C12, C11
+ MOV C12, C11
LD A2, 1 * SIZE(AO)
MOV C21, C11
@@ -7124,10 +7124,10 @@
MOV C23, C11
MOV C24, C11
-
+
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
blez L, .L182
MOV C44, C11
@@ -7315,7 +7315,7 @@
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -7339,7 +7339,7 @@
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
MOV C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -7358,7 +7358,7 @@
dsra L, K, 1 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -7382,7 +7382,7 @@
MOV C33, C11
MOV C34, C11
-
+
MOV C43, C11
blez L, .L142
MOV C44, C11
@@ -7511,7 +7511,7 @@
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -7544,7 +7544,7 @@
dsra L, K, 1 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -7660,7 +7660,7 @@
daddu BO, B, L
#endif
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -7686,7 +7686,7 @@
dsra L, K, 1 # UnRoll K=4
MTC $0, C11 # CLEAR REAULTS REGISTERS
- MOV C12, C11
+ MOV C12, C11
LD A1, 0 * SIZE(AO)
MOV C21, C11
@@ -7739,13 +7739,13 @@
LD A1, 0 * SIZE(C)
MADD A1, A1, C11, ALPHA
-
+
ST A1, 0 * SIZE(C)
daddiu C, C, 1 * SIZE
#else
MUL A1, C11, ALPHA
-
+
ST A1, 0 * SIZE(C)
daddiu C, C, 1 * SIZE
diff --git a/kernel/mips64/sgemm_kernel_loongson3a_4x4.S b/kernel/mips64/sgemm_kernel_loongson3a_4x4.S
index 4a8c9b0..10c5f47 100644
--- a/kernel/mips64/sgemm_kernel_loongson3a_4x4.S
+++ b/kernel/mips64/sgemm_kernel_loongson3a_4x4.S
@@ -110,7 +110,7 @@
#define F27 27
#define F26 26
#define F25 25
-#define F24 24
+#define F24 24
#define F23 23
#define F22 22
#define F21 21
@@ -118,7 +118,7 @@
#define F19 19
#define F18 18
#define F17 17
-#define F16 16
+#define F16 16
#define F15 15
#define F14 14
#define F13 13
@@ -130,14 +130,14 @@
#define F7 7
#define F6 6
#define F5 5
-#define F4 4
-#define F3 3
-#define F2 2
-#define F1 1
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
#define F0 0
PROLOGUE
-
+
daddiu $sp, $sp, -160
sd $16, 0($sp)
sd $17, 8($sp)
@@ -160,7 +160,7 @@
ST $f23,144($sp)
- .align 5
+ .align 5
.L0_N4: # Loop N
ST ALPHA,152($sp) # Backup ALPHA
move MCO,M # Backup M
@@ -170,26 +170,26 @@
move AO,A # Backup A_addr
dsra N,NCO,2 # N=NCO/2
-
+
dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
-
+
#if defined(TRMMKERNEL)
- LDARG OFFSET,160($sp) # OFFSET is relate to the data part
+ LDARG OFFSET,160($sp) # OFFSET is relate to the data part
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
- neg KK,OFFSET
+ neg KK,OFFSET
#endif
-
+
move BO,B # Backup B_addr
beq N,$0,.L0_N2 # N=0,NCO<4
dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte
.L0_N4_Lb: # mr=4,nr=4
- move CO1,C
+ move CO1,C
dsra M,MCO,2 # M=MCO/2
-
+
move A,AO # Reset A
daddu CO2,C,LDC
@@ -200,7 +200,7 @@
daddu CO4,CO3,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
- move KK,OFFSET
+ move KK,OFFSET
#endif
beqz M,.L14_M2
daddu C,CO4,LDC # move C to next panel Cj
@@ -227,18 +227,18 @@
MOV t41,t11
MOV t12,t11
LD b0,0(B)
-
+
MOV t22,t11
MOV t32,t11
LD b1,1*SIZE(B)
MOV t42,t11
LD a2,2*SIZE(A)
-
+
MOV t13,t11
MOV t23,t11
LD b2,2*SIZE(B)
-
+
MOV t33,t11
MOV t43,t11
LD a3,3*SIZE(A)
@@ -250,7 +250,7 @@
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK # temp is the length of the data part
#elif defined(LEFT)
- daddiu TEMP, KK, 4 # S=L,U=L
+ daddiu TEMP, KK, 4 # S=L,U=L
#else
daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part
#endif
@@ -259,7 +259,7 @@
beqz K,.L15
MOV t44,t11
-#else
+#else
move B,BO # Reset B
MTC $0,t11 # GEMM part NR=4,MR=4
LD a0,0(A)
@@ -271,7 +271,7 @@
MOV t41,t11
MOV t12,t11
LD b0,0(B)
-
+
MOV t22,t11
MOV t32,t11
LD b1,1*SIZE(B)
@@ -279,11 +279,11 @@
MOV t42,t11
dsra K,KCO,2 # K=KCO/2
LD a2,2*SIZE(A)
-
+
MOV t13,t11
MOV t23,t11
LD b2,2*SIZE(B)
-
+
MOV t33,t11
MOV t43,t11
LD a3,3*SIZE(A)
@@ -296,7 +296,7 @@
beqz K,.L15
MOV t44,t11 # clear 16 results registers
#endif
-
+
.align 5
.L11: # kr=4
MADD t11,t11,a0,b0
@@ -306,29 +306,29 @@
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
LD a5,5*SIZE(A)
-
+
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
LD b4,4*SIZE(B)
-
+
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
LD b5,5*SIZE(B)
FETCH $0,(PREB)
-
+
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
LD a6,6*SIZE(A)
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
LD b6,6*SIZE(B)
FETCH $0,(PREA)
-
+
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
LD a7,7*SIZE(A)
-
+
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
LD b7,7*SIZE(B)
@@ -447,14 +447,14 @@
.L15: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP, 2
#endif
beqz K,.L18
nop
-.L16:
+.L16:
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
LD a4,4*SIZE(A)
@@ -528,16 +528,16 @@
daddu PREB,PREB,8*SIZE
LD b3,3*SIZE(B)
-
+
.L18: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L19
+ beqz K,.L19
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREB)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -569,8 +569,8 @@
MADD t44,t44,a3,b3
.L19: # Write Back to C
-#ifndef TRMMKERNEL
- LD c11,0(CO1) # GEMM write part
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write part
LD c21,1*SIZE(CO1) # get 16 C
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -640,11 +640,11 @@
daddu CO3,CO3,4*SIZE
ST t44,3*SIZE(CO4)
daddu PREB,BO,SPANB
-
- bnez M,.L10
+
+ bnez M,.L10
daddu CO4,CO4,4*SIZE
-#else
+#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
@@ -685,7 +685,7 @@
daddiu CO1,CO1, 4 * SIZE
daddiu CO2,CO2, 4 * SIZE
daddiu CO3,CO3, 4 * SIZE
- daddiu CO4,CO4, 4 * SIZE
+ daddiu CO4,CO4, 4 * SIZE
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
@@ -698,7 +698,7 @@
FETCH $0,0(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- dsubu TEMP,KCO,KK
+ dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP, -4
#else
@@ -710,10 +710,10 @@
daddu B,B,TEMP # mov B to the end of panel Bj
#endif
-#ifdef LEFT
+#ifdef LEFT
daddiu KK, KK,4
#endif
- bnez M,.L10
+ bnez M,.L10
nop
#endif
@@ -721,7 +721,7 @@
.align 3
.L14_M2:
andi M, MCO, 2 # nr=4,mr=2
- beqz M,.L14_M1
+ beqz M,.L14_M1
nop
.L20:
@@ -729,7 +729,7 @@
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
- dsll K,KK,1 + BASE_SHIFT # mr=2
+ dsll K,KK,1 + BASE_SHIFT # mr=2
dsll TEMP,KK,2 + BASE_SHIFT # nr=4
daddu A,A,K
daddu B,BO,TEMP
@@ -738,7 +738,7 @@
LD a0,0*SIZE(A)
MTC $0,t11
LD a1,1*SIZE(A)
-
+
MOV t21,t11
LD b0,0*SIZE(B)
MOV t12,t11
@@ -764,18 +764,18 @@
MOV t24,t11 # clear 2*4=8 results registers
#else
- move B,BO # Reset B
+ move B,BO # Reset B
LD a0,0*SIZE(A)
MTC $0,t11
LD a1,1*SIZE(A)
-
+
MOV t21,t11
LD b0,0*SIZE(B)
MOV t12,t11
LD b1,1*SIZE(B)
MOV t22,t11
- dsra K,KCO,2
+ dsra K,KCO,2
LD b2,2*SIZE(B)
MOV t13,t11
@@ -806,7 +806,7 @@
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
-
+
MADD t11,t11,a4,b4
LD a2,4*SIZE(A)
MADD t21,t21,a5,b4
@@ -866,7 +866,7 @@
MADD t24,t24,a7,b7
-.L25:
+.L25:
#ifndef TRMMKERNEL
andi K,KCO,2 # kr=2
#else
@@ -875,7 +875,7 @@
beqz K,.L28
nop
-.L26:
+.L26:
MADD t11,t11,a0,b0
LD a4,2*SIZE(A)
MADD t21,t21,a1,b0
@@ -890,7 +890,7 @@
LD b6,6*SIZE(B)
MADD t23,t23,a1,b2
LD b7,7*SIZE(B)
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
daddu A,A,4*SIZE # 2mr*2kr
@@ -915,16 +915,16 @@
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
-
-.L28: # kr=1
+
+.L28: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L29
+ beqz K,.L29
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # 2mr*kr
@@ -942,11 +942,11 @@
.L29: # Write Back to C
#ifndef TRMMKERNEL
LD c11,0(CO1) # GEMM write back part
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
-
+
LD c13,0(CO3)
MADD t11,c11,t11,ALPHA
LD c23,1*SIZE(CO3)
@@ -985,25 +985,25 @@
#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
-
+
ST t11, 0 * SIZE(CO1)
MUL t12, ALPHA, t12
ST t21, 1 * SIZE(CO1)
MUL t22, ALPHA, t22
-
+
ST t12, 0 * SIZE(CO2)
MUL t13, ALPHA, t13
ST t22, 1 * SIZE(CO2)
MUL t23, ALPHA, t23
-
+
ST t13, 0 * SIZE(CO3)
MUL t14, ALPHA, t14
ST t23, 1 * SIZE(CO3)
MUL t24, ALPHA, t24
-
+
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
-
+
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
daddiu CO3,CO3, 2 * SIZE
@@ -1036,7 +1036,7 @@
.align 3
.L14_M1:
- andi M,MCO,1 # mr=1
+ andi M,MCO,1 # mr=1
beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
nop
@@ -1056,13 +1056,13 @@
MTC $0,t11
LD b0,0*SIZE(B)
-
+
MOV t12,t11
LD b1,1*SIZE(B)
MOV t13,t11
LD b2,2*SIZE(B)
-
+
MOV t14,t11
LD b3,3*SIZE(B)
@@ -1077,35 +1077,35 @@
nop
beqz K,.L35
nop
-
-#else
+
+#else
move B,BO # Reset B, GEMM part
dsra K,KCO,2 # K=KCO/2
LD a0, 0 * SIZE(A) # a0
MTC $0,t11
LD b0,0*SIZE(B)
-
+
MOV t12,t11
LD b1,1*SIZE(B)
MOV t13,t11
LD b2,2*SIZE(B)
-
+
MOV t14,t11
beqz K,.L35
LD b3,3*SIZE(B)
#endif
-.L31: # nr=4,mr=1,kr=4
+.L31: # nr=4,mr=1,kr=4
LD a1, 1*SIZE(A) # load a1
MADD t11,t11,a0,b0
-
+
LD b4,4*SIZE(B)
LD b5,5*SIZE(B)
MADD t12,t12,a0,b1
-
+
LD b6,6*SIZE(B)
LD b7,7*SIZE(B)
MADD t13,t13,a0,b2
@@ -1113,11 +1113,11 @@
LD a2, 2*SIZE(A) # a2
MADD t11,t11,a1,b4
-
+
LD b0,8*SIZE(B)
LD b1,9*SIZE(B)
MADD t12,t12,a1,b5
-
+
LD b2,10*SIZE(B)
LD b3,11*SIZE(B)
MADD t13,t13,a1,b6
@@ -1126,12 +1126,12 @@
LD a3, 3*SIZE(A) # a3
MADD t11,t11,a2,b0
daddiu K,K,-1
-
+
LD b4,12*SIZE(B)
LD b5,13*SIZE(B)
MADD t12,t12,a2,b1
daddu A,A,4*SIZE # 1mr*4kr
-
+
LD b6,14*SIZE(B)
LD b7,15*SIZE(B)
MADD t13,t13,a2,b2
@@ -1140,7 +1140,7 @@
LD a0, 0*SIZE(A) # a0
daddu B,B,16*SIZE # 4nr*4kr
MADD t11,t11,a3,b4
-
+
LD b0,0*SIZE(B)
MADD t12,t12,a3,b5
LD b1,1*SIZE(B)
@@ -1154,14 +1154,14 @@
.L35: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L38
nop
-.L36:
+.L36:
LD a1,1*SIZE(A) # load a1
MADD t11,t11,a0,b0
@@ -1169,10 +1169,10 @@
LD b5,5*SIZE(B)
MADD t12,t12,a0,b1
daddu A,A,2*SIZE # mr*2kr
-
+
LD b6,6*SIZE(B)
MADD t13,t13,a0,b2
-
+
LD b7,7*SIZE(B)
MADD t14,t14,a0,b3
daddu B,B,8*SIZE # 4nr*2kr
@@ -1181,41 +1181,41 @@
.L37:
LD a0,0(A)
MADD t11,t11,a1,b4
-
+
LD b0,0*SIZE(B)
LD b1,1*SIZE(B)
MADD t12,t12,a1,b5
-
+
LD b2,2*SIZE(B)
LD b3,3*SIZE(B)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
-
-
+
+
.L38: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L39
+ beqz K,.L39
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
- daddu A,A,1*SIZE
+ daddu A,A,1*SIZE
daddu B,B,4*SIZE
-
+
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
.L39: # Write Back
#ifndef TRMMKERNEL
- LD c11,0(CO1)
+ LD c11,0(CO1)
LD c12,0(CO2)
LD c13,0(CO3)
LD c14,0(CO4)
-
+
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
MADD t13,c13,t13,ALPHA
@@ -1261,22 +1261,22 @@
.L0_N4_Loop: # mc finished
daddiu N,N,-1 # N--
#if defined(TRMMKERNEL) && !defined(LEFT)
- daddiu KK, KK,4
+ daddiu KK, KK,4
#endif
- bnez N,.L0_N4_Lb
+ bnez N,.L0_N4_Lb
move BO,B # Set BO point to next panel Bj
- .align 5
+ .align 5
.L0_N2:
andi N,NCO,2 # nr = 2
- beqz N,.L0_N1
+ beqz N,.L0_N1
nop
.L0_N2_Lb:
- move CO1,C
+ move CO1,C
daddu CO2,C,LDC
- dsra M,MCO,2
+ dsra M,MCO,2
move A,AO # Reset A
daddu PREA,AO,SPANA
@@ -1288,13 +1288,13 @@
beqz M,.L12_M2
nop
-.L40:
+.L40:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
dsll K,KK, 2 + BASE_SHIFT
- dsll TEMP, KK,1 + BASE_SHIFT
+ dsll TEMP, KK,1 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
@@ -1311,10 +1311,10 @@
MOV t41,t11
LD a2,2*SIZE(A)
LD a3,3*SIZE(A)
-
+
MOV t12,t11
MOV t22,t11
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
@@ -1322,7 +1322,7 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t32,t11
beqz K,.L45
MOV t42,t11
@@ -1342,10 +1342,10 @@
LD a2,2*SIZE(A)
dsra K,KCO,2 # K=KCO/2
LD a3,3*SIZE(A)
-
+
MOV t12,t11
MOV t22,t11
-
+
MOV t32,t11
beqz K,.L45
MOV t42,t11
@@ -1411,9 +1411,9 @@
FETCH $0,8*SIZE(PREA)
MADD t32,t32,a2,b3
MADD t42,t42,a3,b3
-
+
daddu A,A,16*SIZE # 4mr*4kr
- daddu B,B,8*SIZE # 2nr*4kr
+ daddu B,B,8*SIZE # 2nr*4kr
.L44:
MADD t11,t11,a4,b6
@@ -1443,14 +1443,14 @@
.L45: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L48
nop
-.L46:
+.L46:
MADD t11,t11,a0,b0
LD a4,4*SIZE(A)
MADD t21,t21,a1,b0
@@ -1469,7 +1469,7 @@
FETCH $0,0(PREA)
MADD t32,t32,a2,b1
daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
-
+
MADD t42,t42,a3,b1
daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
@@ -1495,16 +1495,16 @@
daddu PREA,PREA,8*SIZE
-
+
.L48: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L49
+ beqz K,.L49
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -1524,7 +1524,7 @@
.L49: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # gemm write back part Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -1545,7 +1545,7 @@
MADD t32,c32,t32,ALPHA
ST t41,3*SIZE(CO1)
MADD t42,c42,t42,ALPHA
- daddiu M,M,-1
+ daddiu M,M,-1
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
@@ -1557,8 +1557,8 @@
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
- daddu CO1,CO1,4*SIZE
- bnez M,.L40
+ daddu CO1,CO1,4*SIZE
+ bnez M,.L40
daddu CO2,CO2,4*SIZE
#else
@@ -1566,7 +1566,7 @@
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
-
+
MUL t12, ALPHA, t12
ST t11, 0 * SIZE(CO1)
MUL t22, ALPHA, t22
@@ -1575,13 +1575,13 @@
ST t31, 2 * SIZE(CO1)
MUL t42, ALPHA, t42
ST t41, 3 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
daddiu M,M,-1
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
-
+
daddiu CO1,CO1, 4*SIZE
daddiu CO2,CO2, 4*SIZE
@@ -1615,7 +1615,7 @@
.align 3
.L12_M2:
andi M,MCO,2 # mr = 2
- beqz M,.L12_M1
+ beqz M,.L12_M1
nop
.L50:
@@ -1636,7 +1636,7 @@
LD b0,0*SIZE(B)
MOV t21,t11
LD b1,1*SIZE(B)
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
@@ -1644,7 +1644,7 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t12,t11
beqz K,.L55
MOV t22,t11
@@ -1659,7 +1659,7 @@
LD b0,0*SIZE(B)
MOV t21,t11
LD b1,1*SIZE(B)
-
+
MOV t12,t11
beqz K,.L55
MOV t22,t11
@@ -1715,14 +1715,14 @@
.L55: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L58
nop
-.L56:
+.L56:
MADD t11,t11,a0,b0
LD a4,2*SIZE(A)
MADD t21,t21,a1,b0
@@ -1752,9 +1752,9 @@
#else
andi K,TEMP, 1
#endif
- beqz K,.L59
+ beqz K,.L59
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
@@ -1767,10 +1767,10 @@
.L59: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # write gemm part back Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
-
+
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
MADD t12,c12,t12,ALPHA
@@ -1781,7 +1781,7 @@
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
- daddu CO1,CO1,2*SIZE
+ daddu CO1,CO1,2*SIZE
daddu CO2,CO2,2*SIZE
FETCH $0,0(CO1)
@@ -1827,7 +1827,7 @@
.align 3
.L12_M1:
andi M,MCO,1 # mr = 1
- beqz M,.L0_N2_Loop
+ beqz M,.L0_N2_Loop
nop
.L60:
@@ -1842,7 +1842,7 @@
daddu B, BO, TEMP
#endif
LD a0,0*SIZE(A)
-
+
MTC $0,t11
MOV t21,t11
LD b0,0*SIZE(B)
@@ -1857,16 +1857,16 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t22,t11
beqz K,.L65
nop
#else
- dsra K,KCO,2
+ dsra K,KCO,2
move B,BO # Reset B
LD a0,0*SIZE(A)
-
+
MTC $0,t11
MOV t21,t11
LD b0,0*SIZE(B)
@@ -1878,18 +1878,18 @@
#endif
-.L61: # nr=2,mr=1,kr=4
+.L61: # nr=2,mr=1,kr=4
LD a4, 1*SIZE(A) # a2
LD b4, 2*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD b5,3*SIZE(B)
MADD t12,t12,a0,b1
LD a2, 2*SIZE(A) # a3
LD b2,4*SIZE(B)
MADD t11,t11,a4,b4
-
+
LD b3,5*SIZE(B)
MADD t12,t12,a4,b5
@@ -1897,17 +1897,17 @@
daddiu K,K,-1
LD b6,6*SIZE(B)
MADD t11,t11,a2,b2
-
+
LD b7,7*SIZE(B)
MADD t12,t12,a2,b3
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
-
- LD b0,0*SIZE(B)
+
+ LD b0,0*SIZE(B)
MADD t11,t11,a6,b6
-
+
LD b1,1*SIZE(B)
bnez K,.L61
MADD t12,t12,a6,b7
@@ -1916,19 +1916,19 @@
.L65: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L68
nop
-.L66:
+.L66:
LD a4, 1*SIZE(A) # a1
MADD t11,t11,a0,b0
LD b4,2*SIZE(B)
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
-
+
LD b5,3*SIZE(B)
MADD t12,t12,a0,b1
daddu B,B,4*SIZE
@@ -1937,7 +1937,7 @@
LD a0,0(A) # a0
LD b0,0*SIZE(B)
MADD t11,t11,a4,b4
-
+
LD b1,1*SIZE(B)
MADD t12,t12,a4,b5
@@ -1948,9 +1948,9 @@
#else
andi K,TEMP,1
#endif
- beqz K,.L69
+ beqz K,.L69
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
@@ -1961,14 +1961,14 @@
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
LD c12,0(CO2)
-
+
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
- daddu CO1,CO1,1*SIZE
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#else
@@ -1978,7 +1978,7 @@
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
- daddu CO1,CO1,1*SIZE
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -2008,15 +2008,15 @@
move BO, B
- .align 5
+ .align 5
.L0_N1:
andi N,NCO,1 # nr = 1
- beqz N,.L999
+ beqz N,.L999
nop
- move CO1,C
- dsra M,MCO,2
-
+ move CO1,C
+ dsra M,MCO,2
+
move A,AO # Reset A
daddu PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
@@ -2026,7 +2026,7 @@
beqz M,.L11_M2
daddu C,CO1,LDC
-.L70:
+.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO # Reset B
@@ -2038,12 +2038,12 @@
daddu B, BO, TEMP
#endif
LD b0, 0*SIZE(B)
-
+
MTC $0,t11
LD a0,0*SIZE(A)
MOV t21,t11
LD a1,1*SIZE(A)
-
+
MOV t31,t11
LD a2,2*SIZE(A)
MOV t41,t11
@@ -2057,19 +2057,19 @@
#else
daddiu TEMP, KK, 1
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
beqz K,.L75
nop
#else
move B, BO # Reset B
- dsra K,KCO,2
+ dsra K,KCO,2
LD b0, 0*SIZE(B)
-
+
MTC $0,t11
LD a0,0*SIZE(A)
MOV t21,t11
LD a1,1*SIZE(A)
-
+
MOV t31,t11
LD a2,2*SIZE(A)
MOV t41,t11
@@ -2081,7 +2081,7 @@
.L71: # nr=1,mr=kr=4
LD b4, 1*SIZE(B) # b1
MADD t11,t11,a0,b0
-
+
LD a4, 4*SIZE(A)
MADD t21,t21,a1,b0
@@ -2097,7 +2097,7 @@
.L72:
LD b2, 2*SIZE(B) # b2
MADD t11,t11,a4,b4
-
+
LD a0,8*SIZE(A)
MADD t21,t21,a5,b4
@@ -2106,17 +2106,17 @@
LD a2,10*SIZE(A)
MADD t31,t31,a6,b4
-
+
LD a3,11*SIZE(A)
MADD t41,t41,a7,b4
.L73:
LD b6, 3*SIZE(B)
MADD t11,t11,a0,b2
-
+
LD a4,12*SIZE(A)
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
-
+
LD a5,13*SIZE(A)
MADD t21,t21,a1,b2
@@ -2131,7 +2131,7 @@
.L74:
LD b0, 0*SIZE(B)
MADD t11,t11,a4,b6
-
+
LD a0,0*SIZE(A)
daddu PREA,PREA,16*SIZE
@@ -2150,20 +2150,20 @@
.L75: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L78
nop
-.L76:
+.L76:
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD a4,4*SIZE(A)
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
-
+
LD a5,5*SIZE(A)
MADD t21,t21,a1,b0
FETCH $0,0(PREA)
@@ -2193,16 +2193,16 @@
daddu PREA,PREA,8*SIZE
-
+
.L78: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L79
+ beqz K,.L79
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -2217,7 +2217,7 @@
.L79: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -2252,7 +2252,7 @@
FETCH $0,4*SIZE(CO1)
FETCH $0,8*SIZE(CO1)
- daddu CO1,CO1,4*SIZE
+ daddu CO1,CO1,4*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
@@ -2271,7 +2271,7 @@
#ifdef LEFT
daddiu KK, KK, 4
#endif
- bnez M,.L70
+ bnez M,.L70
nop
#endif
@@ -2279,10 +2279,10 @@
.align 3
.L11_M2:
andi M,MCO,2 # mr = 2
- beqz M,.L11_M1
+ beqz M,.L11_M1
nop
-.L80:
+.L80:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
@@ -2312,13 +2312,13 @@
nop
#else
move B, BO
- dsra K,KCO,2
+ dsra K,KCO,2
LD b0, 0*SIZE(B)
MTC $0,t11
MOV t21,t11
LD a0,0*SIZE(A)
-
+
beqz K,.L85
LD a1,1*SIZE(A)
@@ -2336,7 +2336,7 @@
MADD t11,t11,a4,b4
LD a3,5*SIZE(A)
MADD t21,t21,a5,b4
-
+
LD b6, 3*SIZE(B)
LD a6,6*SIZE(A)
MADD t11,t11,a2,b2
@@ -2358,23 +2358,23 @@
.L85: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L88
nop
-.L86:
+.L86:
LD b4, 1*SIZE(B)
LD a4,2*SIZE(A)
MADD t11,t11,a0,b0
LD a5,3*SIZE(A)
MADD t21,t21,a1,b0
-
+
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
-
+
LD b0,0(B)
LD a0,0*SIZE(A)
MADD t11,t11,a4,b4
@@ -2382,16 +2382,16 @@
MADD t21,t21,a5,b4
-
+
.L88: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L89
+ beqz K,.L89
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
@@ -2401,7 +2401,7 @@
.L89: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
@@ -2410,7 +2410,7 @@
ST t21,1*SIZE(CO1)
FETCH $0,2*SIZE(CO1)
-
+
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
#else
@@ -2445,10 +2445,10 @@
.align 3
.L11_M1:
andi M,MCO,1 # mr = 1
- beqz M,.L999
+ beqz M,.L999
nop
-.L90:
+.L90:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
@@ -2478,7 +2478,7 @@
move B, BO
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
- dsra K,KCO,2
+ dsra K,KCO,2
beqz K,.L95
MTC $0,t11
#endif
@@ -2487,7 +2487,7 @@
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD a2, 2*SIZE(A)
LD b2, 2*SIZE(B)
MADD t11,t11,a4,b4
@@ -2495,28 +2495,28 @@
LD a6, 3*SIZE(A)
LD b6, 3*SIZE(B)
MADD t11,t11,a2,b2
-
+
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
MADD t11,t11,a6,b6
-
+
daddiu K,K,-1
bnez K,.L91
nop
.L95: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L98
nop
-.L96:
+.L96:
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
@@ -2526,14 +2526,14 @@
LD b0,0(B)
LD a0,0(A)
MADD t11,t11,a4,b4
-
+
.L98: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L99
+ beqz K,.L99
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
diff --git a/kernel/mips64/sgemm_kernel_loongson3b_4x4.S b/kernel/mips64/sgemm_kernel_loongson3b_4x4.S
index 4a8c9b0..10c5f47 100644
--- a/kernel/mips64/sgemm_kernel_loongson3b_4x4.S
+++ b/kernel/mips64/sgemm_kernel_loongson3b_4x4.S
@@ -110,7 +110,7 @@
#define F27 27
#define F26 26
#define F25 25
-#define F24 24
+#define F24 24
#define F23 23
#define F22 22
#define F21 21
@@ -118,7 +118,7 @@
#define F19 19
#define F18 18
#define F17 17
-#define F16 16
+#define F16 16
#define F15 15
#define F14 14
#define F13 13
@@ -130,14 +130,14 @@
#define F7 7
#define F6 6
#define F5 5
-#define F4 4
-#define F3 3
-#define F2 2
-#define F1 1
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
#define F0 0
PROLOGUE
-
+
daddiu $sp, $sp, -160
sd $16, 0($sp)
sd $17, 8($sp)
@@ -160,7 +160,7 @@
ST $f23,144($sp)
- .align 5
+ .align 5
.L0_N4: # Loop N
ST ALPHA,152($sp) # Backup ALPHA
move MCO,M # Backup M
@@ -170,26 +170,26 @@
move AO,A # Backup A_addr
dsra N,NCO,2 # N=NCO/2
-
+
dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
-
+
#if defined(TRMMKERNEL)
- LDARG OFFSET,160($sp) # OFFSET is relate to the data part
+ LDARG OFFSET,160($sp) # OFFSET is relate to the data part
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
- neg KK,OFFSET
+ neg KK,OFFSET
#endif
-
+
move BO,B # Backup B_addr
beq N,$0,.L0_N2 # N=0,NCO<4
dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte
.L0_N4_Lb: # mr=4,nr=4
- move CO1,C
+ move CO1,C
dsra M,MCO,2 # M=MCO/2
-
+
move A,AO # Reset A
daddu CO2,C,LDC
@@ -200,7 +200,7 @@
daddu CO4,CO3,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
- move KK,OFFSET
+ move KK,OFFSET
#endif
beqz M,.L14_M2
daddu C,CO4,LDC # move C to next panel Cj
@@ -227,18 +227,18 @@
MOV t41,t11
MOV t12,t11
LD b0,0(B)
-
+
MOV t22,t11
MOV t32,t11
LD b1,1*SIZE(B)
MOV t42,t11
LD a2,2*SIZE(A)
-
+
MOV t13,t11
MOV t23,t11
LD b2,2*SIZE(B)
-
+
MOV t33,t11
MOV t43,t11
LD a3,3*SIZE(A)
@@ -250,7 +250,7 @@
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK # temp is the length of the data part
#elif defined(LEFT)
- daddiu TEMP, KK, 4 # S=L,U=L
+ daddiu TEMP, KK, 4 # S=L,U=L
#else
daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part
#endif
@@ -259,7 +259,7 @@
beqz K,.L15
MOV t44,t11
-#else
+#else
move B,BO # Reset B
MTC $0,t11 # GEMM part NR=4,MR=4
LD a0,0(A)
@@ -271,7 +271,7 @@
MOV t41,t11
MOV t12,t11
LD b0,0(B)
-
+
MOV t22,t11
MOV t32,t11
LD b1,1*SIZE(B)
@@ -279,11 +279,11 @@
MOV t42,t11
dsra K,KCO,2 # K=KCO/2
LD a2,2*SIZE(A)
-
+
MOV t13,t11
MOV t23,t11
LD b2,2*SIZE(B)
-
+
MOV t33,t11
MOV t43,t11
LD a3,3*SIZE(A)
@@ -296,7 +296,7 @@
beqz K,.L15
MOV t44,t11 # clear 16 results registers
#endif
-
+
.align 5
.L11: # kr=4
MADD t11,t11,a0,b0
@@ -306,29 +306,29 @@
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
LD a5,5*SIZE(A)
-
+
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
LD b4,4*SIZE(B)
-
+
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
LD b5,5*SIZE(B)
FETCH $0,(PREB)
-
+
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
LD a6,6*SIZE(A)
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
LD b6,6*SIZE(B)
FETCH $0,(PREA)
-
+
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
LD a7,7*SIZE(A)
-
+
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
LD b7,7*SIZE(B)
@@ -447,14 +447,14 @@
.L15: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP, 2
#endif
beqz K,.L18
nop
-.L16:
+.L16:
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
LD a4,4*SIZE(A)
@@ -528,16 +528,16 @@
daddu PREB,PREB,8*SIZE
LD b3,3*SIZE(B)
-
+
.L18: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L19
+ beqz K,.L19
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREB)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -569,8 +569,8 @@
MADD t44,t44,a3,b3
.L19: # Write Back to C
-#ifndef TRMMKERNEL
- LD c11,0(CO1) # GEMM write part
+#ifndef TRMMKERNEL
+ LD c11,0(CO1) # GEMM write part
LD c21,1*SIZE(CO1) # get 16 C
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -640,11 +640,11 @@
daddu CO3,CO3,4*SIZE
ST t44,3*SIZE(CO4)
daddu PREB,BO,SPANB
-
- bnez M,.L10
+
+ bnez M,.L10
daddu CO4,CO4,4*SIZE
-#else
+#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
@@ -685,7 +685,7 @@
daddiu CO1,CO1, 4 * SIZE
daddiu CO2,CO2, 4 * SIZE
daddiu CO3,CO3, 4 * SIZE
- daddiu CO4,CO4, 4 * SIZE
+ daddiu CO4,CO4, 4 * SIZE
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
@@ -698,7 +698,7 @@
FETCH $0,0(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- dsubu TEMP,KCO,KK
+ dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP, -4
#else
@@ -710,10 +710,10 @@
daddu B,B,TEMP # mov B to the end of panel Bj
#endif
-#ifdef LEFT
+#ifdef LEFT
daddiu KK, KK,4
#endif
- bnez M,.L10
+ bnez M,.L10
nop
#endif
@@ -721,7 +721,7 @@
.align 3
.L14_M2:
andi M, MCO, 2 # nr=4,mr=2
- beqz M,.L14_M1
+ beqz M,.L14_M1
nop
.L20:
@@ -729,7 +729,7 @@
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
- dsll K,KK,1 + BASE_SHIFT # mr=2
+ dsll K,KK,1 + BASE_SHIFT # mr=2
dsll TEMP,KK,2 + BASE_SHIFT # nr=4
daddu A,A,K
daddu B,BO,TEMP
@@ -738,7 +738,7 @@
LD a0,0*SIZE(A)
MTC $0,t11
LD a1,1*SIZE(A)
-
+
MOV t21,t11
LD b0,0*SIZE(B)
MOV t12,t11
@@ -764,18 +764,18 @@
MOV t24,t11 # clear 2*4=8 results registers
#else
- move B,BO # Reset B
+ move B,BO # Reset B
LD a0,0*SIZE(A)
MTC $0,t11
LD a1,1*SIZE(A)
-
+
MOV t21,t11
LD b0,0*SIZE(B)
MOV t12,t11
LD b1,1*SIZE(B)
MOV t22,t11
- dsra K,KCO,2
+ dsra K,KCO,2
LD b2,2*SIZE(B)
MOV t13,t11
@@ -806,7 +806,7 @@
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
-
+
MADD t11,t11,a4,b4
LD a2,4*SIZE(A)
MADD t21,t21,a5,b4
@@ -866,7 +866,7 @@
MADD t24,t24,a7,b7
-.L25:
+.L25:
#ifndef TRMMKERNEL
andi K,KCO,2 # kr=2
#else
@@ -875,7 +875,7 @@
beqz K,.L28
nop
-.L26:
+.L26:
MADD t11,t11,a0,b0
LD a4,2*SIZE(A)
MADD t21,t21,a1,b0
@@ -890,7 +890,7 @@
LD b6,6*SIZE(B)
MADD t23,t23,a1,b2
LD b7,7*SIZE(B)
-
+
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
daddu A,A,4*SIZE # 2mr*2kr
@@ -915,16 +915,16 @@
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
-
-.L28: # kr=1
+
+.L28: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L29
+ beqz K,.L29
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # 2mr*kr
@@ -942,11 +942,11 @@
.L29: # Write Back to C
#ifndef TRMMKERNEL
LD c11,0(CO1) # GEMM write back part
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
-
+
LD c13,0(CO3)
MADD t11,c11,t11,ALPHA
LD c23,1*SIZE(CO3)
@@ -985,25 +985,25 @@
#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
-
+
ST t11, 0 * SIZE(CO1)
MUL t12, ALPHA, t12
ST t21, 1 * SIZE(CO1)
MUL t22, ALPHA, t22
-
+
ST t12, 0 * SIZE(CO2)
MUL t13, ALPHA, t13
ST t22, 1 * SIZE(CO2)
MUL t23, ALPHA, t23
-
+
ST t13, 0 * SIZE(CO3)
MUL t14, ALPHA, t14
ST t23, 1 * SIZE(CO3)
MUL t24, ALPHA, t24
-
+
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
-
+
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
daddiu CO3,CO3, 2 * SIZE
@@ -1036,7 +1036,7 @@
.align 3
.L14_M1:
- andi M,MCO,1 # mr=1
+ andi M,MCO,1 # mr=1
beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
nop
@@ -1056,13 +1056,13 @@
MTC $0,t11
LD b0,0*SIZE(B)
-
+
MOV t12,t11
LD b1,1*SIZE(B)
MOV t13,t11
LD b2,2*SIZE(B)
-
+
MOV t14,t11
LD b3,3*SIZE(B)
@@ -1077,35 +1077,35 @@
nop
beqz K,.L35
nop
-
-#else
+
+#else
move B,BO # Reset B, GEMM part
dsra K,KCO,2 # K=KCO/2
LD a0, 0 * SIZE(A) # a0
MTC $0,t11
LD b0,0*SIZE(B)
-
+
MOV t12,t11
LD b1,1*SIZE(B)
MOV t13,t11
LD b2,2*SIZE(B)
-
+
MOV t14,t11
beqz K,.L35
LD b3,3*SIZE(B)
#endif
-.L31: # nr=4,mr=1,kr=4
+.L31: # nr=4,mr=1,kr=4
LD a1, 1*SIZE(A) # load a1
MADD t11,t11,a0,b0
-
+
LD b4,4*SIZE(B)
LD b5,5*SIZE(B)
MADD t12,t12,a0,b1
-
+
LD b6,6*SIZE(B)
LD b7,7*SIZE(B)
MADD t13,t13,a0,b2
@@ -1113,11 +1113,11 @@
LD a2, 2*SIZE(A) # a2
MADD t11,t11,a1,b4
-
+
LD b0,8*SIZE(B)
LD b1,9*SIZE(B)
MADD t12,t12,a1,b5
-
+
LD b2,10*SIZE(B)
LD b3,11*SIZE(B)
MADD t13,t13,a1,b6
@@ -1126,12 +1126,12 @@
LD a3, 3*SIZE(A) # a3
MADD t11,t11,a2,b0
daddiu K,K,-1
-
+
LD b4,12*SIZE(B)
LD b5,13*SIZE(B)
MADD t12,t12,a2,b1
daddu A,A,4*SIZE # 1mr*4kr
-
+
LD b6,14*SIZE(B)
LD b7,15*SIZE(B)
MADD t13,t13,a2,b2
@@ -1140,7 +1140,7 @@
LD a0, 0*SIZE(A) # a0
daddu B,B,16*SIZE # 4nr*4kr
MADD t11,t11,a3,b4
-
+
LD b0,0*SIZE(B)
MADD t12,t12,a3,b5
LD b1,1*SIZE(B)
@@ -1154,14 +1154,14 @@
.L35: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L38
nop
-.L36:
+.L36:
LD a1,1*SIZE(A) # load a1
MADD t11,t11,a0,b0
@@ -1169,10 +1169,10 @@
LD b5,5*SIZE(B)
MADD t12,t12,a0,b1
daddu A,A,2*SIZE # mr*2kr
-
+
LD b6,6*SIZE(B)
MADD t13,t13,a0,b2
-
+
LD b7,7*SIZE(B)
MADD t14,t14,a0,b3
daddu B,B,8*SIZE # 4nr*2kr
@@ -1181,41 +1181,41 @@
.L37:
LD a0,0(A)
MADD t11,t11,a1,b4
-
+
LD b0,0*SIZE(B)
LD b1,1*SIZE(B)
MADD t12,t12,a1,b5
-
+
LD b2,2*SIZE(B)
LD b3,3*SIZE(B)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
-
-
+
+
.L38: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L39
+ beqz K,.L39
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
- daddu A,A,1*SIZE
+ daddu A,A,1*SIZE
daddu B,B,4*SIZE
-
+
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
.L39: # Write Back
#ifndef TRMMKERNEL
- LD c11,0(CO1)
+ LD c11,0(CO1)
LD c12,0(CO2)
LD c13,0(CO3)
LD c14,0(CO4)
-
+
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
MADD t13,c13,t13,ALPHA
@@ -1261,22 +1261,22 @@
.L0_N4_Loop: # mc finished
daddiu N,N,-1 # N--
#if defined(TRMMKERNEL) && !defined(LEFT)
- daddiu KK, KK,4
+ daddiu KK, KK,4
#endif
- bnez N,.L0_N4_Lb
+ bnez N,.L0_N4_Lb
move BO,B # Set BO point to next panel Bj
- .align 5
+ .align 5
.L0_N2:
andi N,NCO,2 # nr = 2
- beqz N,.L0_N1
+ beqz N,.L0_N1
nop
.L0_N2_Lb:
- move CO1,C
+ move CO1,C
daddu CO2,C,LDC
- dsra M,MCO,2
+ dsra M,MCO,2
move A,AO # Reset A
daddu PREA,AO,SPANA
@@ -1288,13 +1288,13 @@
beqz M,.L12_M2
nop
-.L40:
+.L40:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
dsll K,KK, 2 + BASE_SHIFT
- dsll TEMP, KK,1 + BASE_SHIFT
+ dsll TEMP, KK,1 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
@@ -1311,10 +1311,10 @@
MOV t41,t11
LD a2,2*SIZE(A)
LD a3,3*SIZE(A)
-
+
MOV t12,t11
MOV t22,t11
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
@@ -1322,7 +1322,7 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t32,t11
beqz K,.L45
MOV t42,t11
@@ -1342,10 +1342,10 @@
LD a2,2*SIZE(A)
dsra K,KCO,2 # K=KCO/2
LD a3,3*SIZE(A)
-
+
MOV t12,t11
MOV t22,t11
-
+
MOV t32,t11
beqz K,.L45
MOV t42,t11
@@ -1411,9 +1411,9 @@
FETCH $0,8*SIZE(PREA)
MADD t32,t32,a2,b3
MADD t42,t42,a3,b3
-
+
daddu A,A,16*SIZE # 4mr*4kr
- daddu B,B,8*SIZE # 2nr*4kr
+ daddu B,B,8*SIZE # 2nr*4kr
.L44:
MADD t11,t11,a4,b6
@@ -1443,14 +1443,14 @@
.L45: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L48
nop
-.L46:
+.L46:
MADD t11,t11,a0,b0
LD a4,4*SIZE(A)
MADD t21,t21,a1,b0
@@ -1469,7 +1469,7 @@
FETCH $0,0(PREA)
MADD t32,t32,a2,b1
daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
-
+
MADD t42,t42,a3,b1
daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
@@ -1495,16 +1495,16 @@
daddu PREA,PREA,8*SIZE
-
+
.L48: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L49
+ beqz K,.L49
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -1524,7 +1524,7 @@
.L49: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # gemm write back part Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -1545,7 +1545,7 @@
MADD t32,c32,t32,ALPHA
ST t41,3*SIZE(CO1)
MADD t42,c42,t42,ALPHA
- daddiu M,M,-1
+ daddiu M,M,-1
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
@@ -1557,8 +1557,8 @@
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
- daddu CO1,CO1,4*SIZE
- bnez M,.L40
+ daddu CO1,CO1,4*SIZE
+ bnez M,.L40
daddu CO2,CO2,4*SIZE
#else
@@ -1566,7 +1566,7 @@
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
-
+
MUL t12, ALPHA, t12
ST t11, 0 * SIZE(CO1)
MUL t22, ALPHA, t22
@@ -1575,13 +1575,13 @@
ST t31, 2 * SIZE(CO1)
MUL t42, ALPHA, t42
ST t41, 3 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
daddiu M,M,-1
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
-
+
daddiu CO1,CO1, 4*SIZE
daddiu CO2,CO2, 4*SIZE
@@ -1615,7 +1615,7 @@
.align 3
.L12_M2:
andi M,MCO,2 # mr = 2
- beqz M,.L12_M1
+ beqz M,.L12_M1
nop
.L50:
@@ -1636,7 +1636,7 @@
LD b0,0*SIZE(B)
MOV t21,t11
LD b1,1*SIZE(B)
-
+
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
@@ -1644,7 +1644,7 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t12,t11
beqz K,.L55
MOV t22,t11
@@ -1659,7 +1659,7 @@
LD b0,0*SIZE(B)
MOV t21,t11
LD b1,1*SIZE(B)
-
+
MOV t12,t11
beqz K,.L55
MOV t22,t11
@@ -1715,14 +1715,14 @@
.L55: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L58
nop
-.L56:
+.L56:
MADD t11,t11,a0,b0
LD a4,2*SIZE(A)
MADD t21,t21,a1,b0
@@ -1752,9 +1752,9 @@
#else
andi K,TEMP, 1
#endif
- beqz K,.L59
+ beqz K,.L59
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
@@ -1767,10 +1767,10 @@
.L59: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # write gemm part back Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
-
+
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
MADD t12,c12,t12,ALPHA
@@ -1781,7 +1781,7 @@
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
- daddu CO1,CO1,2*SIZE
+ daddu CO1,CO1,2*SIZE
daddu CO2,CO2,2*SIZE
FETCH $0,0(CO1)
@@ -1827,7 +1827,7 @@
.align 3
.L12_M1:
andi M,MCO,1 # mr = 1
- beqz M,.L0_N2_Loop
+ beqz M,.L0_N2_Loop
nop
.L60:
@@ -1842,7 +1842,7 @@
daddu B, BO, TEMP
#endif
LD a0,0*SIZE(A)
-
+
MTC $0,t11
MOV t21,t11
LD b0,0*SIZE(B)
@@ -1857,16 +1857,16 @@
#else
daddiu TEMP, KK, 2
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
MOV t22,t11
beqz K,.L65
nop
#else
- dsra K,KCO,2
+ dsra K,KCO,2
move B,BO # Reset B
LD a0,0*SIZE(A)
-
+
MTC $0,t11
MOV t21,t11
LD b0,0*SIZE(B)
@@ -1878,18 +1878,18 @@
#endif
-.L61: # nr=2,mr=1,kr=4
+.L61: # nr=2,mr=1,kr=4
LD a4, 1*SIZE(A) # a2
LD b4, 2*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD b5,3*SIZE(B)
MADD t12,t12,a0,b1
LD a2, 2*SIZE(A) # a3
LD b2,4*SIZE(B)
MADD t11,t11,a4,b4
-
+
LD b3,5*SIZE(B)
MADD t12,t12,a4,b5
@@ -1897,17 +1897,17 @@
daddiu K,K,-1
LD b6,6*SIZE(B)
MADD t11,t11,a2,b2
-
+
LD b7,7*SIZE(B)
MADD t12,t12,a2,b3
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
-
- LD b0,0*SIZE(B)
+
+ LD b0,0*SIZE(B)
MADD t11,t11,a6,b6
-
+
LD b1,1*SIZE(B)
bnez K,.L61
MADD t12,t12,a6,b7
@@ -1916,19 +1916,19 @@
.L65: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L68
nop
-.L66:
+.L66:
LD a4, 1*SIZE(A) # a1
MADD t11,t11,a0,b0
LD b4,2*SIZE(B)
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
-
+
LD b5,3*SIZE(B)
MADD t12,t12,a0,b1
daddu B,B,4*SIZE
@@ -1937,7 +1937,7 @@
LD a0,0(A) # a0
LD b0,0*SIZE(B)
MADD t11,t11,a4,b4
-
+
LD b1,1*SIZE(B)
MADD t12,t12,a4,b5
@@ -1948,9 +1948,9 @@
#else
andi K,TEMP,1
#endif
- beqz K,.L69
+ beqz K,.L69
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
@@ -1961,14 +1961,14 @@
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
LD c12,0(CO2)
-
+
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
- daddu CO1,CO1,1*SIZE
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#else
@@ -1978,7 +1978,7 @@
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
- daddu CO1,CO1,1*SIZE
+ daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -2008,15 +2008,15 @@
move BO, B
- .align 5
+ .align 5
.L0_N1:
andi N,NCO,1 # nr = 1
- beqz N,.L999
+ beqz N,.L999
nop
- move CO1,C
- dsra M,MCO,2
-
+ move CO1,C
+ dsra M,MCO,2
+
move A,AO # Reset A
daddu PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
@@ -2026,7 +2026,7 @@
beqz M,.L11_M2
daddu C,CO1,LDC
-.L70:
+.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO # Reset B
@@ -2038,12 +2038,12 @@
daddu B, BO, TEMP
#endif
LD b0, 0*SIZE(B)
-
+
MTC $0,t11
LD a0,0*SIZE(A)
MOV t21,t11
LD a1,1*SIZE(A)
-
+
MOV t31,t11
LD a2,2*SIZE(A)
MOV t41,t11
@@ -2057,19 +2057,19 @@
#else
daddiu TEMP, KK, 1
#endif
- dsra K,TEMP,2
+ dsra K,TEMP,2
beqz K,.L75
nop
#else
move B, BO # Reset B
- dsra K,KCO,2
+ dsra K,KCO,2
LD b0, 0*SIZE(B)
-
+
MTC $0,t11
LD a0,0*SIZE(A)
MOV t21,t11
LD a1,1*SIZE(A)
-
+
MOV t31,t11
LD a2,2*SIZE(A)
MOV t41,t11
@@ -2081,7 +2081,7 @@
.L71: # nr=1,mr=kr=4
LD b4, 1*SIZE(B) # b1
MADD t11,t11,a0,b0
-
+
LD a4, 4*SIZE(A)
MADD t21,t21,a1,b0
@@ -2097,7 +2097,7 @@
.L72:
LD b2, 2*SIZE(B) # b2
MADD t11,t11,a4,b4
-
+
LD a0,8*SIZE(A)
MADD t21,t21,a5,b4
@@ -2106,17 +2106,17 @@
LD a2,10*SIZE(A)
MADD t31,t31,a6,b4
-
+
LD a3,11*SIZE(A)
MADD t41,t41,a7,b4
.L73:
LD b6, 3*SIZE(B)
MADD t11,t11,a0,b2
-
+
LD a4,12*SIZE(A)
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
-
+
LD a5,13*SIZE(A)
MADD t21,t21,a1,b2
@@ -2131,7 +2131,7 @@
.L74:
LD b0, 0*SIZE(B)
MADD t11,t11,a4,b6
-
+
LD a0,0*SIZE(A)
daddu PREA,PREA,16*SIZE
@@ -2150,20 +2150,20 @@
.L75: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L78
nop
-.L76:
+.L76:
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD a4,4*SIZE(A)
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
-
+
LD a5,5*SIZE(A)
MADD t21,t21,a1,b0
FETCH $0,0(PREA)
@@ -2193,16 +2193,16 @@
daddu PREA,PREA,8*SIZE
-
+
.L78: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L79
+ beqz K,.L79
LD ALPHA,152($sp) # Get ALPHA
-
+
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
@@ -2217,7 +2217,7 @@
.L79: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
@@ -2252,7 +2252,7 @@
FETCH $0,4*SIZE(CO1)
FETCH $0,8*SIZE(CO1)
- daddu CO1,CO1,4*SIZE
+ daddu CO1,CO1,4*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
@@ -2271,7 +2271,7 @@
#ifdef LEFT
daddiu KK, KK, 4
#endif
- bnez M,.L70
+ bnez M,.L70
nop
#endif
@@ -2279,10 +2279,10 @@
.align 3
.L11_M2:
andi M,MCO,2 # mr = 2
- beqz M,.L11_M1
+ beqz M,.L11_M1
nop
-.L80:
+.L80:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
@@ -2312,13 +2312,13 @@
nop
#else
move B, BO
- dsra K,KCO,2
+ dsra K,KCO,2
LD b0, 0*SIZE(B)
MTC $0,t11
MOV t21,t11
LD a0,0*SIZE(A)
-
+
beqz K,.L85
LD a1,1*SIZE(A)
@@ -2336,7 +2336,7 @@
MADD t11,t11,a4,b4
LD a3,5*SIZE(A)
MADD t21,t21,a5,b4
-
+
LD b6, 3*SIZE(B)
LD a6,6*SIZE(A)
MADD t11,t11,a2,b2
@@ -2358,23 +2358,23 @@
.L85: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L88
nop
-.L86:
+.L86:
LD b4, 1*SIZE(B)
LD a4,2*SIZE(A)
MADD t11,t11,a0,b0
LD a5,3*SIZE(A)
MADD t21,t21,a1,b0
-
+
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
-
+
LD b0,0(B)
LD a0,0*SIZE(A)
MADD t11,t11,a4,b4
@@ -2382,16 +2382,16 @@
MADD t21,t21,a5,b4
-
+
.L88: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L89
+ beqz K,.L89
LD ALPHA,152($sp) # Get ALPHA
-
+
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
@@ -2401,7 +2401,7 @@
.L89: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
- LD c21,1*SIZE(CO1)
+ LD c21,1*SIZE(CO1)
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
@@ -2410,7 +2410,7 @@
ST t21,1*SIZE(CO1)
FETCH $0,2*SIZE(CO1)
-
+
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
#else
@@ -2445,10 +2445,10 @@
.align 3
.L11_M1:
andi M,MCO,1 # mr = 1
- beqz M,.L999
+ beqz M,.L999
nop
-.L90:
+.L90:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
@@ -2478,7 +2478,7 @@
move B, BO
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
- dsra K,KCO,2
+ dsra K,KCO,2
beqz K,.L95
MTC $0,t11
#endif
@@ -2487,7 +2487,7 @@
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
-
+
LD a2, 2*SIZE(A)
LD b2, 2*SIZE(B)
MADD t11,t11,a4,b4
@@ -2495,28 +2495,28 @@
LD a6, 3*SIZE(A)
LD b6, 3*SIZE(B)
MADD t11,t11,a2,b2
-
+
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
MADD t11,t11,a6,b6
-
+
daddiu K,K,-1
bnez K,.L91
nop
.L95: # kr=2
#ifndef TRMMKERNEL
- andi K,KCO,2
+ andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L98
nop
-.L96:
+.L96:
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
@@ -2526,14 +2526,14 @@
LD b0,0(B)
LD a0,0(A)
MADD t11,t11,a4,b4
-
+
.L98: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
- beqz K,.L99
+ beqz K,.L99
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S
index 04a48bd..1ba061a 100644
--- a/kernel/mips64/snrm2.S
+++ b/kernel/mips64/snrm2.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -65,7 +65,7 @@
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
@@ -188,7 +188,7 @@
daddiu I, I, -1
cvt.d.s t1, a1
-
+
madd.d s1, s1, t1, t1
bgtz I, .L16
@@ -319,7 +319,7 @@
daddiu I, I, -1
cvt.d.s t1, a1
-
+
daddu X, X, INCX
bgtz I, .L26
@@ -333,5 +333,5 @@
j $31
cvt.s.d s1, s1
-
+
EPILOGUE
diff --git a/kernel/mips64/swap.S b/kernel/mips64/swap.S
index d54abd7..aa786ed 100644
--- a/kernel/mips64/swap.S
+++ b/kernel/mips64/swap.S
@@ -70,7 +70,7 @@
#define b8 $f15
PROLOGUE
-
+
li TEMP, SIZE
NOP
diff --git a/kernel/mips64/symv_L.S b/kernel/mips64/symv_L.S
index 9a54eb7..f67d70c 100644
--- a/kernel/mips64/symv_L.S
+++ b/kernel/mips64/symv_L.S
@@ -91,7 +91,7 @@
PROLOGUE
-
+
LDARG BUFFER, 0($sp)
daddiu $sp, $sp, -32
diff --git a/kernel/mips64/symv_U.S b/kernel/mips64/symv_U.S
index 285e591..5f20876 100644
--- a/kernel/mips64/symv_U.S
+++ b/kernel/mips64/symv_U.S
@@ -89,7 +89,7 @@
PROLOGUE
-
+
LDARG BUFFER, 0($sp)
daddiu $sp, $sp, -32
diff --git a/kernel/mips64/trsm_kernel_LN.S b/kernel/mips64/trsm_kernel_LN.S
index 28e1794..eb07aef 100644
--- a/kernel/mips64/trsm_kernel_LN.S
+++ b/kernel/mips64/trsm_kernel_LN.S
@@ -104,7 +104,7 @@
#define ALPHA $f15
PROLOGUE
-
+
daddiu $sp, $sp, -144
SDARG $16, 0($sp)
@@ -1695,7 +1695,7 @@
bgtz J, .L10
NOP
.align 3
-
+
.L30:
andi J, N, 4
blez J, .L50
diff --git a/kernel/mips64/trsm_kernel_LN_loongson3a.S b/kernel/mips64/trsm_kernel_LN_loongson3a.S
index aba86fb..4df2e43 100644
--- a/kernel/mips64/trsm_kernel_LN_loongson3a.S
+++ b/kernel/mips64/trsm_kernel_LN_loongson3a.S
@@ -70,7 +70,7 @@
#define ALPHA $f15
PROLOGUE
-
+
daddiu $sp, $sp, -144
SDARG $16, 0($sp)
@@ -97,13 +97,13 @@
sdc1 $f23,136($sp)
#endif
# LN compute from bottom to top
- LDARG OFFSET, 144($sp)
+ LDARG OFFSET, 144($sp)
dsll LDC, LDC, BASE_SHIFT # ldc
mult M, K
mflo TEMP # TEMP=MC*KC
- dsll TEMP, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT
daddu A, A, TEMP # A move to the end of sa
dsll TEMP, M, BASE_SHIFT
@@ -129,19 +129,19 @@
MOV t32, t11
MOV t42, t11
- daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai
+ daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai
move AORIG, A # reset A
daddu C, CO4, LDC # fixed pointer C, the write back address
-
- andi I, M, 1 # mr=2,nr=4
+
+ andi I, M, 1 # mr=2,nr=4
blez I, .L50
nop
dsll TEMP, K, BASE_SHIFT # mr=1
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai
- dsll L, KK, BASE_SHIFT # mr=1
+ dsll L, KK, BASE_SHIFT # mr=1
dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
daddu AO, AORIG, L # AO point to the rectangular data part
@@ -163,7 +163,7 @@
LD b1, 0 * SIZE(BO) # get 4b
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
- LD b4, 3 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
dsra L, TEMP, 2
blez L, .L55
@@ -172,7 +172,7 @@
.align 3
.L52:
- LD a5, 1 * SIZE(AO)
+ LD a5, 1 * SIZE(AO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
@@ -206,10 +206,10 @@
MADD t13, t13, a3, b3
MADD t14, t14, a3, b4
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
@@ -230,7 +230,7 @@
andi L, TEMP, 3
blez L, .L58
nop
-
+
.align 3
.L56:
MADD t11, t11, a1, b1 # 3rd compute
@@ -238,10 +238,10 @@
MADD t13, t13, a1, b3
MADD t14, t14, a1, b4
- daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
@@ -303,16 +303,16 @@
MOV t42, t11
-
-.L50:
- andi I, M, 2 # mr=2,nr=4
+
+.L50:
+ andi I, M, 2 # mr=2,nr=4
blez I, .L20
nop
dsll TEMP, K, 1 + BASE_SHIFT
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai
- dsll L, KK, 1 + BASE_SHIFT
+ dsll L, KK, 1 + BASE_SHIFT
dsll TEMP, KK, 2 + BASE_SHIFT
daddu AO, AORIG, L # AO point to the rectangular data part
@@ -335,7 +335,7 @@
LD b1, 0 * SIZE(BO) # get 4b
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
- LD b4, 3 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
dsra L, TEMP, 2
blez L, .L25
@@ -344,7 +344,7 @@
.align 3
.L22:
- LD a5, 2 * SIZE(AO)
+ LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
@@ -392,10 +392,10 @@
MADD t14, t14, a3, b4
MADD t24, t24, a4, b4
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -421,7 +421,7 @@
andi L, TEMP, 3
blez L, .L28
nop
-
+
.align 3
.L26:
MADD t11, t11, a1, b1 # 3rd compute
@@ -433,10 +433,10 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -474,7 +474,7 @@
SUB t24, b8, t24
- LD b1, 3 * SIZE(AO) # computes the triangular_part
+ LD b1, 3 * SIZE(AO) # computes the triangular_part
LD b2, 2 * SIZE(AO)
MUL t21, b1, t21
MUL t22, b1, t22
@@ -484,7 +484,7 @@
NMSUB t12, t12, b2, t22
NMSUB t13, t13, b2, t23
NMSUB t14, t14, b2, t24
-
+
LD b3, 0 * SIZE(AO)
MUL t11, b3, t11
MUL t12, b3, t12
@@ -535,13 +535,13 @@
.L11: # mr=4
dsll TEMP, K, 2 + BASE_SHIFT # TEMP=KC*MR*data_Byte
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai
- dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai
+ dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai
dsll TEMP, KK, 2 + BASE_SHIFT # KK*NR*data_Byte
daddu AO, AORIG, L # AO point to the rectangular data part
daddu BO, B, TEMP
- dsubu TEMP, K, KK
+ dsubu TEMP, K, KK
LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
@@ -551,7 +551,7 @@
LD b1, 0 * SIZE(BO) # get 4b
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
- LD b4, 3 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
MOV t13, t11 # clear result registers
MOV t23, t11
@@ -568,7 +568,7 @@
.align 3
.L12:
- LD a5, 4 * SIZE(AO)
+ LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
@@ -596,7 +596,7 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4
+ MADD t44, t44, a4, b4
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
@@ -626,7 +626,7 @@
MADD t14, t14, a5, b8
MADD t24, t24, a6, b8
MADD t34, t34, a7, b8
- MADD t44, t44, a8, b8
+ MADD t44, t44, a8, b8
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
@@ -656,12 +656,12 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4
+ MADD t44, t44, a4, b4
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -689,12 +689,12 @@
MADD t14, t14, a5, b8
MADD t24, t24, a6, b8
MADD t34, t34, a7, b8
- MADD t44, t44, a8, b8
+ MADD t44, t44, a8, b8
daddiu L, L, -1
bgtz L, .L12
nop
-
+
.align 3
.L15:
@@ -704,7 +704,7 @@
.align 3
.L16:
- MADD t11, t11, a1, b1
+ MADD t11, t11, a1, b1
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
@@ -722,12 +722,12 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4
+ MADD t44, t44, a4, b4
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -743,13 +743,13 @@
.L18: # deal with the triangular data part of panel Ai
- daddiu TEMP, KK, -4 #
+ daddiu TEMP, KK, -4 #
dsll L, TEMP, 2 + BASE_SHIFT
dsll TEMP, TEMP, 2 + BASE_SHIFT
daddu AO, AORIG, L # AO point to the triangular data part
daddu BO, B, TEMP
-
+
LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B
LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part
LD b3, 2 * SIZE(BO)
@@ -764,7 +764,7 @@
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
-
+
SUB t21, b5, t21
SUB t22, b6, t22
SUB t23, b7, t23
@@ -774,12 +774,12 @@
LD b2, 9 * SIZE(BO)
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
-
+
SUB t31, b1, t31
SUB t32, b2, t32
SUB t33, b3, t33
SUB t34, b4, t34
-
+
LD b5, 12 * SIZE(BO)
LD b6, 13 * SIZE(BO)
LD b7, 14 * SIZE(BO)
@@ -792,10 +792,10 @@
LD b1, 15 * SIZE(AO)
- LD b2, 14 * SIZE(AO)
+ LD b2, 14 * SIZE(AO)
LD b4, 13 * SIZE(AO)
LD b7, 12 * SIZE(AO)
-
+
MUL t41, b1, t41
MUL t42, b1, t42
MUL t43, b1, t43
@@ -815,7 +815,7 @@
- LD b3, 10 * SIZE(AO)
+ LD b3, 10 * SIZE(AO)
LD b5, 9 * SIZE(AO)
LD b8, 8 * SIZE(AO)
MUL t31, b3, t31
@@ -852,7 +852,7 @@
MUL t13, b2, t13
MUL t14, b2, t14
- daddiu CO1, CO1, -4 * SIZE # modify
+ daddiu CO1, CO1, -4 * SIZE # modify
daddiu CO2, CO2, -4 * SIZE
daddiu CO3, CO3, -4 * SIZE
daddiu CO4, CO4, -4 * SIZE
@@ -875,7 +875,7 @@
ST t43, 14 * SIZE(BO)
ST t44, 15 * SIZE(BO)
- ST t11, 0 * SIZE(CO1) # write back
+ ST t11, 0 * SIZE(CO1) # write back
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
@@ -916,8 +916,8 @@
bgtz J, .L10
nop
-
-
+
+
.align 3
.L30:
andi J, N, 2 # nr=2
@@ -934,8 +934,8 @@
daddu KK, M, OFFSET
move AORIG, A # reset A
-
- daddu C, CO2, LDC # fixed
+
+ daddu C, CO2, LDC # fixed
andi I, M, 1 # mr=1
blez I, .L60
@@ -968,7 +968,7 @@
.align 3
.L62:
- LD a5, 1 * SIZE(AO)
+ LD a5, 1 * SIZE(AO)
LD b5, 2 * SIZE(BO)
LD b6, 3 * SIZE(BO)
@@ -989,10 +989,10 @@
MADD t11, t11, a3, b3 # 3rd compute
MADD t12, t12, a3, b4
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1002,14 +1002,14 @@
daddiu L, L, -1
bgtz L, .L62
nop
-
+
.align 3
.L65:
andi L, TEMP, 3
blez L, .L68
nop
-
+
.align 3
.L66:
MADD t11, t11, a1, b1 # 3rd compute
@@ -1017,10 +1017,10 @@
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
- daddiu AO, AO, 1 * SIZE # AO += mr
+ daddiu AO, AO, 1 * SIZE # AO += mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1035,14 +1035,14 @@
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu AO, AORIG, L # Ao point to the triangular data part
daddu BO, B, TEMP
-
+
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
SUB t11, b1, t11
SUB t12, b2, t12
-
+
LD b3, 0 * SIZE(AO)
MUL t11, b3, t11
MUL t12, b3, t12
@@ -1101,7 +1101,7 @@
.align 3
.L42:
- LD a5, 2 * SIZE(AO)
+ LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 2 * SIZE(BO)
LD b6, 3 * SIZE(BO)
@@ -1131,10 +1131,10 @@
MADD t12, t12, a3, b4
MADD t22, t22, a4, b4
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1147,14 +1147,14 @@
daddiu L, L, -1
bgtz L, .L42
nop
-
+
.align 3
.L45:
andi L, TEMP, 3
blez L, .L48
nop
-
+
.align 3
.L46:
MADD t11, t11, a1, b1 # 3rd compute
@@ -1162,10 +1162,10 @@
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1181,7 +1181,7 @@
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu AO, AORIG, L # Ao point to the triangular data part
daddu BO, B, TEMP
-
+
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
@@ -1192,13 +1192,13 @@
SUB t21, b3, t21
SUB t22, b4, t22
- LD b1, 3 * SIZE(AO) # computes the triangular_part
+ LD b1, 3 * SIZE(AO) # computes the triangular_part
LD b2, 2 * SIZE(AO)
MUL t21, b1, t21
MUL t22, b1, t22
NMSUB t11, t11, b2, t21
NMSUB t12, t12, b2, t22
-
+
LD b3, 0 * SIZE(AO)
MUL t11, b3, t11
MUL t12, b3, t12
@@ -1260,7 +1260,7 @@
.align 3
.L32:
- LD a5, 4 * SIZE(AO)
+ LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
@@ -1308,10 +1308,10 @@
MADD t32, t32, a3, b4
MADD t42, t42, a4, b4
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1331,30 +1331,30 @@
bgtz L, .L32
nop
-
+
.align 3
.L35:
andi L, TEMP, 3
blez L, .L38
nop
-
+
.align 3
.L36:
MADD t11, t11, a1, b1 # 3rd compute
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
-
+
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1373,7 +1373,7 @@
daddu AO, AORIG, L # AO point to the triangular data part
daddu BO, B, TEMP
-
+
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
@@ -1394,10 +1394,10 @@
LD b1, 15 * SIZE(AO)
- LD b2, 14 * SIZE(AO)
+ LD b2, 14 * SIZE(AO)
LD b4, 13 * SIZE(AO)
LD b7, 12 * SIZE(AO)
-
+
MUL t41, b1, t41
MUL t42, b1, t42
NMSUB t31, t31, b2, t41
@@ -1409,7 +1409,7 @@
- LD b3, 10 * SIZE(AO)
+ LD b3, 10 * SIZE(AO)
LD b5, 9 * SIZE(AO)
LD b8, 8 * SIZE(AO)
MUL t31, b3, t31
@@ -1493,11 +1493,11 @@
dsll TEMP, K, BASE_SHIFT # mr=1
dsubu AORIG, AORIG, TEMP
-
+
dsll L, KK, BASE_SHIFT
daddu AO, AORIG, L # AO point to the rectangular data part
- daddu BO, B, L
+ daddu BO, B, L
dsubu TEMP, K, KK
@@ -1508,10 +1508,10 @@
dsra L, TEMP, 2
blez L, .L95
nop
-
+
.align 3
.L92:
- LD a5, 1 * SIZE(AO)
+ LD a5, 1 * SIZE(AO)
LD b5, 1 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
@@ -1526,10 +1526,10 @@
MADD t11, t11, a3, b3 # 3rd compute
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
MADD t11, t11, a7, b7 # 4th compute
@@ -1537,7 +1537,7 @@
daddiu L, L, -1
bgtz L, .L92
nop
-
+
.align 3
.L95:
@@ -1549,10 +1549,10 @@
.L96:
MADD t11, t11, a1, b1 # 3rd compute
- daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
daddiu L, L, -1
@@ -1584,7 +1584,7 @@
daddiu KK, KK, -1
-.L90:
+.L90:
andi I, M, 2
blez I, .L80
NOP
@@ -1594,7 +1594,7 @@
dsll TEMP, K, 1+BASE_SHIFT # mr=2
dsubu AORIG, AORIG, TEMP
-
+
dsll L, KK, 1 + BASE_SHIFT
dsll TEMP, KK, 0 + BASE_SHIFT
@@ -1611,10 +1611,10 @@
dsra L, TEMP, 2
blez L, .L85
nop
-
+
.align 3
.L82:
- LD a5, 2 * SIZE(AO)
+ LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 1 * SIZE(BO)
@@ -1638,10 +1638,10 @@
MADD t11, t11, a3, b3 # 3rd compute
MADD t21, t21, a4, b3
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1652,7 +1652,7 @@
daddiu L, L, -1
bgtz L, .L82
nop
-
+
.align 3
.L85:
@@ -1665,10 +1665,10 @@
MADD t11, t11, a1, b1 # 3rd compute
MADD t21, t21, a2, b1
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1692,11 +1692,11 @@
SUB t11, b1, t11
SUB t21, b2, t21
- LD b1, 3 * SIZE(AO) # computes the triangular_part
+ LD b1, 3 * SIZE(AO) # computes the triangular_part
LD b2, 2 * SIZE(AO)
MUL t21, b1, t21
NMSUB t11, t11, b2, t21
-
+
LD b3, 0 * SIZE(AO)
MUL t11, b3, t11
@@ -1709,8 +1709,8 @@
ST t21, 1 * SIZE(CO1)
daddiu KK, KK, -2
-
-
+
+
.align 3
.L80:
dsra I, M, 2
@@ -1748,7 +1748,7 @@
.align 3
.L72:
- LD a5, 4 * SIZE(AO)
+ LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
@@ -1784,10 +1784,10 @@
MADD t31, t31, a3, b3
MADD t41, t41, a4, b3
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1802,7 +1802,7 @@
daddiu L, L, -1
bgtz L, .L72
nop
-
+
.align 3
.L75:
@@ -1817,10 +1817,10 @@
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1850,7 +1850,7 @@
SUB t41, b4, t41
LD b1, 15 * SIZE(AO)
- LD b2, 14 * SIZE(AO)
+ LD b2, 14 * SIZE(AO)
LD b4, 13 * SIZE(AO)
LD b7, 12 * SIZE(AO)
MUL t41, b1, t41
@@ -1860,7 +1860,7 @@
- LD b3, 10 * SIZE(AO)
+ LD b3, 10 * SIZE(AO)
LD b5, 9 * SIZE(AO)
LD b8, 8 * SIZE(AO)
MUL t31, b3, t31
diff --git a/kernel/mips64/trsm_kernel_LT.S b/kernel/mips64/trsm_kernel_LT.S
index 824e045..57f48c5 100644
--- a/kernel/mips64/trsm_kernel_LT.S
+++ b/kernel/mips64/trsm_kernel_LT.S
@@ -104,7 +104,7 @@
#define ALPHA $f15
PROLOGUE
-
+
daddiu $sp, $sp, -144
SDARG $16, 0($sp)
@@ -1686,7 +1686,7 @@
bgtz J, .L10
NOP
.align 3
-
+
.L30:
andi J, N, 4
blez J, .L50
diff --git a/kernel/mips64/trsm_kernel_LT_loongson3a.S b/kernel/mips64/trsm_kernel_LT_loongson3a.S
index 4114d94..b06269c 100644
--- a/kernel/mips64/trsm_kernel_LT_loongson3a.S
+++ b/kernel/mips64/trsm_kernel_LT_loongson3a.S
@@ -70,7 +70,7 @@
#define ALPHA $f15
PROLOGUE
-
+
daddiu $sp, $sp, -144
SDARG $16, 0($sp)
@@ -97,7 +97,7 @@
sdc1 $f23,136($sp)
#endif
# LT compute from left to right, top to bottom
- LDARG OFFSET, 144($sp)
+ LDARG OFFSET, 144($sp)
dsll LDC, LDC, BASE_SHIFT # ldc
dsra J, N, 2 # j = nc/4
@@ -122,7 +122,7 @@
dsra I, M, 2 # i = mc/4
move KK, OFFSET # kk is the length of the rectangular data part of panel Ai
- move AO, A # reset A
+ move AO, A # reset A
daddu C, CO4, LDC # fixed pointer C, the write back address
blez I, .L20
nop
@@ -137,7 +137,7 @@
LD b1, 0 * SIZE(B) # get 4b
LD b2, 1 * SIZE(B)
LD b3, 2 * SIZE(B)
- LD b4, 3 * SIZE(B)
+ LD b4, 3 * SIZE(B)
MOV t13, t11 # clear result registers
MOV t23, t11
@@ -155,7 +155,7 @@
.align 3
.L12:
- LD a5, 4 * SIZE(AO)
+ LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
@@ -183,7 +183,7 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4
+ MADD t44, t44, a4, b4
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
@@ -213,7 +213,7 @@
MADD t14, t14, a5, b8
MADD t24, t24, a6, b8
MADD t34, t34, a7, b8
- MADD t44, t44, a8, b8
+ MADD t44, t44, a8, b8
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
@@ -243,12 +243,12 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4
+ MADD t44, t44, a4, b4
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -276,12 +276,12 @@
MADD t14, t14, a5, b8
MADD t24, t24, a6, b8
MADD t34, t34, a7, b8
- MADD t44, t44, a8, b8
+ MADD t44, t44, a8, b8
daddiu L, L, -1
bgtz L, .L12
nop
-
+
.align 3
.L15:
@@ -291,7 +291,7 @@
.align 3
.L16:
- MADD t11, t11, a1, b1
+ MADD t11, t11, a1, b1
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
@@ -309,12 +309,12 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4
+ MADD t44, t44, a4, b4
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -344,7 +344,7 @@
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
-
+
SUB t21, b5, t21
SUB t22, b6, t22
SUB t23, b7, t23
@@ -354,12 +354,12 @@
LD b2, 9 * SIZE(BO)
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
-
+
SUB t31, b1, t31
SUB t32, b2, t32
SUB t33, b3, t33
SUB t34, b4, t34
-
+
LD b5, 12 * SIZE(BO)
LD b6, 13 * SIZE(BO)
LD b7, 14 * SIZE(BO)
@@ -371,7 +371,7 @@
SUB t44, b8, t44
- LD a1, 0 * SIZE(AO) # sa stores in col major
+ LD a1, 0 * SIZE(AO) # sa stores in col major
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -392,7 +392,7 @@
NMSUB t43, t43, a4, t13
NMSUB t44, t44, a4, t14
-
+
LD a5, 5 * SIZE(AO)
LD a6, 6 * SIZE(AO)
LD a7, 7 * SIZE(AO)
@@ -445,7 +445,7 @@
ST t43, 14 * SIZE(BO)
ST t44, 15 * SIZE(BO)
- ST t11, 0 * SIZE(CO1) # write back
+ ST t11, 0 * SIZE(CO1) # write back
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
@@ -491,7 +491,7 @@
.align 3
.L20:
- andi I, M, 2 # mr=2,nr=4
+ andi I, M, 2 # mr=2,nr=4
blez I, .L50
nop
@@ -510,7 +510,7 @@
LD b1, 0 * SIZE(B) # get 4b
LD b2, 1 * SIZE(B)
LD b3, 2 * SIZE(B)
- LD b4, 3 * SIZE(B)
+ LD b4, 3 * SIZE(B)
dsra L, KK, 2
blez L, .L25
@@ -519,7 +519,7 @@
.align 3
.L22:
- LD a5, 2 * SIZE(AO)
+ LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
@@ -567,10 +567,10 @@
MADD t14, t14, a3, b4
MADD t24, t24, a4, b4
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -596,7 +596,7 @@
andi L, KK, 3
blez L, .L28
nop
-
+
.align 3
.L26:
MADD t11, t11, a1, b1 # 3rd compute
@@ -608,10 +608,10 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -643,7 +643,7 @@
SUB t24, b8, t24
- LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
LD b2, 1 * SIZE(AO)
MUL t11, b1, t11
MUL t12, b1, t12
@@ -653,7 +653,7 @@
NMSUB t22, t22, b2, t12
NMSUB t23, t23, b2, t13
NMSUB t24, t24, b2, t14
-
+
LD b3, 3 * SIZE(AO)
MUL t21, b3, t21
MUL t22, b3, t22
@@ -705,7 +705,7 @@
.align 3
.L50:
- andi I, M, 1 # mr=1,nr=4
+ andi I, M, 1 # mr=1,nr=4
blez I, .L29
nop
@@ -723,7 +723,7 @@
LD b1, 0 * SIZE(B) # get 4b
LD b2, 1 * SIZE(B)
LD b3, 2 * SIZE(B)
- LD b4, 3 * SIZE(B)
+ LD b4, 3 * SIZE(B)
dsra L, KK, 2
blez L, .L55
@@ -732,7 +732,7 @@
.align 3
.L52:
- LD a5, 1 * SIZE(AO)
+ LD a5, 1 * SIZE(AO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
@@ -765,10 +765,10 @@
MADD t13, t13, a3, b3
MADD t14, t14, a3, b4
- daddiu AO, AO, 4 * SIZE # AO += mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
@@ -789,7 +789,7 @@
andi L, KK, 3
blez L, .L58
nop
-
+
.align 3
.L56:
MADD t11, t11, a1, b1 # 3rd compute
@@ -797,10 +797,10 @@
MADD t13, t13, a1, b3
MADD t14, t14, a1, b4
- daddiu AO, AO, 1 * SIZE # AO += 2mr
+ daddiu AO, AO, 1 * SIZE # AO += 2mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
@@ -823,7 +823,7 @@
SUB t14, b4, t14
- LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
MUL t11, b1, t11
MUL t12, b1, t12
MUL t13, b1, t13
@@ -858,8 +858,8 @@
move B, BO # fixed panel Bj
bgtz J, .L10
nop
-
-
+
+
.align 3
.L30:
andi J, N, 2 # nr=2
@@ -874,9 +874,9 @@
MOV t31, t11
MOV t41, t11
- move KK, OFFSET
+ move KK, OFFSET
move AO, A # reset A
- daddu C, CO2, LDC # fixed
+ daddu C, CO2, LDC # fixed
dsra I, M, 2 # I = mc/4
blez I, .L40
@@ -902,7 +902,7 @@
.align 3
.L32:
- LD a5, 4 * SIZE(AO)
+ LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
@@ -950,10 +950,10 @@
MADD t32, t32, a3, b4
MADD t42, t42, a4, b4
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -973,30 +973,30 @@
bgtz L, .L32
nop
-
+
.align 3
.L35:
andi L, KK, 3
blez L, .L38
nop
-
+
.align 3
.L36:
MADD t11, t11, a1, b1 # 3rd compute
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
-
+
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1027,7 +1027,7 @@
SUB t41, b7, t41
SUB t42, b8, t42
- LD a1, 0 * SIZE(AO) # sa stores in col major
+ LD a1, 0 * SIZE(AO) # sa stores in col major
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1040,7 +1040,7 @@
NMSUB t41, t41, a4, t11
NMSUB t42, t42, a4, t12
-
+
LD a5, 5 * SIZE(AO)
LD a6, 6 * SIZE(AO)
LD a7, 7 * SIZE(AO)
@@ -1091,7 +1091,7 @@
daddu AO, AO, L # move AO to the end of Ai
daddu BO, BO, TEMP
- daddiu KK, KK, 4 #
+ daddiu KK, KK, 4 #
MTC $0, a1
MOV t11, a1
@@ -1122,12 +1122,12 @@
dsra L, KK, 2
blez L, .L45
- move BO, B # reset B
+ move BO, B # reset B
.align 3
.L42:
- LD a5, 2 * SIZE(AO)
+ LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 2 * SIZE(BO)
LD b6, 3 * SIZE(BO)
@@ -1157,10 +1157,10 @@
MADD t12, t12, a3, b4
MADD t22, t22, a4, b4
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1173,14 +1173,14 @@
daddiu L, L, -1
bgtz L, .L42
nop
-
+
.align 3
.L45:
andi L, KK, 3
blez L, .L48
nop
-
+
.align 3
.L46:
MADD t11, t11, a1, b1 # 3rd compute
@@ -1188,10 +1188,10 @@
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1211,13 +1211,13 @@
SUB t21, b3, t21
SUB t22, b4, t22
- LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
LD b2, 1 * SIZE(AO)
MUL t11, b1, t11
MUL t12, b1, t12
NMSUB t21, t21, b2, t11
NMSUB t22, t22, b2, t12
-
+
LD b3, 3 * SIZE(AO)
MUL t21, b3, t21
MUL t22, b3, t22
@@ -1266,12 +1266,12 @@
dsra L, KK, 2
blez L, .L65
- move BO, B # reset B
+ move BO, B # reset B
.align 3
.L62:
- LD a5, 1 * SIZE(AO)
+ LD a5, 1 * SIZE(AO)
LD b5, 2 * SIZE(BO)
LD b6, 3 * SIZE(BO)
@@ -1292,10 +1292,10 @@
MADD t11, t11, a3, b3 # 3rd compute
MADD t12, t12, a3, b4
- daddiu AO, AO, 4 * SIZE # AO += mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1305,23 +1305,23 @@
daddiu L, L, -1
bgtz L, .L62
nop
-
+
.align 3
.L65:
andi L, KK, 3
blez L, .L68
nop
-
+
.align 3
.L66:
MADD t11, t11, a1, b1 # 3rd compute
MADD t12, t12, a1, b2
- daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1336,7 +1336,7 @@
SUB t11, b1, t11
SUB t12, b2, t12
- LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
MUL t11, b1, t11
MUL t12, b1, t12
@@ -1360,7 +1360,7 @@
.align 3
.L49:
move B, BO
-
+
.align 3
.L70:
@@ -1396,7 +1396,7 @@
.align 3
.L72:
- LD a5, 4 * SIZE(AO)
+ LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
@@ -1432,10 +1432,10 @@
MADD t31, t31, a3, b3
MADD t41, t41, a4, b3
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1450,7 +1450,7 @@
daddiu L, L, -1
bgtz L, .L72
nop
-
+
.align 3
.L75:
@@ -1465,10 +1465,10 @@
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1490,7 +1490,7 @@
SUB t31, b3, t31
SUB t41, b4, t41
- LD a1, 0 * SIZE(AO) # sa stores in col major
+ LD a1, 0 * SIZE(AO) # sa stores in col major
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1556,10 +1556,10 @@
dsra L, KK, 2
blez L, .L85
move BO, B
-
+
.align 3
.L82:
- LD a5, 2 * SIZE(AO)
+ LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 1 * SIZE(BO)
@@ -1583,10 +1583,10 @@
MADD t11, t11, a3, b3 # 3rd compute
MADD t21, t21, a4, b3
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1597,7 +1597,7 @@
daddiu L, L, -1
bgtz L, .L82
nop
-
+
.align 3
.L85:
@@ -1610,10 +1610,10 @@
MADD t11, t11, a1, b1 # 3rd compute
MADD t21, t21, a2, b1
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1630,11 +1630,11 @@
SUB t11, b1, t11
SUB t21, b2, t21
- LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
LD b2, 1 * SIZE(AO)
MUL t11, b1, t11
NMSUB t21, t21, b2, t11
-
+
LD b3, 3 * SIZE(AO)
MUL t21, b3, t21
@@ -1646,7 +1646,7 @@
daddiu CO1, CO1, 2 * SIZE
-
+
dsubu TEMP, K, KK
dsll L, TEMP, 1 + BASE_SHIFT
dsll TEMP, TEMP, 0 + BASE_SHIFT
@@ -1655,7 +1655,7 @@
daddiu KK, KK, 2
-
+
.align 3
.L90:
andi I, M, 1 # mr=1
@@ -1670,10 +1670,10 @@
dsra L, KK, 2
blez L, .L95
move BO, B
-
+
.align 3
.L92:
- LD a5, 1 * SIZE(AO)
+ LD a5, 1 * SIZE(AO)
LD b5, 1 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
@@ -1688,10 +1688,10 @@
MADD t11, t11, a3, b3 # 3rd compute
- daddiu AO, AO, 4 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
MADD t11, t11, a7, b7 # 4th compute
@@ -1699,7 +1699,7 @@
daddiu L, L, -1
bgtz L, .L92
nop
-
+
.align 3
.L95:
andi L, KK, 3
@@ -1710,10 +1710,10 @@
.L96:
MADD t11, t11, a1, b1 # 3rd compute
- daddiu AO, AO, 1 * SIZE # AO += 2mr
+ daddiu AO, AO, 1 * SIZE # AO += 2mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
-
- LD a1, 0 * SIZE(AO) # next
+
+ LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
daddiu L, L, -1
@@ -1726,16 +1726,16 @@
SUB t11, b1, t11
- LD b1, 0 * SIZE(AO) # computes the triangular_part
+ LD b1, 0 * SIZE(AO) # computes the triangular_part
MUL t11, b1, t11
-
+
ST t11, 0 * SIZE(BO)
ST t11, 0 * SIZE(CO1)
daddiu CO1, CO1, 1 * SIZE
-
+
dsubu TEMP, K, KK
dsll L, TEMP, BASE_SHIFT
dsll TEMP, TEMP, BASE_SHIFT
diff --git a/kernel/mips64/trsm_kernel_RN_loongson3a.S b/kernel/mips64/trsm_kernel_RN_loongson3a.S
index 790d7c9..0827bf7 100644
--- a/kernel/mips64/trsm_kernel_RN_loongson3a.S
+++ b/kernel/mips64/trsm_kernel_RN_loongson3a.S
@@ -70,7 +70,7 @@
#define t44 $f25
PROLOGUE
-
+
daddiu $sp, $sp, -144
SDARG $16, 0($sp)
@@ -104,19 +104,19 @@
neg KK, OFFSET # for RN OFFSET always 0
- dsra J, N, 2 # J = NC/4
+ dsra J, N, 2 # J = NC/4
blez J, .L30
NOP
.L10:
daddiu J, J, -1
-
+
move CO1, C
daddu CO2, C, LDC
daddu CO3, CO2, LDC
daddu CO4, CO3, LDC
-
- move AO, A # A is the retangular matrix and B is the trigular matrix
+
+ move AO, A # A is the retangular matrix and B is the trigular matrix
daddu C, CO4, LDC # Fixed pointer C
dsra I, M, 2 # I=MC/4
@@ -139,14 +139,14 @@
MOV t23, t11
MOV t33, t11
MOV t43, t11
-
+
MOV t14, t11
MOV t24, t11
MOV t34, t11
MOV t44, t11
-
+
LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
- LD a2, 1 * SIZE(AO) # get 4 a
+ LD a2, 1 * SIZE(AO) # get 4 a
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -188,7 +188,7 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4 # fisrt
+ MADD t44, t44, a4, b4 # fisrt
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
@@ -250,9 +250,9 @@
MADD t34, t34, a3, b4
MADD t44, t44, a4, b4 # third
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -313,11 +313,11 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4
+ MADD t44, t44, a4, b4
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 4 * SIZE # BP += 4nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -337,7 +337,7 @@
.L18: # .L18 always deal with the trigular data part
LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
LD b2, 1 * SIZE(AO) # Fixed results
- LD b3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
LD b4, 3 * SIZE(AO) # sa stored as col major
SUB t11, b1, t11
@@ -398,7 +398,7 @@
NMSUB t34, t34, b4, t31
NMSUB t44, t44, b4, t41
-
+
LD b5, 5 * SIZE(BO)
LD b6, 6 * SIZE(BO)
LD b7, 7 * SIZE(BO)
@@ -463,17 +463,17 @@
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
-
+
ST t13, 0 * SIZE(CO3)
ST t23, 1 * SIZE(CO3)
ST t33, 2 * SIZE(CO3)
ST t43, 3 * SIZE(CO3)
-
+
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
ST t34, 2 * SIZE(CO4)
@@ -496,7 +496,7 @@
NOP
.align 3
-.L20:
+.L20:
andi I, M, 2 # mr=2
blez I, .L50
nop
@@ -515,14 +515,14 @@
MOV t23, t11
MOV t33, t11
MOV t43, t11
-
+
MOV t14, t11
MOV t24, t11
MOV t34, t11
MOV t44, t11
-
+
LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
- LD a2, 1 * SIZE(AO) # get 4 a
+ LD a2, 1 * SIZE(AO) # get 4 a
LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
LD b2, 1 * SIZE(B) # get 4 b
@@ -594,9 +594,9 @@
MADD t14, t14, a3, b4
MADD t24, t24, a4, b4
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -641,9 +641,9 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 4 * SIZE # BP += 4nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -698,7 +698,7 @@
NMSUB t14, t14, b4, t11
NMSUB t24, t24, b4, t21
-
+
LD b5, 5 * SIZE(BO)
LD b6, 6 * SIZE(BO)
LD b7, 7 * SIZE(BO)
@@ -741,13 +741,13 @@
ST t11, 0 * SIZE(CO1) # write back results
ST t21, 1 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
-
+
ST t13, 0 * SIZE(CO3)
ST t23, 1 * SIZE(CO3)
-
+
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
@@ -764,7 +764,7 @@
daddu BO, BO, TEMP # move BO to the end of this panel
.align 3
-.L50:
+.L50:
andi I, M, 1 # mr=1
blez I, .L29
nop
@@ -783,12 +783,12 @@
MOV t23, t11
MOV t33, t11
MOV t43, t11
-
+
MOV t14, t11
MOV t24, t11
MOV t34, t11
MOV t44, t11
-
+
LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
@@ -837,9 +837,9 @@
MADD t13, t13, a3, b3
MADD t14, t14, a3, b4
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -869,9 +869,9 @@
MADD t13, t13, a1, b3
MADD t14, t14, a1, b4
- daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
daddiu BO, BO, 4 * SIZE # BP += 4nr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -907,7 +907,7 @@
NMSUB t13, t13, b3, t11
NMSUB t14, t14, b4, t11
-
+
LD b5, 5 * SIZE(BO)
LD b6, 6 * SIZE(BO)
LD b7, 7 * SIZE(BO)
@@ -954,13 +954,13 @@
.align 3
.L29:
move B, BO # change to next panel of Bj
- daddiu KK, KK, 4 # rectangular data length increase by 4
+ daddiu KK, KK, 4 # rectangular data length increase by 4
bgtz J, .L10
NOP
.align 3
-
+
.L30:
andi J, N, 2
blez J, .L70
@@ -968,8 +968,8 @@
move CO1, C
daddu CO2, C, LDC
-
- move AO, A # A is the retangular matrix and B is the trigular matrix
+
+ move AO, A # A is the retangular matrix and B is the trigular matrix
daddu C, CO2, LDC # Fixed pointer C
dsra I, M, 2 # I=MC/4
@@ -989,7 +989,7 @@
MOV t42, t11
LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
- LD a2, 1 * SIZE(AO) # get 4 a
+ LD a2, 1 * SIZE(AO) # get 4 a
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1055,9 +1055,9 @@
MADD t32, t32, a3, b4
MADD t42, t42, a4, b4
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -1098,9 +1098,9 @@
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 2 * SIZE # BP += 2nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -1118,7 +1118,7 @@
.L38: # .L38 always deal with the trigular data part
LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
LD b2, 1 * SIZE(AO) # Fixed results
- LD b3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
LD b4, 3 * SIZE(AO) # sa stored as col major
SUB t11, b1, t11
@@ -1147,7 +1147,7 @@
NMSUB t22, t22, b2, t21
NMSUB t32, t32, b2, t31
NMSUB t42, t42, b2, t41
-
+
LD b5, 3 * SIZE(BO)
MUL t12, b5, t12
MUL t22, b5, t22
@@ -1169,7 +1169,7 @@
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
@@ -1201,7 +1201,7 @@
MOV t22, t11
LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
- LD a2, 1 * SIZE(AO) # get 4 a
+ LD a2, 1 * SIZE(AO) # get 4 a
LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
LD b2, 1 * SIZE(B) # get 4 b
@@ -1241,9 +1241,9 @@
MADD t12, t12, a3, b4
MADD t22, t22, a4, b4
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1271,9 +1271,9 @@
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 2 * SIZE # BP += 2nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1305,7 +1305,7 @@
MUL t21, b1, t21
NMSUB t12, t12, b2, t11
NMSUB t22, t22, b2, t21
-
+
LD b5, 3 * SIZE(BO)
MUL t12, b5, t12
MUL t22, b5, t22
@@ -1371,9 +1371,9 @@
MADD t11, t11, a3, b3
MADD t12, t12, a3, b4
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1396,9 +1396,9 @@
MADD t11, t11, a1, b1
MADD t12, t12, a1, b2
- daddiu AO, AO, 1 * SIZE # AO += mr
+ daddiu AO, AO, 1 * SIZE # AO += mr
daddiu BO, BO, 2 * SIZE # BP += 2nr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
@@ -1421,7 +1421,7 @@
LD b2, 1 * SIZE(BO)
MUL t11, b1, t11
NMSUB t12, t12, b2, t11
-
+
LD b5, 3 * SIZE(BO)
MUL t12, b5, t12
@@ -1445,7 +1445,7 @@
.align 3
.L39:
move B, BO # change to next panel of Bj
- daddiu KK, KK, 2 # rectangular data length increase by 4
+ daddiu KK, KK, 2 # rectangular data length increase by 4
@@ -1473,7 +1473,7 @@
MOV t41, t11
LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
- LD a2, 1 * SIZE(AO) # get 4 a
+ LD a2, 1 * SIZE(AO) # get 4 a
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
@@ -1520,9 +1520,9 @@
MADD t31, t31, a3, b3
MADD t41, t41, a4, b3
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -1552,9 +1552,9 @@
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 1 * SIZE # BP += 1nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -1571,7 +1571,7 @@
.L78: # .L78 always deal with the trigular data part
LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
LD b2, 1 * SIZE(AO) # Fixed results
- LD b3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
LD b4, 3 * SIZE(AO) # sa stored as col major
SUB t11, b1, t11
@@ -1596,7 +1596,7 @@
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
-
+
daddiu CO1, CO1, 4 * SIZE # fixed address
@@ -1621,7 +1621,7 @@
MOV t21, t11
LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
- LD a2, 1 * SIZE(AO) # get 4 a
+ LD a2, 1 * SIZE(AO) # get 4 a
LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
@@ -1654,9 +1654,9 @@
MADD t11, t11, a3, b3
MADD t21, t21, a4, b3
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -1680,9 +1680,9 @@
MADD t11, t11, a1, b1
MADD t21, t21, a2, b1
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 1 * SIZE # BP += 1nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -1712,7 +1712,7 @@
ST t11, 0 * SIZE(CO1) # write back results
ST t21, 1 * SIZE(CO1)
-
+
daddiu CO1, CO1, 2 * SIZE # fixed address
@@ -1754,9 +1754,9 @@
MADD t11, t11, a3, b3
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1776,9 +1776,9 @@
.L96:
MADD t11, t11, a1, b1
- daddiu AO, AO, 1 * SIZE # AO += 2mr
+ daddiu AO, AO, 1 * SIZE # AO += 2mr
daddiu BO, BO, 1 * SIZE # BP += 1nr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1801,7 +1801,7 @@
ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
ST t11, 0 * SIZE(CO1) # write back results
-
+
daddiu CO1, CO1, 1 * SIZE # fixed address
diff --git a/kernel/mips64/trsm_kernel_RT.S b/kernel/mips64/trsm_kernel_RT.S
index 81bbfec..adfe081 100644
--- a/kernel/mips64/trsm_kernel_RT.S
+++ b/kernel/mips64/trsm_kernel_RT.S
@@ -104,7 +104,7 @@
#define ALPHA $f15
PROLOGUE
-
+
daddiu $sp, $sp, -144
SDARG $16, 0($sp)
@@ -3495,7 +3495,7 @@
bgtz J, .L10
NOP
.align 3
-
+
.L999:
diff --git a/kernel/mips64/trsm_kernel_RT_loongson3a.S b/kernel/mips64/trsm_kernel_RT_loongson3a.S
index cf20cf9..f37611d 100644
--- a/kernel/mips64/trsm_kernel_RT_loongson3a.S
+++ b/kernel/mips64/trsm_kernel_RT_loongson3a.S
@@ -70,7 +70,7 @@
#define t44 $f25
PROLOGUE
-
+
daddiu $sp, $sp, -144
SDARG $16, 0($sp)
@@ -110,7 +110,7 @@
# Be carefull B has no effeck of mc!!
mult N, LDC
mflo TEMP
- daddu C, C, TEMP # C point to the last colum of blockB
+ daddu C, C, TEMP # C point to the last colum of blockB
dsubu KK, K, OFFSET # KC-KK is the length of rectangular data part of Bj
@@ -126,17 +126,17 @@
move CO1, C
move AORIG, A
-
+
dsra I, M, 2
blez I, .L80
NOP
.L31: # mr=4,nr=1
- dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
dsll TEMP, KK, BASE_SHIFT # nr=1
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 4 results registers
MOV t21, t11
@@ -153,7 +153,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L35
NOP
-
+
.align 3
.L32:
@@ -193,9 +193,9 @@
MADD t31, t31, a3, b3
MADD t41, t41, a4, b3
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -227,9 +227,9 @@
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 1 * SIZE # BO += 2nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -277,11 +277,11 @@
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
-
+
daddiu CO1, CO1, 4 * SIZE # fixed pointer
- dsll TEMP, K, 2 + BASE_SHIFT
+ dsll TEMP, K, 2 + BASE_SHIFT
daddu AORIG, AORIG, TEMP # move to next panel Ai
daddiu I, I, -1
@@ -295,11 +295,11 @@
blez I, .L90
nop
- dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
dsll TEMP, KK, BASE_SHIFT # nr=1
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 4 results registers
MOV t21, t11
@@ -312,7 +312,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L85
NOP
-
+
.align 3
.L82:
@@ -340,9 +340,9 @@
MADD t11, t11, a3, b3
MADD t21, t21, a4, b3
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -368,9 +368,9 @@
MADD t11, t11, a1, b1
MADD t21, t21, a2, b1
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -406,11 +406,11 @@
ST t11, 0 * SIZE(CO1) # write back
ST t21, 1 * SIZE(CO1)
-
+
daddiu CO1, CO1, 2 * SIZE # fixed pointer
- dsll TEMP, K, 1 + BASE_SHIFT
+ dsll TEMP, K, 1 + BASE_SHIFT
daddu AORIG, AORIG, TEMP # move to next panel Ai
@@ -420,11 +420,11 @@
blez I, .L39
nop
- dsll L, KK, BASE_SHIFT # mr=1
+ dsll L, KK, BASE_SHIFT # mr=1
dsll TEMP, KK, BASE_SHIFT # nr=1
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 4 results registers
@@ -434,7 +434,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L95
NOP
-
+
.align 3
.L92:
@@ -453,9 +453,9 @@
MADD t11, t11, a3, b3
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -477,9 +477,9 @@
.L96:
MADD t11, t11, a1, b1
- daddiu AO, AO, 1 * SIZE # AO += 2mr
+ daddiu AO, AO, 1 * SIZE # AO += 2mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -508,11 +508,11 @@
ST t11, 0 * SIZE(AO) # updata packed A
ST t11, 0 * SIZE(CO1) # write back
-
+
daddiu CO1, CO1, 1 * SIZE # fixed pointer
- dsll TEMP, K, BASE_SHIFT
+ dsll TEMP, K, BASE_SHIFT
daddu AORIG, AORIG, TEMP # move to next panel Ai
@@ -529,7 +529,7 @@
dsll TEMP, K, 1 + BASE_SHIFT # Kc*2nr move B to the beginning address of Bj
dsubu B, B, TEMP
- dsll TEMP, LDC, 1 # C
+ dsll TEMP, LDC, 1 # C
dsubu C, C, TEMP
move CO1, C
@@ -542,11 +542,11 @@
NOP
.L51: # mr=4,nr=2
- dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 8 results registers
MOV t21, t11
@@ -568,7 +568,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L55
NOP
-
+
.align 3
.L52:
@@ -626,9 +626,9 @@
MADD t32, t32, a3, b4
MADD t42, t42, a4, b4
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -671,9 +671,9 @@
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -749,16 +749,16 @@
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
-
+
daddiu CO1, CO1, 4 * SIZE # fixed pointer
daddiu CO2, CO2, 4 * SIZE
- dsll TEMP, K, 2 + BASE_SHIFT
+ dsll TEMP, K, 2 + BASE_SHIFT
daddu AORIG, AORIG, TEMP # move to next panel Ai
daddiu I, I, -1
@@ -773,11 +773,11 @@
blez I, .L70
nop
- dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 8 results registers
MOV t21, t11
@@ -793,7 +793,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L65
NOP
-
+
.align 3
.L62:
@@ -833,9 +833,9 @@
MADD t12, t12, a3, b4
MADD t22, t22, a4, b4
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -868,9 +868,9 @@
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -921,10 +921,10 @@
ST t11, 0 * SIZE(CO1) # write back
ST t21, 1 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
-
+
daddiu CO1, CO1, 2 * SIZE # fixed pointer
daddiu CO2, CO2, 2 * SIZE
@@ -939,11 +939,11 @@
blez I, .L59
nop
- dsll L, KK, BASE_SHIFT # mr=1
+ dsll L, KK, BASE_SHIFT # mr=1
dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 8 results registers
MOV t12, t11
@@ -956,7 +956,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L75
NOP
-
+
.align 3
.L72:
@@ -984,9 +984,9 @@
MADD t11, t11, a3, b3
MADD t12, t12, a3, b4
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1012,9 +1012,9 @@
MADD t11, t11, a1, b1
MADD t12, t12, a1, b2
- daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1055,7 +1055,7 @@
ST t11, 0 * SIZE(CO1) # write back
ST t12, 0 * SIZE(CO2)
-
+
daddiu CO1, CO1, 1 * SIZE # fixed pointer
daddiu CO2, CO2, 1 * SIZE
@@ -1070,13 +1070,13 @@
.align 3
.L50:
- dsra J, N, 2 # J = NC/4
+ dsra J, N, 2 # J = NC/4
blez J, .L999
NOP
.L10:
dsll TEMP, K, 2 + BASE_SHIFT
- dsubu B, B, TEMP # move B to the beginning address of Bj
+ dsubu B, B, TEMP # move B to the beginning address of Bj
dsll TEMP, LDC, 2
dsubu C, C, TEMP # move C to the beginning address of Cj
@@ -1087,7 +1087,7 @@
daddu CO2, C, LDC
daddu CO3, CO2, LDC
daddu CO4, CO3, LDC
-
+
move AORIG, A # reset A
dsra I, M, 2 # I=MC/4
@@ -1096,11 +1096,11 @@
.align 3
.L11:
- dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 16 results registers
MOV t21, t11
@@ -1132,7 +1132,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L15
NOP
-
+
.align 3
.L12:
@@ -1164,7 +1164,7 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
- MADD t44, t44, a4, b4 # fisrt
+ MADD t44, t44, a4, b4 # fisrt
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
@@ -1226,9 +1226,9 @@
MADD t34, t34, a3, b4
MADD t44, t44, a4, b4 # third
- daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -1293,9 +1293,9 @@
MADD t34, t34, a3, b4
MADD t44, t44, a4, b4 # third
- daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
@@ -1381,7 +1381,7 @@
NMSUB t31, t31, b4, t34
NMSUB t41, t41, b4, t44
-
+
LD b5, 10 * SIZE(BO)
LD b6, 9 * SIZE(BO)
LD b7, 8 * SIZE(BO)
@@ -1442,17 +1442,17 @@
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
-
+
ST t13, 0 * SIZE(CO3)
ST t23, 1 * SIZE(CO3)
ST t33, 2 * SIZE(CO3)
ST t43, 3 * SIZE(CO3)
-
+
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
ST t34, 2 * SIZE(CO4)
@@ -1463,7 +1463,7 @@
daddiu CO3, CO3, 4 * SIZE
daddiu CO4, CO4, 4 * SIZE
- dsll TEMP, K, 2 + BASE_SHIFT
+ dsll TEMP, K, 2 + BASE_SHIFT
daddu AORIG, AORIG, TEMP # move to next panel Ai
daddiu I, I, -1
@@ -1476,11 +1476,11 @@
blez I, .L40
NOP
- dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 8 results registers
MOV t21, t11
@@ -1502,7 +1502,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L25
NOP
-
+
.align 3
.L22:
@@ -1566,9 +1566,9 @@
MADD t14, t14, a3, b4
MADD t24, t24, a4, b4
- daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -1615,9 +1615,9 @@
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
- daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
+
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
@@ -1677,7 +1677,7 @@
NMSUB t11, t11, b4, t14
NMSUB t21, t21, b4, t24
-
+
LD b5, 10 * SIZE(BO)
LD b6, 9 * SIZE(BO)
LD b7, 8 * SIZE(BO)
@@ -1716,13 +1716,13 @@
ST t11, 0 * SIZE(CO1) # write back
ST t21, 1 * SIZE(CO1)
-
+
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
-
+
ST t13, 0 * SIZE(CO3)
ST t23, 1 * SIZE(CO3)
-
+
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
@@ -1741,11 +1741,11 @@
blez I, .L29
NOP
- dsll L, KK, BASE_SHIFT # mr=1
+ dsll L, KK, BASE_SHIFT # mr=1
dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
- daddu AO, AORIG, L
+ daddu AO, AORIG, L
daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
- dsubu TEMP, K, KK # temp = the length of rectangular data part
+ dsubu TEMP, K, KK # temp = the length of rectangular data part
MTC $0, t11 # clear 4 results registers
MOV t12, t11
@@ -1762,7 +1762,7 @@
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L45
NOP
-
+
.align 3
.L42:
@@ -1802,9 +1802,9 @@
MADD t13, t13, a3, b3
MADD t14, t14, a3, b4
- daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1836,9 +1836,9 @@
MADD t13, t13, a1, b3
MADD t14, t14, a1, b4
- daddiu AO, AO, 1 * SIZE # AO += 2mr
+ daddiu AO, AO, 1 * SIZE # AO += 2mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
-
+
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
@@ -1879,7 +1879,7 @@
NMSUB t12, t12, b3, t14
NMSUB t11, t11, b4, t14
-
+
LD b5, 10 * SIZE(BO)
LD b6, 9 * SIZE(BO)
LD b7, 8 * SIZE(BO)
diff --git a/kernel/mips64/zamax.S b/kernel/mips64/zamax.S
index e993867..4a83629 100644
--- a/kernel/mips64/zamax.S
+++ b/kernel/mips64/zamax.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -70,7 +70,7 @@
#define s4 $f3
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/zamin.S b/kernel/mips64/zamin.S
index bd1d509..0e3323b 100644
--- a/kernel/mips64/zamin.S
+++ b/kernel/mips64/zamin.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -70,7 +70,7 @@
#define s4 $f3
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/zasum.S b/kernel/mips64/zasum.S
index d6dc205..cd22f98 100644
--- a/kernel/mips64/zasum.S
+++ b/kernel/mips64/zasum.S
@@ -42,7 +42,7 @@
#define N $4
#define X $5
#define INCX $6
-
+
#define I $2
#define TEMP $3
@@ -64,7 +64,7 @@
#define s2 $f1
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/zaxpy.S b/kernel/mips64/zaxpy.S
index 8a7b29a..e7fa22e 100644
--- a/kernel/mips64/zaxpy.S
+++ b/kernel/mips64/zaxpy.S
@@ -86,7 +86,7 @@
#endif
PROLOGUE
-
+
LDARG INCY, 0($sp)
li TEMP, 2 * SIZE
@@ -95,7 +95,7 @@
sdc1 $f20, 0($sp)
sdc1 $f21, 8($sp)
#endif
-
+
blez N, .L999
dsll INCX, INCX, ZBASE_SHIFT
diff --git a/kernel/mips64/zcopy.S b/kernel/mips64/zcopy.S
index 5a4ce9c..8b31772 100644
--- a/kernel/mips64/zcopy.S
+++ b/kernel/mips64/zcopy.S
@@ -44,7 +44,7 @@
#define INCX $6
#define Y $7
#define INCY $8
-
+
#define I $2
#define TEMP $3
@@ -58,7 +58,7 @@
#define a8 $f7
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/zdot.S b/kernel/mips64/zdot.S
index c50fe31..c24dd12 100644
--- a/kernel/mips64/zdot.S
+++ b/kernel/mips64/zdot.S
@@ -44,7 +44,7 @@
#define INCX $6
#define Y $7
#define INCY $8
-
+
#define I $2
#define TEMP $3
@@ -62,9 +62,9 @@
#define s3 $f2
#define s4 $f3
-
+
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/zgemm3m_kernel.S b/kernel/mips64/zgemm3m_kernel.S
index 14bb746..d3fbcf1 100644
--- a/kernel/mips64/zgemm3m_kernel.S
+++ b/kernel/mips64/zgemm3m_kernel.S
@@ -106,7 +106,7 @@
#define ALPHA_I $f16
PROLOGUE
-
+
daddiu $sp, $sp, -128
SDARG $16, 0($sp)
@@ -859,7 +859,7 @@
bgtz J, .L10
move B, BO
.align 3
-
+
.L30:
andi J, N, 4
blez J, .L50
@@ -1540,7 +1540,7 @@
LD $f1, 1 * SIZE(CO1)
LD $f2, 2 * SIZE(CO1)
LD $f3, 3 * SIZE(CO1)
-
+
ADD c11, c11, c21
daddiu I, I, -1
ADD c12, c12, c22
diff --git a/kernel/mips64/zgemm_kernel.S b/kernel/mips64/zgemm_kernel.S
index c48519c..b9ac3b5 100644
--- a/kernel/mips64/zgemm_kernel.S
+++ b/kernel/mips64/zgemm_kernel.S
@@ -130,7 +130,7 @@
#endif
PROLOGUE
-
+
LDARG LDC, 0($sp)
daddiu $sp, $sp, -128
@@ -759,7 +759,7 @@
bgtz J, .L10
move B, BO
.align 3
-
+
.L20:
andi J, N, 2
MTC $0, c11
diff --git a/kernel/mips64/zgemm_kernel_loongson3a_2x2.S b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S
index a8faad2..ab67365 100644
--- a/kernel/mips64/zgemm_kernel_loongson3a_2x2.S
+++ b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S
@@ -143,7 +143,7 @@
#endif
PROLOGUE
-
+
LDARG LDC, 0($sp)
daddiu $sp, $sp, -STACKSIZE
@@ -188,7 +188,7 @@
move KK, OFFSET
#endif
- daddiu J, J, -1
+ daddiu J, J, -1
dsra I, M, 1 # I=M/2
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
@@ -223,7 +223,7 @@
MOV c21, c11
MOV c22, c11
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
-
+
MOV c23, c11
MOV c24, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
@@ -235,7 +235,7 @@
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11
MOV c34, c11
-
+
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11
MOV c42, c11
@@ -258,7 +258,7 @@
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
MTC $0, c11 # Clear results regs
@@ -272,7 +272,7 @@
MOV c21, c11
MOV c22, c11
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
-
+
MOV c23, c11
MOV c24, c11
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
@@ -284,7 +284,7 @@
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11
MOV c34, c11
-
+
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11
MOV c42, c11
@@ -300,13 +300,13 @@
.align 5
.L12:
- gsLQC1(R12, F9, F8, 2) # Unroll K=1
- gsLQC1(R13, F13, F12, 2)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ gsLQC1(R12, F9, F8, 2) # Unroll K=1
+ gsLQC1(R13, F13, F12, 2)
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
- gsLQC1(R12, F11, F10, 3)
- gsLQC1(R13, F16, F15, 3)
+ gsLQC1(R12, F11, F10, 3)
+ gsLQC1(R13, F16, F15, 3)
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
@@ -329,16 +329,16 @@
MADD2 c42, c42, a4, b3
MADD4 c44, c44, a4, b4
- gsLQC1(R12, F1, F0, 4) # unroll k=2
- gsLQC1(R13, F5, F4, 4)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ gsLQC1(R12, F1, F0, 4) # unroll k=2
+ gsLQC1(R13, F5, F4, 4)
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
MADD2 c12, c12, a6, b5 # bxc
MADD4 c14, c14, a6, b6 # bxd
- gsLQC1(R12, F3, F2, 5)
- gsLQC1(R13, F7, F6, 5)
+ gsLQC1(R12, F3, F2, 5)
+ gsLQC1(R13, F7, F6, 5)
MADD1 c21, c21, a7, b5 # A2xB1
MADD3 c23, c23, a7, b6
@@ -359,12 +359,12 @@
MADD4 c44, c44, a8, b8
gsLQC1(R12, F9, F8, 6) # Unroll K=3
- gsLQC1(R13, F13, F12, 6)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ gsLQC1(R13, F13, F12, 6)
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
- gsLQC1(R13, F16, F15, 7)
- gsLQC1(R12, F11, F10, 7)
+ gsLQC1(R13, F16, F15, 7)
+ gsLQC1(R12, F11, F10, 7)
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
@@ -395,7 +395,7 @@
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
@@ -442,17 +442,17 @@
.L16:
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
- daddiu PREA, PREA, 4 * SIZE
- daddiu PREB, PREB, 4 * SIZE
+ daddiu PREA, PREA, 4 * SIZE
+ daddiu PREB, PREB, 4 * SIZE
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2
-
+
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
@@ -593,9 +593,9 @@
#endif
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
- daddiu CO1,CO1, 4 * SIZE
+ daddiu CO1,CO1, 4 * SIZE
bgtz I, .L11
- daddiu CO2,CO2, 4 * SIZE
+ daddiu CO2,CO2, 4 * SIZE
.align 5
.L30:
@@ -620,7 +620,7 @@
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
MTC $0, c11 # Clear results regs
MOV c12, c11
-
+
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MOV c13, c11
MOV c14, c11
@@ -642,7 +642,7 @@
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # MR=1
-#else
+#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
@@ -652,13 +652,13 @@
#else
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MTC $0, c11 # Clear results regs
MOV c12, c11
-
+
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MOV c13, c11
MOV c14, c11
@@ -681,29 +681,29 @@
.L32:
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
- gsLQC1(R13, F13, F12, 2)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ gsLQC1(R13, F13, F12, 2)
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
- gsLQC1(R13, F16, F15, 3)
+ gsLQC1(R13, F16, F15, 3)
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
NOP
MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4
-
+
FETCH $0, 4 * SIZE(PREB)
MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4
NOP
- gsLQC1(R12, F9, F8, 2) # Unroll K=1
- gsLQC1(R13, F5, F4, 4)
- MADD1 c11, c11, a3, b5 # axc A1xB1
+ gsLQC1(R12, F9, F8, 2) # Unroll K=1
+ gsLQC1(R13, F5, F4, 4)
+ MADD1 c11, c11, a3, b5 # axc A1xB1
MADD3 c13, c13, a3, b6 # axd
- gsLQC1(R13, F7, F6, 5)
+ gsLQC1(R13, F7, F6, 5)
MADD2 c12, c12, a4, b5 # bxc
MADD4 c14, c14, a4, b6 # bxd
NOP
@@ -716,12 +716,12 @@
MADD4 c34, c34, a4, b8
daddiu L, L, -1
- gsLQC1(R12, F11, F10, 3)
- gsLQC1(R13, F13, F12, 6)
- MADD1 c11, c11, a5, b1 # axc A1xB1
+ gsLQC1(R12, F11, F10, 3)
+ gsLQC1(R13, F13, F12, 6)
+ MADD1 c11, c11, a5, b1 # axc A1xB1
MADD3 c13, c13, a5, b2 # axd
- gsLQC1(R13, F16, F15, 7)
+ gsLQC1(R13, F16, F15, 7)
MADD2 c12, c12, a6, b1 # bxc
MADD4 c14, c14, a6, b2 # bxd
daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx
@@ -736,7 +736,7 @@
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
- MADD1 c11, c11, a7, b5 # axc A1xB1
+ MADD1 c11, c11, a7, b5 # axc A1xB1
MADD3 c13, c13, a7, b6 # axd
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
@@ -767,7 +767,7 @@
.L36:
daddiu L, L, -1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
@@ -777,8 +777,8 @@
daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx
MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4
-
- daddiu PREB, PREB, 4 * SIZE
+
+ daddiu PREB, PREB, 4 * SIZE
MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
@@ -819,8 +819,8 @@
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
- daddiu CO1,CO1, 2 * SIZE
- daddiu CO2,CO2, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
#else
ADD c11, c14, c11
ADD c12, c13, c12
@@ -845,8 +845,8 @@
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
- daddiu CO1,CO1, 2 * SIZE
- daddiu CO2,CO2, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -879,7 +879,7 @@
move B, BO
.align 5
-
+
.L20:
andi J, N, 1
blez J, .L999
@@ -938,7 +938,7 @@
NOP
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
@@ -967,31 +967,31 @@
.align 5
.L22:
- gsLQC1(R12, F9, F8, 2) # Unroll K=1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ gsLQC1(R12, F9, F8, 2) # Unroll K=1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
- gsLQC1(R12, F11, F10, 3)
+ gsLQC1(R12, F11, F10, 3)
MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2
-
+
FETCH $0, 4 * SIZE(PREA)
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
- gsLQC1(R12, F1, F0, 4) # Unroll K=2
- MADD1 c11, c11, a5, b3 # axc A1xB1
+ gsLQC1(R12, F1, F0, 4) # Unroll K=2
+ MADD1 c11, c11, a5, b3 # axc A1xB1
MADD3 c13, c13, a5, b4 # axd
- gsLQC1(R13, F13, F12, 2)
+ gsLQC1(R13, F13, F12, 2)
MADD2 c12, c12, a6, b3 # bxc
MADD4 c14, c14, a6, b4 # bxd
- gsLQC1(R12, F3, F2, 5)
+ gsLQC1(R12, F3, F2, 5)
MADD1 c21, c21, a7, b3 # A2xB1
MADD3 c23, c23, a7, b4
@@ -1001,14 +1001,14 @@
daddiu L, L, -1
gsLQC1(R12, F9, F8, 6) # Unroll K=3
- MADD1 c11, c11, a1, b5 # axc A1xB1
+ MADD1 c11, c11, a1, b5 # axc A1xB1
MADD3 c13, c13, a1, b6 # axd
- gsLQC1(R13, F16, F15, 3)
+ gsLQC1(R13, F16, F15, 3)
MADD2 c12, c12, a2, b5 # bxc
MADD4 c14, c14, a2, b6 # bxd
- gsLQC1(R12, F11, F10, 7)
+ gsLQC1(R12, F11, F10, 7)
MADD1 c21, c21, a3, b5 # A2xB1
MADD3 c23, c23, a3, b6
daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx
@@ -1019,9 +1019,9 @@
daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
- MADD1 c11, c11, a5, b7 # axc A1xB1
+ MADD1 c11, c11, a5, b7 # axc A1xB1
MADD3 c13, c13, a5, b8 # axd
- daddiu PREA, PREA, 16 * SIZE
+ daddiu PREA, PREA, 16 * SIZE
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
MADD2 c12, c12, a6, b7 # bxc
@@ -1051,7 +1051,7 @@
.L26:
daddiu L, L, -1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx
@@ -1142,7 +1142,7 @@
daddiu KK, KK, 2
#endif
#endif
- daddiu CO1,CO1, 4 * SIZE
+ daddiu CO1,CO1, 4 * SIZE
bgtz I, .L21
NOP
@@ -1184,7 +1184,7 @@
NOP
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
@@ -1205,34 +1205,34 @@
.L42:
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
- gsLQC1(R12, F9, F8, 2) # Unroll K=1
- MADD1 c11, c11, a3, b3 # axc A1xB1
+ gsLQC1(R12, F9, F8, 2) # Unroll K=1
+ MADD1 c11, c11, a3, b3 # axc A1xB1
MADD3 c13, c13, a3, b4 # axd
- gsLQC1(R13, F13, F12, 2)
+ gsLQC1(R13, F13, F12, 2)
MADD2 c12, c12, a4, b3 # bxc
MADD4 c14, c14, a4, b4 # bxd
daddiu L, L, -1
- gsLQC1(R12, F11, F10, 3)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ gsLQC1(R12, F11, F10, 3)
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx
- gsLQC1(R13, F16, F15, 3)
+ gsLQC1(R13, F16, F15, 3)
MADD2 c12, c12, a6, b5 # bxc
MADD4 c14, c14, a6, b6 # bxd
daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
- MADD1 c11, c11, a7, b7 # axc A1xB1
+ MADD1 c11, c11, a7, b7 # axc A1xB1
MADD3 c13, c13, a7, b8 # axd
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
@@ -1259,7 +1259,7 @@
daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx
daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
@@ -1319,7 +1319,7 @@
daddiu KK, KK, 1
#endif
- daddiu CO1,CO1, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
#endif
diff --git a/kernel/mips64/zgemm_kernel_loongson3b_2x2.S b/kernel/mips64/zgemm_kernel_loongson3b_2x2.S
index 5ded7ae..675cad0 100644
--- a/kernel/mips64/zgemm_kernel_loongson3b_2x2.S
+++ b/kernel/mips64/zgemm_kernel_loongson3b_2x2.S
@@ -144,7 +144,7 @@
#endif
PROLOGUE
-
+
LDARG LDC, 0($sp)
daddiu $sp, $sp, -STACKSIZE
@@ -190,7 +190,7 @@
move KK, OFFSET
#endif
- daddiu J, J, -1
+ daddiu J, J, -1
dsra I, M, 1 # I=M/2
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
@@ -228,7 +228,7 @@
LD a3, 2 * SIZE(AO)
MOV c22, c11
LD a4, 3 * SIZE(AO)
-
+
MOV c23, c11
LD b3, 2 * SIZE(BO)
MOV c24, c11
@@ -241,7 +241,7 @@
FETCH $0, 0 * SIZE(CO1)
MOV c33, c11
MOV c34, c11
-
+
FETCH $0, 4 * SIZE(CO2)
MOV c41, c11
MOV c42, c11
@@ -264,7 +264,7 @@
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
MTC $0, c11 # Clear results regs
@@ -281,7 +281,7 @@
LD a3, 2 * SIZE(AO)
MOV c22, c11
LD a4, 3 * SIZE(AO)
-
+
MOV c23, c11
LD b3, 2 * SIZE(BO)
MOV c24, c11
@@ -294,7 +294,7 @@
MOV c33, c11
MOV c34, c11
FETCH $0, 0 * SIZE(CO1)
-
+
MOV c41, c11
MOV c42, c11
FETCH $0, 4 * SIZE(CO2)
@@ -313,7 +313,7 @@
.L12:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 4 * SIZE(BO)
@@ -346,7 +346,7 @@
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
LD b1, 8 * SIZE(BO)
@@ -355,7 +355,7 @@
MADD4 c14, c14, a6, b6 # bxd
LD a3, 10 * SIZE(AO)
- LD a4, 11 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
MADD1 c21, c21, a7, b5 # A2xB1
MADD3 c23, c23, a7, b6
@@ -379,7 +379,7 @@
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 12 * SIZE(BO)
@@ -418,7 +418,7 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
LD b1, 0 * SIZE(BO)
@@ -469,17 +469,17 @@
.L16:
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
- daddiu PREA, PREA, 4 * SIZE
- daddiu PREB, PREB, 4 * SIZE
+ daddiu PREA, PREA, 4 * SIZE
+ daddiu PREB, PREB, 4 * SIZE
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2
-
+
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
@@ -624,9 +624,9 @@
#endif
dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4
- daddiu CO1,CO1, 4 * SIZE
+ daddiu CO1,CO1, 4 * SIZE
bgtz I, .L11
- daddiu CO2,CO2, 4 * SIZE
+ daddiu CO2,CO2, 4 * SIZE
.align 5
.L30:
@@ -652,7 +652,7 @@
LD a2, 1 * SIZE(AO)
MTC $0, c11 # Clear results regs
MOV c12, c11
-
+
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MOV c13, c11
@@ -676,7 +676,7 @@
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # MR=1
-#else
+#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
@@ -687,14 +687,14 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MTC $0, c11 # Clear results regs
MOV c12, c11
-
+
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
MOV c13, c11
@@ -719,19 +719,19 @@
.L32:
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
-
+
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4
-
+
FETCH $0, 4 * SIZE(PREB)
MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4
@@ -739,14 +739,14 @@
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a3, b5 # axc A1xB1
+ MADD1 c11, c11, a3, b5 # axc A1xB1
MADD3 c13, c13, a3, b6 # axd
LD b1, 8 * SIZE(BO)
LD b2, 9 * SIZE(BO)
MADD2 c12, c12, a4, b5 # bxc
MADD4 c14, c14, a4, b6 # bxd
-
+
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
MADD1 c31, c31, a3, b7 # A1xB2
@@ -759,7 +759,7 @@
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
- MADD1 c11, c11, a5, b1 # axc A1xB1
+ MADD1 c11, c11, a5, b1 # axc A1xB1
MADD3 c13, c13, a5, b2 # axd
LD b5, 12 * SIZE(BO)
@@ -782,7 +782,7 @@
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a7, b5 # axc A1xB1
+ MADD1 c11, c11, a7, b5 # axc A1xB1
MADD3 c13, c13, a7, b6 # axd
LD b1, 0 * SIZE(BO)
@@ -818,7 +818,7 @@
.L36:
daddiu L, L, -1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx
@@ -828,8 +828,8 @@
daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx
MADD1 c31, c31, a1, b3 # A1xB2
MADD3 c33, c33, a1, b4
-
- daddiu PREB, PREB, 4 * SIZE
+
+ daddiu PREB, PREB, 4 * SIZE
MADD2 c32, c32, a2, b3
MADD4 c34, c34, a2, b4
@@ -873,8 +873,8 @@
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
- daddiu CO1,CO1, 2 * SIZE
- daddiu CO2,CO2, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
#else
ADD c11, c14, c11
@@ -901,8 +901,8 @@
ST a3, 0 * SIZE(CO2)
ST a4, 1 * SIZE(CO2)
- daddiu CO1,CO1, 2 * SIZE
- daddiu CO2,CO2, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
+ daddiu CO2,CO2, 2 * SIZE
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -935,7 +935,7 @@
move B, BO
.align 5
-
+
.L20:
andi J, N, 1
blez J, .L999
@@ -998,7 +998,7 @@
NOP
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
LD a1, 0 * SIZE(AO)
@@ -1032,7 +1032,7 @@
.L22:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
LD b3, 2 * SIZE(BO)
@@ -1044,14 +1044,14 @@
LD a8, 7 * SIZE(AO)
MADD1 c21, c21, a3, b1 # A2xB1
MADD3 c23, c23, a3, b2
-
+
FETCH $0, 4 * SIZE(PREA)
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
- MADD1 c11, c11, a5, b3 # axc A1xB1
+ MADD1 c11, c11, a5, b3 # axc A1xB1
MADD3 c13, c13, a5, b4 # axd
LD b5, 4 * SIZE(BO)
@@ -1071,7 +1071,7 @@
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
- MADD1 c11, c11, a1, b5 # axc A1xB1
+ MADD1 c11, c11, a1, b5 # axc A1xB1
MADD3 c13, c13, a1, b6 # axd
LD b7, 6 * SIZE(BO)
@@ -1090,11 +1090,11 @@
FETCH $0, 12 * SIZE(PREA)
MADD2 c22, c22, a4, b5
MADD4 c24, c24, a4, b6
- daddiu PREA, PREA, 16 * SIZE
+ daddiu PREA, PREA, 16 * SIZE
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a5, b7 # axc A1xB1
+ MADD1 c11, c11, a5, b7 # axc A1xB1
MADD3 c13, c13, a5, b8 # axd
LD b1, 0 * SIZE(BO)
@@ -1127,7 +1127,7 @@
.L26:
daddiu L, L, -1
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx
@@ -1224,7 +1224,7 @@
daddiu KK, KK, 2
#endif
#endif
- daddiu CO1,CO1, 4 * SIZE
+ daddiu CO1,CO1, 4 * SIZE
bgtz I, .L21
NOP
@@ -1270,7 +1270,7 @@
NOP
#else
- dsra L, K, 2 # Unroll K 4 times
+ dsra L, K, 2 # Unroll K 4 times
move BO, B
# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2
@@ -1297,7 +1297,7 @@
# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3
@@ -1306,27 +1306,27 @@
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
-# gsLQC1(R12, F9, F8, 2) # Unroll K=1
+# gsLQC1(R12, F9, F8, 2) # Unroll K=1
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
- MADD1 c11, c11, a3, b3 # axc A1xB1
+ MADD1 c11, c11, a3, b3 # axc A1xB1
MADD3 c13, c13, a3, b4 # axd
-# gsLQC1(R13, F13, F12, 2)
+# gsLQC1(R13, F13, F12, 2)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
MADD2 c12, c12, a4, b3 # bxc
MADD4 c14, c14, a4, b4 # bxd
-# gsLQC1(R12, F11, F10, 3)
+# gsLQC1(R12, F11, F10, 3)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
- MADD1 c11, c11, a5, b5 # axc A1xB1
+ MADD1 c11, c11, a5, b5 # axc A1xB1
MADD3 c13, c13, a5, b6 # axd
daddiu L, L, -1
-# gsLQC1(R13, F16, F15, 3)
+# gsLQC1(R13, F16, F15, 3)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD2 c12, c12, a6, b5 # bxc
@@ -1338,7 +1338,7 @@
# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
- MADD1 c11, c11, a7, b7 # axc A1xB1
+ MADD1 c11, c11, a7, b7 # axc A1xB1
MADD3 c13, c13, a7, b8 # axd
# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2
@@ -1369,7 +1369,7 @@
daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx
daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx
- MADD1 c11, c11, a1, b1 # axc A1xB1
+ MADD1 c11, c11, a1, b1 # axc A1xB1
MADD3 c13, c13, a1, b2 # axd
MADD2 c12, c12, a2, b1 # bxc
MADD4 c14, c14, a2, b2 # bxd
@@ -1432,7 +1432,7 @@
daddiu KK, KK, 1
#endif
- daddiu CO1,CO1, 2 * SIZE
+ daddiu CO1,CO1, 2 * SIZE
#endif
diff --git a/kernel/mips64/zgemv_n.S b/kernel/mips64/zgemv_n.S
index c6cc896..5709102 100644
--- a/kernel/mips64/zgemv_n.S
+++ b/kernel/mips64/zgemv_n.S
@@ -119,7 +119,7 @@
#endif
PROLOGUE
-
+
LDARG INCX, 0($sp)
LDARG Y, 8($sp)
LDARG INCY, 16($sp)
diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c
index 3b1b6f7..f66818c 100644
--- a/kernel/mips64/zgemv_n_loongson3a.c
+++ b/kernel/mips64/zgemv_n_loongson3a.c
@@ -1,4 +1,4 @@
-#include "common.h"
+#include "common.h"
//typedef int BLASLONG;
//typedef double FLOAT;
diff --git a/kernel/mips64/zgemv_t.S b/kernel/mips64/zgemv_t.S
index f7f7fdf..da702a5 100644
--- a/kernel/mips64/zgemv_t.S
+++ b/kernel/mips64/zgemv_t.S
@@ -114,7 +114,7 @@
#endif
PROLOGUE
-
+
LDARG INCX, 0($sp)
LDARG Y, 8($sp)
LDARG INCY, 16($sp)
@@ -143,7 +143,7 @@
dsll INCY, INCY, ZBASE_SHIFT
li XORIG, 2 * SIZE
-
+
beq INCX, XORIG, .L10
move XORIG, X
@@ -449,10 +449,10 @@
.L19:
LD a1, 0 * SIZE(Y)
LD a2, 1 * SIZE(Y)
- daddu Y, Y, INCY
+ daddu Y, Y, INCY
LD a3, 0 * SIZE(Y)
LD a4, 1 * SIZE(Y)
- daddu Y, Y, INCY
+ daddu Y, Y, INCY
MADD a1, a1, ALPHA_R, y1
MADD a2, a2, ALPHA_I, y1
@@ -468,12 +468,12 @@
ST a1, 0 * SIZE(YY)
ST a2, 1 * SIZE(YY)
- daddu YY, YY, INCY
+ daddu YY, YY, INCY
ST a3, 0 * SIZE(YY)
ST a4, 1 * SIZE(YY)
bgtz J, .L11
- daddu YY, YY, INCY
+ daddu YY, YY, INCY
.align 3
.L20:
diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c
index 3af44ca..2fa71cc 100644
--- a/kernel/mips64/zgemv_t_loongson3a.c
+++ b/kernel/mips64/zgemv_t_loongson3a.c
@@ -1,4 +1,4 @@
-#include "common.h"
+#include "common.h"
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
#define likely(x) __builtin_expect(!!(x), 1)
diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S
index 1f4a90e..1c247bc 100644
--- a/kernel/mips64/znrm2.S
+++ b/kernel/mips64/znrm2.S
@@ -43,7 +43,7 @@
#define X $5
#define INCX $6
#define XX $7
-
+
#define I $2
#define TEMP $3
@@ -71,7 +71,7 @@
PROLOGUE
-
+
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
diff --git a/kernel/mips64/zrot.S b/kernel/mips64/zrot.S
index 0a20569..35f9035 100644
--- a/kernel/mips64/zrot.S
+++ b/kernel/mips64/zrot.S
@@ -44,7 +44,7 @@
#define INCX $6
#define Y $7
#define INCY $8
-
+
#define XX $9
#define YY $10
@@ -70,7 +70,7 @@
#define t4 $f3
PROLOGUE
-
+
dsll INCX, INCX, ZBASE_SHIFT
li TEMP, 2 * SIZE
diff --git a/kernel/mips64/zscal.S b/kernel/mips64/zscal.S
index 3feaf5a..f11b1c8 100644
--- a/kernel/mips64/zscal.S
+++ b/kernel/mips64/zscal.S
@@ -67,7 +67,7 @@
#define t4 $f11
PROLOGUE
-
+
li TEMP, 2 * SIZE
MTC $0, a1
@@ -168,7 +168,7 @@
NOP
.align 3
-.L50:
+.L50:
bne INCX, TEMP, .L60
dsra I, N, 2
diff --git a/kernel/mips64/zswap.S b/kernel/mips64/zswap.S
index 663da23..84e1b97 100644
--- a/kernel/mips64/zswap.S
+++ b/kernel/mips64/zswap.S
@@ -70,7 +70,7 @@
#define b8 $f15
PROLOGUE
-
+
LDARG INCY, 0($sp)
li TEMP, 2 * SIZE
diff --git a/kernel/mips64/zsymv_L.S b/kernel/mips64/zsymv_L.S
index 65d5ce3..1c19bc7 100644
--- a/kernel/mips64/zsymv_L.S
+++ b/kernel/mips64/zsymv_L.S
@@ -103,7 +103,7 @@
#endif
PROLOGUE
-
+
LDARG INCY, 0($sp)
LDARG BUFFER, 8($sp)
#ifdef __64BIT__
diff --git a/kernel/mips64/zsymv_U.S b/kernel/mips64/zsymv_U.S
index 938d911..e972826 100644
--- a/kernel/mips64/zsymv_U.S
+++ b/kernel/mips64/zsymv_U.S
@@ -99,9 +99,9 @@
#define ADD1 MADD
#define ADD2 NMSUB
#endif
-
+
PROLOGUE
-
+
LDARG INCY, 0($sp)
LDARG BUFFER, 8($sp)
#ifdef __64BIT__
diff --git a/kernel/mips64/ztrsm_kernel_LT.S b/kernel/mips64/ztrsm_kernel_LT.S
index 0e70118..00a48a6 100644
--- a/kernel/mips64/ztrsm_kernel_LT.S
+++ b/kernel/mips64/ztrsm_kernel_LT.S
@@ -125,7 +125,7 @@
#endif
PROLOGUE
-
+
daddiu $sp, $sp, -128
SDARG $16, 0($sp)
@@ -988,7 +988,7 @@
bgtz J, .L10
NOP
.align 3
-
+
.L20:
andi J, N, 2
blez J, .L30
diff --git a/kernel/mips64/ztrsm_kernel_RT.S b/kernel/mips64/ztrsm_kernel_RT.S
index 1fc2684..89bc546 100644
--- a/kernel/mips64/ztrsm_kernel_RT.S
+++ b/kernel/mips64/ztrsm_kernel_RT.S
@@ -125,7 +125,7 @@
#endif
PROLOGUE
-
+
daddiu $sp, $sp, -128
SDARG $16, 0($sp)
diff --git a/kernel/power/KERNEL.CELL b/kernel/power/KERNEL.CELL
index 745e16e..b177464 100644
--- a/kernel/power/KERNEL.CELL
+++ b/kernel/power/KERNEL.CELL
@@ -23,8 +23,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_cell.S
diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5
index af0960d..fbef79e 100644
--- a/kernel/power/KERNEL.POWER5
+++ b/kernel/power/KERNEL.POWER5
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel.S
@@ -17,8 +17,8 @@ DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
diff --git a/kernel/power/KERNEL.POWER6 b/kernel/power/KERNEL.POWER6
index ef5f744..344b205 100644
--- a/kernel/power/KERNEL.POWER6
+++ b/kernel/power/KERNEL.POWER6
@@ -1,6 +1,6 @@
SGEMMKERNEL = gemm_kernel_power6.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
SGEMMINCOPYOBJ =
diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440
index 5e2a7f9..988a4b7 100644
--- a/kernel/power/KERNEL.PPC440
+++ b/kernel/power/KERNEL.PPC440
@@ -61,12 +61,12 @@ CSCALKERNEL = zscal_ppc440.S
ZSCALKERNEL = zscal_ppc440.S
SGEMMKERNEL = gemm_kernel_ppc440.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_ppc440.S
@@ -79,8 +79,8 @@ DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_ppc440.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970
index bfa43b7..7431a77 100644
--- a/kernel/power/KERNEL.PPC970
+++ b/kernel/power/KERNEL.PPC970
@@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel.S
diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4
index c41df97..f615754 100644
--- a/kernel/power/KERNEL.PPCG4
+++ b/kernel/power/KERNEL.PPCG4
@@ -65,8 +65,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_g4.S
diff --git a/kernel/power/amax.S b/kernel/power/amax.S
index 7fbe39e..caa789d 100644
--- a/kernel/power/amax.S
+++ b/kernel/power/amax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/amax_cell.S b/kernel/power/amax_cell.S
index 3f25e75..d2d9836 100644
--- a/kernel/power/amax_cell.S
+++ b/kernel/power/amax_cell.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
@@ -640,7 +640,7 @@ LL(28):
fsub f16, f0, f8
fsel f0, f16, f0, f8
.align 4
-
+
LL(999):
fsub f8, f0, f1
fsub f9, f2, f3
diff --git a/kernel/power/amax_hummer.S b/kernel/power/amax_hummer.S
index 0d8b97d..a3de92b 100644
--- a/kernel/power/amax_hummer.S
+++ b/kernel/power/amax_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -86,7 +86,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/amax_ppc440.S b/kernel/power/amax_ppc440.S
index 0184493..68de61c 100644
--- a/kernel/power/amax_ppc440.S
+++ b/kernel/power/amax_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREX r8
diff --git a/kernel/power/amin.S b/kernel/power/amin.S
index 01056c3..4aeb952 100644
--- a/kernel/power/amin.S
+++ b/kernel/power/amin.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/amin_cell.S b/kernel/power/amin_cell.S
index e4179f5..d95f503 100644
--- a/kernel/power/amin_cell.S
+++ b/kernel/power/amin_cell.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
@@ -640,7 +640,7 @@ LL(28):
fsub f16, f0, f8
fsel f0, f16, f8, f0
.align 4
-
+
LL(999):
fsub f8, f0, f1
fsub f9, f2, f3
diff --git a/kernel/power/amin_hummer.S b/kernel/power/amin_hummer.S
index f4bbf07..b16faae 100644
--- a/kernel/power/amin_hummer.S
+++ b/kernel/power/amin_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -86,7 +86,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/amin_ppc440.S b/kernel/power/amin_ppc440.S
index b47742b..6328629 100644
--- a/kernel/power/amin_ppc440.S
+++ b/kernel/power/amin_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INC1 r6
#define PREX r8
diff --git a/kernel/power/asum.S b/kernel/power/asum.S
index 1188aa5..e5dc9a6 100644
--- a/kernel/power/asum.S
+++ b/kernel/power/asum.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/asum_cell.S b/kernel/power/asum_cell.S
index 076651f..f409d0b 100644
--- a/kernel/power/asum_cell.S
+++ b/kernel/power/asum_cell.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/asum_hummer.S b/kernel/power/asum_hummer.S
index 9906a44..c3985fa 100644
--- a/kernel/power/asum_hummer.S
+++ b/kernel/power/asum_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -72,7 +72,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
li r10, 0
stwu r10, -4(SP)
stwu r10, -4(SP)
diff --git a/kernel/power/asum_ppc440.S b/kernel/power/asum_ppc440.S
index c6ad0f0..ec929f4 100644
--- a/kernel/power/asum_ppc440.S
+++ b/kernel/power/asum_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREX r6
#define ATTR r7
diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S
index 9f9605f..190f82d 100644
--- a/kernel/power/axpy.S
+++ b/kernel/power/axpy.S
@@ -38,12 +38,12 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifdef linux
#ifndef __64BIT__
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define Y r8
#define INCY r9
#define PREA r4
@@ -51,7 +51,7 @@
#else
#define N r3
#define X r7
-#define INCX r8
+#define INCX r8
#define Y r9
#define INCY r10
#define PREA r4
@@ -63,7 +63,7 @@
#if !defined(__64BIT__) && defined(DOUBLE)
#define N r3
#define X r8
-#define INCX r9
+#define INCX r9
#define Y r10
#define INCY r4
#define PREA r5
@@ -71,7 +71,7 @@
#else
#define N r3
#define X r7
-#define INCX r8
+#define INCX r8
#define Y r9
#define INCY r10
#define PREA r4
@@ -82,7 +82,7 @@
#define ALPHA f24
#ifndef NEEDPARAM
-
+
#define STACKSIZE 96
PROLOGUE
@@ -108,7 +108,7 @@
#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE)
lwz INCY, 56 + STACKSIZE(SP)
#endif
-
+
fmr ALPHA, f1
slwi INCX, INCX, BASE_SHIFT
slwi INCY, INCY, BASE_SHIFT
@@ -116,7 +116,7 @@
#ifdef L1_DUALFETCH
li PREA, (L1_PREFETCHSIZE) / 2
#else
- li PREA, (L1_PREFETCHSIZE)
+ li PREA, (L1_PREFETCHSIZE)
#endif
cmpwi cr0, N, 0
diff --git a/kernel/power/axpy_hummer.S b/kernel/power/axpy_hummer.S
index 372a846..f66b652 100644
--- a/kernel/power/axpy_hummer.S
+++ b/kernel/power/axpy_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define Y r8
#define INCY r9
@@ -634,7 +634,7 @@ LL(118):
LL(999):
li r10, 16
subi SP, SP, 16
-
+
lfpdux f25, SP, r10
lfpdux f24, SP, r10
lfpdux f23, SP, r10
diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S
index cc2605c..df3f25e 100644
--- a/kernel/power/axpy_ppc440.S
+++ b/kernel/power/axpy_ppc440.S
@@ -38,12 +38,12 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifdef linux
#ifndef __64BIT__
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define Y r8
#define INCY r9
#define YY r5
@@ -51,7 +51,7 @@
#else
#define N r3
#define X r7
-#define INCX r8
+#define INCX r8
#define Y r9
#define INCY r10
#define YY r5
@@ -63,7 +63,7 @@
#if !defined(__64BIT__) && defined(DOUBLE)
#define N r3
#define X r8
-#define INCX r9
+#define INCX r9
#define Y r10
#define INCY r4
#define YY r6
@@ -71,7 +71,7 @@
#else
#define N r3
#define X r7
-#define INCX r8
+#define INCX r8
#define Y r9
#define INCY r10
#define YY r5
@@ -106,7 +106,7 @@
#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE)
lwz INCY, 56 + STACKSIZE(SP)
#endif
-
+
fmr ALPHA, f1
slwi INCX, INCX, BASE_SHIFT
slwi INCY, INCY, BASE_SHIFT
diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S
index 930ea29..c115650 100644
--- a/kernel/power/cnrm2.S
+++ b/kernel/power/cnrm2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
#define INCXM1 r9
diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S
index e6b022f..46c29c6 100644
--- a/kernel/power/cnrm2_hummer.S
+++ b/kernel/power/cnrm2_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -336,7 +336,7 @@ LL(98):
lfpdux f14, SP, r10
addi SP, SP, 16
blr
-#endif
+#endif
.align 4
LL(99):
@@ -517,7 +517,7 @@ LL(118):
LL(198):
LFDX A1, X, INCX2
fmadd C4, A1, A1, C4
-
+
fpadd C1, C1, C5
lis r3, 0x3f00
fpadd C2, C2, C6
diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S
index 5ead681..c71c34b 100644
--- a/kernel/power/cnrm2_ppc440.S
+++ b/kernel/power/cnrm2_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PRE r8
#define INC1 r9
@@ -99,8 +99,8 @@
slwi INCX, INCX, ZBASE_SHIFT
li INC1, SIZE
- li PRE, 3 * 16 * SIZE
-
+ li PRE, 3 * 16 * SIZE
+
cmpwi cr0, N, 0
ble- LL(999)
cmpwi cr0, INCX, 0
diff --git a/kernel/power/copy.S b/kernel/power/copy.S
index 5a6c610..8f67733 100644
--- a/kernel/power/copy.S
+++ b/kernel/power/copy.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PREA r8
@@ -63,7 +63,7 @@
#ifdef L1_DUALFETCH
li PREA, (L1_PREFETCHSIZE) / 2
#else
- li PREA, (L1_PREFETCHSIZE)
+ li PREA, (L1_PREFETCHSIZE)
#endif
cmpwi cr0, N, 0
diff --git a/kernel/power/copy_hummer.S b/kernel/power/copy_hummer.S
index 1efa6fb..19646bf 100644
--- a/kernel/power/copy_hummer.S
+++ b/kernel/power/copy_hummer.S
@@ -38,12 +38,12 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
-#define INCY r7
+#define INCY r7
#define INCX2 r8
#define INCY2 r9
@@ -75,7 +75,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
slwi INCX, INCX, BASE_SHIFT
slwi INCY, INCY, BASE_SHIFT
add INCX2, INCX, INCX
@@ -287,7 +287,7 @@ LL(23):
LL(25):
andi. r0, N, 15
beq LL(999)
-
+
andi. r0, N, 8
beq LL(26)
@@ -430,7 +430,7 @@ LL(33):
LL(35):
andi. r0, N, 15
beq LL(999)
-
+
andi. r0, N, 8
beq LL(36)
@@ -711,7 +711,7 @@ LL(58):
STFDUX A1, Y, INCY2
b LL(999)
.align 4
-
+
# INCX == 1, INCY != 1
LL(60):
@@ -857,7 +857,7 @@ LL(68):
STFDUX A1, Y, INCY
b LL(999)
.align 4
-
+
LL(100):
sub X, X, INCX
sub Y, Y, INCY
@@ -951,7 +951,7 @@ LL(999):
lfpdux f15, SP, r10
lfpdux f14, SP, r10
-
+
addi SP, SP, 16
blr
diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S
index 4faa6c9..4931f5a 100644
--- a/kernel/power/dnrm2_hummer.S
+++ b/kernel/power/dnrm2_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -91,7 +91,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
@@ -330,7 +330,7 @@ LL(20):
fdiv ALPHA_R, ALPHA_R, ALPHA
lfpsx C1, SP, r10 # Zero clear
-
+
fpmr C2, C1
fpmr C3, C1
fpmr C4, C1
@@ -795,7 +795,7 @@ LL(120):
fdiv ALPHA_R, ALPHA_R, ALPHA
lfpsx C1, SP, r10 # Zero clear
-
+
fpmr C2, C1
fpmr C3, C1
fpmr C4, C1
diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S
index 6be9ead..849ca1f 100644
--- a/kernel/power/dnrm2_ppc440.S
+++ b/kernel/power/dnrm2_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define NN r6
#define XX r7
@@ -106,7 +106,7 @@
slwi INCX, INCX, BASE_SHIFT
sub X, X, INCX
- li PRE, 3 * 16 * SIZE
+ li PRE, 3 * 16 * SIZE
cmpwi cr0, N, 0
ble- LL(999)
diff --git a/kernel/power/dot.S b/kernel/power/dot.S
index 724b0c3..cf96c18 100644
--- a/kernel/power/dot.S
+++ b/kernel/power/dot.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
#define INCX r5
@@ -92,7 +92,7 @@
#ifdef L1_DUALFETCH
li PREA, (L1_PREFETCHSIZE) / 2
#else
- li PREA, (L1_PREFETCHSIZE)
+ li PREA, (L1_PREFETCHSIZE)
#endif
cmpwi cr0, N, 0
diff --git a/kernel/power/dot_cell.S b/kernel/power/dot_cell.S
index 617fb13..f7bd077 100644
--- a/kernel/power/dot_cell.S
+++ b/kernel/power/dot_cell.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
#define INCX r5
diff --git a/kernel/power/dot_hummer.S b/kernel/power/dot_hummer.S
index 14a3780..1004f76 100644
--- a/kernel/power/dot_hummer.S
+++ b/kernel/power/dot_hummer.S
@@ -38,12 +38,12 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
-#define INCY r7
+#define INCY r7
#define INCX2 r8
#define INCY2 r9
@@ -81,7 +81,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/dot_ppc440.S b/kernel/power/dot_ppc440.S
index b3f3efc..5317c57 100644
--- a/kernel/power/dot_ppc440.S
+++ b/kernel/power/dot_ppc440.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
#define INCX r5
diff --git a/kernel/power/exfunc.S b/kernel/power/exfunc.S
index 257736c..6e90945 100644
--- a/kernel/power/exfunc.S
+++ b/kernel/power/exfunc.S
@@ -41,7 +41,7 @@
.machine "any"
- .globl .rpcc
+ .globl .rpcc
.rpcc:
mftb r3
rlinm r3, r3, 3, 0, 31 # ldc(scaling)
diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S
index e531bde..969f54c 100644
--- a/kernel/power/gemm_beta.S
+++ b/kernel/power/gemm_beta.S
@@ -90,7 +90,7 @@
fmr ALPHA, f1
lfs f0, 24(SP)
-
+
cmpwi cr0, M, 0
ble- LL(999)
cmpwi cr0, N, 0
@@ -133,7 +133,7 @@ LL(12):
addi CO1, CO1, 16 * SIZE
bdnz LL(12)
.align 4
-
+
LL(15):
andi. r0, M, 15
mtspr CTR, r0
@@ -221,7 +221,7 @@ LL(22):
dcbtst PRE, CO1
bdnz LL(22)
.align 4
-
+
LL(25):
andi. r0, M, 15
mtspr CTR, r0
diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S
index 2b7d1d9..cae2fab 100644
--- a/kernel/power/gemm_kernel.S
+++ b/kernel/power/gemm_kernel.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -230,7 +230,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -322,7 +322,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
mr AO, A
add C, CO4, LDC
@@ -582,7 +582,7 @@ LL(12):
LFD f30, 22 * SIZE(BO)
LFD f31, 23 * SIZE(BO)
#endif
-
+
addi AO, AO, 16 * SIZE
addi BO, BO, 16 * SIZE
@@ -778,7 +778,7 @@ LL(18):
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -1364,7 +1364,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
add C, CO2, LDC
mr AO, A
@@ -2273,7 +2273,7 @@ LL(78):
fmr f1, f0
fmr f2, f0
fmr f3, f0
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S
index 6f5c362..8a525ef 100644
--- a/kernel/power/gemm_kernel_altivec.S
+++ b/kernel/power/gemm_kernel_altivec.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -707,7 +707,7 @@ LL(18):
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
-
+
addic. I, I, -1
bgt+ LL(11)
b LL(20)
@@ -809,7 +809,7 @@ LL(19):
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
-
+
addic. I, I, -1
bgt+ LL(11)
.align 4
@@ -1917,7 +1917,7 @@ LL(118):
FADD f0, f0, f2
FADD f1, f1, f3
-
+
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
@@ -2629,7 +2629,7 @@ LL(178):
STFD f0, 0 * SIZE(CO1)
.align 4
-
+
LL(999):
mr SP, STACK
diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S
index 010ed39..ac750c2 100644
--- a/kernel/power/gemm_kernel_altivec_cell.S
+++ b/kernel/power/gemm_kernel_altivec_cell.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -710,7 +710,7 @@ LL(18):
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
-
+
addic. I, I, -1
bgt+ LL(11)
b LL(20)
@@ -812,7 +812,7 @@ LL(19):
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
-
+
addic. I, I, -1
bgt+ LL(11)
.align 4
@@ -1920,7 +1920,7 @@ LL(118):
FADD f0, f0, f2
FADD f1, f1, f3
-
+
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
@@ -2632,7 +2632,7 @@ LL(178):
STFD f0, 0 * SIZE(CO1)
.align 4
-
+
LL(999):
mr SP, STACK
diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S
index 24d437d..26339af 100644
--- a/kernel/power/gemm_kernel_altivec_g4.S
+++ b/kernel/power/gemm_kernel_altivec_g4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -646,7 +646,7 @@ LL(18):
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
-
+
addic. I, I, -1
bgt+ LL(11)
b LL(20)
@@ -748,7 +748,7 @@ LL(19):
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
-
+
addic. I, I, -1
bgt+ LL(11)
.align 4
@@ -1856,7 +1856,7 @@ LL(118):
FADD f0, f0, f2
FADD f1, f1, f3
-
+
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
@@ -2568,7 +2568,7 @@ LL(178):
STFD f0, 0 * SIZE(CO1)
.align 4
-
+
LL(999):
mr SP, STACK
diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S
index 0b0d75f..1dbacc7 100644
--- a/kernel/power/gemm_kernel_cell.S
+++ b/kernel/power/gemm_kernel_cell.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -228,7 +228,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -294,7 +294,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
mr AO, A
add C, CO4, LDC
@@ -715,7 +715,7 @@ LL(18):
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -1301,7 +1301,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
add C, CO2, LDC
mr AO, A
@@ -2210,7 +2210,7 @@ LL(78):
fmr f1, f0
fmr f2, f0
fmr f3, f0
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S
index 1ee4b28..b6c8499 100644
--- a/kernel/power/gemm_kernel_g4.S
+++ b/kernel/power/gemm_kernel_g4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -263,7 +263,7 @@
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
mr AO, A
add C, CO4, LDC
@@ -475,7 +475,7 @@
FMADD f15, A4, B4, f15
addi AO, AO, 4 * SIZE
addi BO, BO, 4 * SIZE
-
+
.align 4
.L18:
@@ -582,7 +582,7 @@
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -1149,7 +1149,7 @@
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
add C, CO2, LDC
mr AO, A
@@ -2001,7 +2001,7 @@
fmr f1, f0
fmr f2, f0
fmr f3, f0
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S
index 6b4e6b9..3a8e1ed 100644
--- a/kernel/power/gemm_kernel_hummer.S
+++ b/kernel/power/gemm_kernel_hummer.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define ALPHA 0
#define FZERO 8
@@ -70,7 +70,7 @@
#define BO r25
#define AO2 r26
#define BO2 r27
-
+
#define CO1 r28
#define CO2 r29
#define CO3 r30
@@ -122,7 +122,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -197,7 +197,7 @@
#endif
addi AO, A, -4 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
@@ -431,7 +431,7 @@
fxcsmadd f7, B6, A4, f7
LFPDUX A9, AO, INC4
fxcpmadd f11, B4, A4, f11
- nop
+ nop
fxcsmadd f15, B4, A4, f15
bdnz+ .L12
.align 4
@@ -1626,7 +1626,7 @@
fsmfp A1, A2
fsmfp A3, A4
-
+
fxcpmadd f0, AP, f0, A1
fxcpmadd f1, AP, f1, A3
#else
@@ -1687,7 +1687,7 @@
#endif
addi AO, A, -2 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
@@ -2649,7 +2649,7 @@
mr CO1, C
addi AO, A, -2 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
@@ -3507,7 +3507,7 @@
#endif
addi AO, A, -4 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
@@ -3741,7 +3741,7 @@
fxcsmadd f7, B6, A4, f7
LFPDUX A9, AO, INC4
fxcpmadd f11, B4, A4, f11
- nop
+ nop
fxcsmadd f15, B4, A4, f15
bdnz+ .L1012
.align 4
@@ -5120,7 +5120,7 @@
#endif
addi AO, A, -2 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
@@ -6147,7 +6147,7 @@
mr CO1, C
addi AO, A, -2 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S
index 92e8e9f..6fe2def 100644
--- a/kernel/power/gemm_kernel_power3.S
+++ b/kernel/power/gemm_kernel_power3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -189,7 +189,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -246,7 +246,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
mr AO, A
add C, CO4, LDC
@@ -522,7 +522,7 @@ LL(18):
addi CO4, CO4, 4 * SIZE
fmr f14, f31
fmr f15, f31
-
+
addic. I, I, -1
bgt+ LL(11)
.align 4
@@ -880,7 +880,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
add C, CO2, LDC
mr AO, A
diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S
index b10a042..5f8fa76 100644
--- a/kernel/power/gemm_kernel_power6.S
+++ b/kernel/power/gemm_kernel_power6.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -746,7 +746,7 @@ LL(18):
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -1332,7 +1332,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
add C, CO2, LDC
mr AO, A
@@ -2238,7 +2238,7 @@ LL(78):
fmr f1, f0
fmr f2, f0
fmr f3, f0
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S
index 5d3b306..2e86d51 100644
--- a/kernel/power/gemm_kernel_ppc440.S
+++ b/kernel/power/gemm_kernel_ppc440.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -259,7 +259,7 @@
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
mr AO, A
add C, CO4, LDC
@@ -640,7 +640,7 @@
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -1207,7 +1207,7 @@
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
add C, CO2, LDC
mr AO, A
@@ -2059,7 +2059,7 @@
fmr f1, f0
fmr f2, f0
fmr f3, f0
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S
index 93c687b..a4dcc49 100644
--- a/kernel/power/gemm_ncopy_4.S
+++ b/kernel/power/gemm_ncopy_4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -204,7 +204,7 @@ LL(12):
dcbt PREA, AO3
dcbt PREA, AO4
#endif
-
+
dcbtst PREB1, B
addi AO1, AO1, 4 * SIZE
@@ -214,7 +214,7 @@ LL(12):
addi B, B, 16 * SIZE
bdnz LL(12)
.align 4
-
+
LL(15):
andi. r0, M, 3
mtspr CTR, r0
@@ -284,7 +284,7 @@ LL(22):
addi B, B, 8 * SIZE
bdnz LL(22)
.align 4
-
+
LL(25):
andi. r0, M, 3
mtspr CTR, r0
@@ -330,7 +330,7 @@ LL(32):
addi B, B, 4 * SIZE
bdnz LL(32)
.align 4
-
+
LL(35):
andi. r0, M, 3
mtspr CTR, r0
diff --git a/kernel/power/gemm_ncopy_hummer_4.S b/kernel/power/gemm_ncopy_hummer_4.S
index f05fdaa..7f5a55c 100644
--- a/kernel/power/gemm_ncopy_hummer_4.S
+++ b/kernel/power/gemm_ncopy_hummer_4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -91,7 +91,7 @@
stfpdux f17, SP, r0
stfpdux f18, SP, r0
stfpdux f19, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
@@ -201,7 +201,7 @@
STFXDUX c08, B, INC2
bdnz .L12
.align 4
-
+
.L15:
andi. r0, M, 7
ble .L19
@@ -323,7 +323,7 @@
STFXDUX c04, B, INC2
bdnz .L22
.align 4
-
+
.L25:
andi. r0, M, 7
ble .L30
@@ -395,7 +395,7 @@
STFPDUX c04, B, INC2
bdnz .L32
.align 4
-
+
.L35:
andi. r0, M, 7
ble .L99
@@ -529,7 +529,7 @@
STFPDUX c16, B, INC2
bdnz .L112
.align 4
-
+
.L115:
andi. r0, M, 7
ble .L119
@@ -656,7 +656,7 @@
STFPDUX c12, B, INC2
bdnz .L122
.align 4
-
+
.L125:
andi. r0, M, 7
ble .L130
@@ -738,7 +738,7 @@
STFPDUX c07, B, INC2
bdnz .L132
.align 4
-
+
.L135:
andi. r0, M, 7
ble .L999
diff --git a/kernel/power/gemm_ncopy_hummer_8.S b/kernel/power/gemm_ncopy_hummer_8.S
index fec7c13..1c8adc9 100644
--- a/kernel/power/gemm_ncopy_hummer_8.S
+++ b/kernel/power/gemm_ncopy_hummer_8.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -120,7 +120,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -237,7 +237,7 @@
STFXDUX c08, B, INC2
bdnz .L12
.align 4
-
+
.L15:
andi. r0, M, 3
ble .L19
@@ -373,7 +373,7 @@
STFXDUX c08, B, INC2
bdnz .L22
.align 4
-
+
.L25:
andi. r0, M, 7
ble .L30
@@ -441,12 +441,12 @@
fsmfp c01, c02
fsmfp c03, c04
-
+
STFPDUX c01, B, INC2
STFPDUX c03, B, INC2
.align 4
-
+
.L30:
andi. J, N, 2
ble .L40
@@ -490,7 +490,7 @@
STFXDUX c16, B, INC2
bdnz .L32
.align 4
-
+
.L35:
andi. r0, M, 7
ble .L40
@@ -562,7 +562,7 @@
STFPDUX c04, B, INC2
bdnz .L42
.align 4
-
+
.L45:
andi. r0, M, 7
ble .L999
@@ -734,7 +734,7 @@
STFPDUX c32, B, INC2
bdnz .L112
.align 4
-
+
.L115:
andi. r0, M, 7
ble .L119
@@ -936,7 +936,7 @@
STFPDUX c16, B, INC2
bdnz .L122
.align 4
-
+
.L125:
andi. r0, M, 7
ble .L130
@@ -1013,7 +1013,7 @@
STFPDUX c05, B, INC2
.align 4
-
+
.L130:
andi. J, N, 2
ble .L140
@@ -1059,7 +1059,7 @@
STFPDUX c12, B, INC2
bdnz .L132
.align 4
-
+
.L135:
andi. r0, M, 7
ble .L140
@@ -1141,7 +1141,7 @@
STFPDUX c07, B, INC2
bdnz .L142
.align 4
-
+
.L145:
andi. r0, M, 7
ble .L999
diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S
index 712420f..1b6af48 100644
--- a/kernel/power/gemm_tcopy_4.S
+++ b/kernel/power/gemm_tcopy_4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -58,7 +58,7 @@
#define B2 r17
#define B3 r18
#define M4 r19
-
+
#define c01 f0
#define c02 f1
#define c03 f2
@@ -235,7 +235,7 @@ LL(12):
dcbt PREA, AO3
dcbt PREA, AO4
#endif
-
+
dcbtst PREB1, B
addi AO1, AO1, 4 * SIZE
@@ -245,7 +245,7 @@ LL(12):
add B1, B1, M4
bdnz LL(12)
.align 4
-
+
LL(13):
andi. r0, N, 2
ble LL(14)
@@ -341,7 +341,7 @@ LL(22):
add B1, B1, M4
bdnz LL(22)
.align 4
-
+
LL(23):
andi. r0, N, 2
ble LL(24)
@@ -402,7 +402,7 @@ LL(32):
add B1, B1, M4
bdnz LL(32)
.align 4
-
+
LL(33):
andi. r0, N, 2
ble LL(34)
diff --git a/kernel/power/gemm_tcopy_hummer_4.S b/kernel/power/gemm_tcopy_hummer_4.S
index dc94b04..8352a83 100644
--- a/kernel/power/gemm_tcopy_hummer_4.S
+++ b/kernel/power/gemm_tcopy_hummer_4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -57,7 +57,7 @@
#define M4 r29
#define INC r30
#define INC2 r31
-
+
#define c01 f0
#define c02 f1
#define c03 f2
@@ -113,7 +113,7 @@
bne .L100
andi. r0, LDA, 2 * SIZE - 1
bne .L100
-
+
subi A, A, 2 * SIZE
srawi. J, M, 2
ble .L20
@@ -154,7 +154,7 @@
STFPDUX c08, B1, INC2
bdnz .L12
.align 4
-
+
.L15:
andi. r0, N, 3
ble .L19
@@ -224,7 +224,7 @@
STFPDUX c04, B1, INC2
bdnz .L22
.align 4
-
+
.L23:
andi. r0, N, 2
ble .L24
@@ -268,7 +268,7 @@
STFPDUX c02, B1, INC2
bdnz .L32
.align 4
-
+
.L33:
andi. r0, N, 2
ble .L34
@@ -353,7 +353,7 @@
STFPDUX c08, B1, INC2
bdnz .L112
.align 4
-
+
.L115:
andi. r0, N, 3
ble .L119
@@ -433,7 +433,7 @@
STFPDUX c04, B1, INC2
bdnz .L122
.align 4
-
+
.L123:
andi. r0, N, 2
ble .L124
@@ -484,7 +484,7 @@
STFPDUX c03, B1, INC2
bdnz .L132
.align 4
-
+
.L133:
andi. r0, N, 2
ble .L134
diff --git a/kernel/power/gemm_tcopy_hummer_8.S b/kernel/power/gemm_tcopy_hummer_8.S
index 5062f65..e1770b3 100644
--- a/kernel/power/gemm_tcopy_hummer_8.S
+++ b/kernel/power/gemm_tcopy_hummer_8.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -64,7 +64,7 @@
#define AO8 r29
#define INC r30
#define INC2 r31
-
+
#define c01 f0
#define c02 f1
#define c03 f2
@@ -124,7 +124,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -283,7 +283,7 @@
STFPDUX c32, B1, INC2
bdnz .L12
.align 4
-
+
.L15:
andi. r0, N, 7
ble .L19
@@ -437,7 +437,7 @@
STFPDUX c16, B1, INC2
bdnz .L22
.align 4
-
+
.L25:
andi. r0, N, 7
ble .L30
@@ -534,7 +534,7 @@
STFPDUX c08, B1, INC2
bdnz .L32
.align 4
-
+
.L35:
andi. r0, N, 7
ble .L40
@@ -601,7 +601,7 @@
STFPDUX c04, B1, INC2
bdnz .L42
.align 4
-
+
.L45:
andi. r0, N, 7
ble .L999
@@ -778,7 +778,7 @@
STFPDUX c32, B1, INC2
bdnz .L112
.align 4
-
+
.L115:
andi. r0, N, 7
ble .L119
@@ -982,7 +982,7 @@
STFPDUX c16, B1, INC2
bdnz .L122
.align 4
-
+
.L125:
andi. r0, N, 7
ble .L130
@@ -1111,7 +1111,7 @@
STFPDUX c15, B1, INC2
bdnz .L132
.align 4
-
+
.L135:
andi. r0, N, 7
ble .L140
@@ -1202,7 +1202,7 @@
STFPDUX c07, B1, INC2
bdnz .L142
.align 4
-
+
.L145:
andi. r0, N, 7
ble .L999
diff --git a/kernel/power/gemv_hummer_n.S b/kernel/power/gemv_hummer_n.S
index a9340be..7f1b35e 100644
--- a/kernel/power/gemv_hummer_n.S
+++ b/kernel/power/gemv_hummer_n.S
@@ -375,7 +375,7 @@
cmpi cr0, 0, J, 0
bgt .L11
.align 4
-
+
.L20:
andi. J, N, 2
ble .L30
@@ -870,7 +870,7 @@
cmpi cr0, 0, J, 0
bgt .L41
.align 4
-
+
.L50:
andi. J, N, 2
ble .L60
@@ -1419,7 +1419,7 @@
cmpi cr0, 0, J, 0
bgt .L71
.align 4
-
+
.L80:
andi. J, N, 2
ble .L90
diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S
index b66caa7..2b19f0a 100644
--- a/kernel/power/gemv_n.S
+++ b/kernel/power/gemv_n.S
@@ -1559,7 +1559,7 @@ LL(19):
cmpi cr0, 0, J, 0
bgt LL(11)
.align 4
-
+
LL(20):
andi. J, N, 4
mr AO1, A
diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S
index a70e8b8..005e5d5 100644
--- a/kernel/power/gemv_t.S
+++ b/kernel/power/gemv_t.S
@@ -797,7 +797,7 @@ LL(12):
addi BO, BO, 16 * SIZE
bdnz LL(12)
- .align 4
+ .align 4
LL(13):
FMADD y01, a1, b1, y01
@@ -1551,7 +1551,7 @@ LL(19):
cmpi cr0, 0, J, 0
bgt LL(11)
.align 4
-
+
LL(20):
andi. J, N, 7
ble LL(99)
@@ -1778,7 +1778,7 @@ LL(22):
addi BO, BO, 16 * SIZE
bdnz LL(22)
- .align 4
+ .align 4
LL(23):
FMADD y01, a1, b1, y01
@@ -2332,7 +2332,7 @@ LL(32):
addi BO, BO, 16 * SIZE
bdnz LL(32)
- .align 4
+ .align 4
LL(33):
FMADD y01, a1, b1, y01
@@ -2594,7 +2594,7 @@ LL(40):
mr AO1, A
add A, A, LDA
mr BO, XP
-
+
lfd y01, FZERO
fmr y02, y01
fmr y03, y01
@@ -2715,7 +2715,7 @@ LL(42):
addi BO, BO, 16 * SIZE
DCBT(AO1, PREA)
bdnz LL(42)
- .align 4
+ .align 4
LL(43):
FMADD y01, a1, b1, y01
diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S
index 1aa59b2..62433af 100644
--- a/kernel/power/gemv_t_ppc440.S
+++ b/kernel/power/gemv_t_ppc440.S
@@ -452,7 +452,7 @@ LL(22):
#endif
bdnz LL(22)
- .align 4
+ .align 4
LL(23):
FMADD y01, a1, b1, y01
@@ -756,7 +756,7 @@ LL(32):
LFDU b4, 1 * SIZE(X1)
bdnz LL(32)
- .align 4
+ .align 4
LL(33):
FMADD y01, a1, b1, y01
@@ -960,7 +960,7 @@ LL(42):
LFDU b4, 1 * SIZE(X1)
bdnz LL(42)
- .align 4
+ .align 4
LL(43):
FMADD y01, a1, b1, y01
diff --git a/kernel/power/ger.S b/kernel/power/ger.S
index 0068569..bc10bf4 100644
--- a/kernel/power/ger.S
+++ b/kernel/power/ger.S
@@ -326,7 +326,7 @@ LL(06):
addi X1, X1, SIZE
bdnz+ LL(06)
.align 4
-
+
LL(10):
srawi. J, N, 1
ble LL(20)
@@ -834,7 +834,7 @@ LL(19):
cmpi cr0, 0, J, 0
bgt LL(11)
.align 4
-
+
LL(20):
andi. J, N, 1
ble LL(999)
diff --git a/kernel/power/iamax.S b/kernel/power/iamax.S
index cdc57fa..45461ae 100644
--- a/kernel/power/iamax.S
+++ b/kernel/power/iamax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
@@ -634,7 +634,7 @@ LL(1060):
fabs f8, f8
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1060)
b LL(9999)
.align 4
@@ -768,7 +768,7 @@ LL(1160):
fabs f8, f8
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1160)
.align 4
diff --git a/kernel/power/iamax_hummer.S b/kernel/power/iamax_hummer.S
index 9b23709..9b3b225 100644
--- a/kernel/power/iamax_hummer.S
+++ b/kernel/power/iamax_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -90,7 +90,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/iamax_ppc440.S b/kernel/power/iamax_ppc440.S
index 11ea4cb..a43cc77 100644
--- a/kernel/power/iamax_ppc440.S
+++ b/kernel/power/iamax_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
@@ -95,7 +95,7 @@
slwi INCX, INCX, BASE_SHIFT
sub X, X, INCX
li PRE, 3 * 16 * SIZE
-
+
mr NN, N
mr XX, X
@@ -448,7 +448,7 @@ LL(1160):
fabs f8, f8
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1160)
.align 4
diff --git a/kernel/power/iamin.S b/kernel/power/iamin.S
index c3dbb84..477fd75 100644
--- a/kernel/power/iamin.S
+++ b/kernel/power/iamin.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
@@ -635,7 +635,7 @@ LL(1060):
fabs f8, f8
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1060)
b LL(9999)
.align 4
@@ -769,7 +769,7 @@ LL(1160):
fabs f8, f8
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1160)
.align 4
diff --git a/kernel/power/iamin_hummer.S b/kernel/power/iamin_hummer.S
index 6dad3be..7a2c29c 100644
--- a/kernel/power/iamin_hummer.S
+++ b/kernel/power/iamin_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -90,7 +90,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/iamin_ppc440.S b/kernel/power/iamin_ppc440.S
index 888e74a..bbcc301 100644
--- a/kernel/power/iamin_ppc440.S
+++ b/kernel/power/iamin_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
@@ -95,7 +95,7 @@
slwi INCX, INCX, BASE_SHIFT
sub X, X, INCX
li PRE, 3 * 16 * SIZE
-
+
mr NN, N
mr XX, X
@@ -448,7 +448,7 @@ LL(1160):
fabs f8, f8
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1160)
.align 4
diff --git a/kernel/power/imax.S b/kernel/power/imax.S
index 6b6cd45..33762b9 100644
--- a/kernel/power/imax.S
+++ b/kernel/power/imax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
diff --git a/kernel/power/imax_hummer.S b/kernel/power/imax_hummer.S
index 110dc18..6ea6f5c 100644
--- a/kernel/power/imax_hummer.S
+++ b/kernel/power/imax_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -81,7 +81,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/imax_ppc440.S b/kernel/power/imax_ppc440.S
index b4a6449..b6cea76 100644
--- a/kernel/power/imax_ppc440.S
+++ b/kernel/power/imax_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
diff --git a/kernel/power/imin.S b/kernel/power/imin.S
index 2dd774d..dc76672 100644
--- a/kernel/power/imin.S
+++ b/kernel/power/imin.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
diff --git a/kernel/power/imin_hummer.S b/kernel/power/imin_hummer.S
index d333329..f86e79c 100644
--- a/kernel/power/imin_hummer.S
+++ b/kernel/power/imin_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -81,7 +81,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/imin_ppc440.S b/kernel/power/imin_ppc440.S
index 4e1185d..c84bdfa 100644
--- a/kernel/power/imin_ppc440.S
+++ b/kernel/power/imin_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
@@ -380,7 +380,7 @@ LL(1160):
LFDUX f8, XX, INCX
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1160)
.align 4
diff --git a/kernel/power/izamax.S b/kernel/power/izamax.S
index 4851047..8e1e1ef 100644
--- a/kernel/power/izamax.S
+++ b/kernel/power/izamax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
@@ -685,7 +685,7 @@ LL(1060):
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1060)
b LL(9999)
.align 4
@@ -885,7 +885,7 @@ LL(1160):
addi RET, RET, 1
fcmpu cr0, f1, f8
- beq cr0, LL(9999)
+ beq cr0, LL(9999)
bdnz LL(1160)
.align 4
diff --git a/kernel/power/izamax_hummer.S b/kernel/power/izamax_hummer.S
index 8dffa0c..1f1e48c 100644
--- a/kernel/power/izamax_hummer.S
+++ b/kernel/power/izamax_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -91,7 +91,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/izamax_ppc440.S b/kernel/power/izamax_ppc440.S
index f80c9ad..76f18be 100644
--- a/kernel/power/izamax_ppc440.S
+++ b/kernel/power/izamax_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
@@ -96,7 +96,7 @@
slwi INCX, INCX, ZBASE_SHIFT
sub X, X, INCX
- li INC1, SIZE
+ li INC1, SIZE
li PRE, 3 * 16 * SIZE
mr NN, N
diff --git a/kernel/power/izamin.S b/kernel/power/izamin.S
index 17275fc..ea1cdfa 100644
--- a/kernel/power/izamin.S
+++ b/kernel/power/izamin.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
diff --git a/kernel/power/izamin_hummer.S b/kernel/power/izamin_hummer.S
index 75145ab..f13cf12 100644
--- a/kernel/power/izamin_hummer.S
+++ b/kernel/power/izamin_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -91,7 +91,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/izamin_ppc440.S b/kernel/power/izamin_ppc440.S
index 2cdb8bf..eb90276 100644
--- a/kernel/power/izamin_ppc440.S
+++ b/kernel/power/izamin_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define RET r3
#define X r4
-#define INCX r5
+#define INCX r5
#define N r6
#define NN r7
@@ -96,7 +96,7 @@
slwi INCX, INCX, ZBASE_SHIFT
sub X, X, INCX
- li INC1, SIZE
+ li INC1, SIZE
li PRE, 3 * 16 * SIZE
mr NN, N
diff --git a/kernel/power/max.S b/kernel/power/max.S
index 5862bc9..71f055d 100644
--- a/kernel/power/max.S
+++ b/kernel/power/max.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/max_hummer.S b/kernel/power/max_hummer.S
index 01ff907..7e226ed 100644
--- a/kernel/power/max_hummer.S
+++ b/kernel/power/max_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -86,7 +86,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/max_ppc440.S b/kernel/power/max_ppc440.S
index 7afdf56..9a12470 100644
--- a/kernel/power/max_ppc440.S
+++ b/kernel/power/max_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/min.S b/kernel/power/min.S
index 727a6a7..8a80548 100644
--- a/kernel/power/min.S
+++ b/kernel/power/min.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/min_hummer.S b/kernel/power/min_hummer.S
index bd82687..88a13a9 100644
--- a/kernel/power/min_hummer.S
+++ b/kernel/power/min_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -86,7 +86,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/min_ppc440.S b/kernel/power/min_ppc440.S
index ab67bbc..5ffdfd0 100644
--- a/kernel/power/min_ppc440.S
+++ b/kernel/power/min_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S
index e2b635e..bf84330 100644
--- a/kernel/power/nrm2.S
+++ b/kernel/power/nrm2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define NN r6
#define XX r7
diff --git a/kernel/power/rot.S b/kernel/power/rot.S
index b9e9338..3e6b8f7 100644
--- a/kernel/power/rot.S
+++ b/kernel/power/rot.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PREA r8
diff --git a/kernel/power/rot_ppc440.S b/kernel/power/rot_ppc440.S
index bb19583..7a115de 100644
--- a/kernel/power/rot_ppc440.S
+++ b/kernel/power/rot_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PRE r8
diff --git a/kernel/power/scal.S b/kernel/power/scal.S
index f242f08..7c65d12 100644
--- a/kernel/power/scal.S
+++ b/kernel/power/scal.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define XX r4
#define PREA r5
@@ -65,7 +65,7 @@
#define FZERO f0
#define ALPHA f1
-
+
PROLOGUE
PROFCODE
diff --git a/kernel/power/scal_hummer.S b/kernel/power/scal_hummer.S
index 0b58486..fd7c669 100644
--- a/kernel/power/scal_hummer.S
+++ b/kernel/power/scal_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define INCX2 r4
#define X2 r5
@@ -74,7 +74,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
stfpdux f16, SP, r10
-
+
li r10, 0
stwu r10, -4(SP)
stwu r10, -4(SP)
diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S
index 8b9e271..ed14883 100644
--- a/kernel/power/scal_ppc440.S
+++ b/kernel/power/scal_ppc440.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define XX r4
#define PRE r5
@@ -65,7 +65,7 @@
#define FZERO f0
#define ALPHA f1
-
+
PROLOGUE
PROFCODE
diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S
index f235c67..be974cc 100644
--- a/kernel/power/snrm2.S
+++ b/kernel/power/snrm2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S
index a002492..a0ff3d1 100644
--- a/kernel/power/snrm2_hummer.S
+++ b/kernel/power/snrm2_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S
index ffda99e..0a80d12 100644
--- a/kernel/power/snrm2_ppc440.S
+++ b/kernel/power/snrm2_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PRE r8
@@ -98,7 +98,7 @@
#endif
slwi INCX, INCX, BASE_SHIFT
- li PRE, 3 * 16 * SIZE
+ li PRE, 3 * 16 * SIZE
sub X, X, INCX
diff --git a/kernel/power/swap.S b/kernel/power/swap.S
index a0d150f..f8b56d4 100644
--- a/kernel/power/swap.S
+++ b/kernel/power/swap.S
@@ -38,12 +38,12 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifdef linux
#ifndef __64BIT__
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define Y r8
#define INCY r9
#define PREA r4
@@ -52,7 +52,7 @@
#else
#define N r3
#define X r7
-#define INCX r8
+#define INCX r8
#define Y r9
#define INCY r10
#define PREA r4
@@ -65,7 +65,7 @@
#if !defined(__64BIT__) && defined(DOUBLE)
#define N r3
#define X r8
-#define INCX r9
+#define INCX r9
#define Y r10
#define INCY r4
#define PREA r5
@@ -74,7 +74,7 @@
#else
#define N r3
#define X r7
-#define INCX r8
+#define INCX r8
#define Y r9
#define INCY r10
#define PREA r4
@@ -124,7 +124,7 @@
#ifdef L1_DUALFETCH
li PREA, (L1_PREFETCHSIZE) / 2
#else
- li PREA, (L1_PREFETCHSIZE)
+ li PREA, (L1_PREFETCHSIZE)
#endif
cmpwi cr0, N, 0
diff --git a/kernel/power/swap_hummer.S b/kernel/power/swap_hummer.S
index 293a28b..fa65acd 100644
--- a/kernel/power/swap_hummer.S
+++ b/kernel/power/swap_hummer.S
@@ -41,9 +41,9 @@
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define Y r8
-#define INCY r9
+#define INCY r9
#define INCX2 r4
#define INCY2 r5
@@ -78,7 +78,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
stfpdux f16, SP, r10
-
+
slwi INCX, INCX, BASE_SHIFT
slwi INCY, INCY, BASE_SHIFT
add INCX2, INCX, INCX
@@ -293,7 +293,7 @@ LL(23):
LL(25):
andi. r0, N, 7
beq LL(29)
-
+
andi. r0, N, 4
beq LL(27)
@@ -447,7 +447,7 @@ LL(33):
LL(35):
andi. r0, N, 7
beq LL(39)
-
+
andi. r0, N, 4
beq LL(37)
@@ -696,7 +696,7 @@ LL(999):
lfpdux f16, SP, r10
lfpdux f15, SP, r10
lfpdux f14, SP, r10
-
+
addi SP, SP, 16
blr
diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S
index 91bfb5e..fbf735a 100644
--- a/kernel/power/symv_L.S
+++ b/kernel/power/symv_L.S
@@ -409,7 +409,7 @@ LL(11):
LFD a16, 3 * SIZE(AO4)
- LFD a5, ALPHA
+ LFD a5, ALPHA
FMUL xsum1, atemp1, a1
FMUL xsum2, atemp1, a2
@@ -522,7 +522,7 @@ LL(12):
FMADD y04, atemp2, a8, y04
# DCBT(X, PREX)
NOP2
-
+
FMADD xsum1, xtemp3, a3, xsum1
LFD a3, 6 * SIZE(AO1)
FMADD y01, atemp3, a9, y01
@@ -1211,7 +1211,7 @@ LL(18):
LFD y03, 2 * SIZE(YY)
LFD y04, 3 * SIZE(YY)
- LFD xtemp1, ALPHA
+ LFD xtemp1, ALPHA
FMUL xsum1, xtemp1, xsum1
FMUL xsum2, xtemp1, xsum2
@@ -1254,7 +1254,7 @@ LL(20):
LFD a2, 1 * SIZE(AO1)
LFD a6, 1 * SIZE(AO2)
- LFD a5, ALPHA
+ LFD a5, ALPHA
FMUL xsum1, atemp1, a1
FMUL xsum2, atemp1, a2
@@ -1288,7 +1288,7 @@ LL(28):
LFD y01, 0 * SIZE(YY)
LFD y02, 1 * SIZE(YY)
- LFD xtemp1, ALPHA
+ LFD xtemp1, ALPHA
FMUL xsum1, xtemp1, xsum1
FMUL xsum2, xtemp1, xsum2
@@ -1314,7 +1314,7 @@ LL(30):
LFD atemp1, 0 * SIZE(XX)
LFD a1, 0 * SIZE(AO1)
- LFD xtemp1, ALPHA
+ LFD xtemp1, ALPHA
LFD y01, 0 * SIZE(YY)
FMUL xsum1, atemp1, a1
diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S
index 76cbd64..ec1aeea 100644
--- a/kernel/power/symv_U.S
+++ b/kernel/power/symv_U.S
@@ -280,7 +280,7 @@
li PREA, PREFETCHSIZE_A * SIZE
sub IS, M, IS
-
+
cmpwi cr0, M, 0
ble- LL(999)
@@ -390,7 +390,7 @@ LL(11):
slwi TEMP, IS, BASE_SHIFT
add TEMP, X, TEMP
- LFD a16, ALPHA
+ LFD a16, ALPHA
lfd xsum1, FZERO
LFD atemp1, 0 * SIZE(TEMP)
@@ -484,7 +484,7 @@ LL(12):
FMADD y04, atemp2, a8, y04
# DCBT(X, PREX)
NOP2
-
+
FMADD xsum1, xtemp3, a3, xsum1
LFD a3, 6 * SIZE(AO1)
FMADD y01, atemp3, a9, y01
@@ -1106,7 +1106,7 @@ LL(15):
.align 4
LL(18):
- LFD xtemp1, ALPHA
+ LFD xtemp1, ALPHA
FMUL xsum1, xtemp1, xsum1
FMUL xsum2, xtemp1, xsum2
@@ -1163,7 +1163,7 @@ LL(20):
LFD atemp1, 0 * SIZE(TEMP)
LFD atemp2, 1 * SIZE(TEMP)
- LFD a1, ALPHA
+ LFD a1, ALPHA
FMUL atemp1, a1, atemp1
FMUL atemp2, a1, atemp2
@@ -1228,7 +1228,7 @@ LL(22):
.align 4
LL(28):
- LFD xtemp1, ALPHA
+ LFD xtemp1, ALPHA
FMUL xsum1, xtemp1, xsum1
FMUL xsum2, xtemp1, xsum2
@@ -1246,7 +1246,7 @@ LL(28):
addi IS, IS, 2
.align 4
-
+
LL(30):
andi. TEMP, M, 1
ble LL(990)
@@ -1258,7 +1258,7 @@ LL(30):
LFD atemp1, 0 * SIZE(TEMP)
- LFD a1, ALPHA
+ LFD a1, ALPHA
FMUL atemp1, a1, atemp1
@@ -1299,7 +1299,7 @@ LL(32):
.align 4
LL(38):
- LFD xtemp1, ALPHA
+ LFD xtemp1, ALPHA
FMUL xsum1, xtemp1, xsum1
diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S
index 6be8e28..0c13a25 100644
--- a/kernel/power/trsm_kernel_LN.S
+++ b/kernel/power/trsm_kernel_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -238,7 +238,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -863,7 +863,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1370,7 +1370,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1410,7 +1410,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1802,7 +1802,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -1881,7 +1881,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
#if defined(LN) || defined(RT)
mr AORIG, A
#else
@@ -2608,7 +2608,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3448,7 +3448,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S
index 0d28744..06481e5 100644
--- a/kernel/power/trsm_kernel_LT.S
+++ b/kernel/power/trsm_kernel_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -259,7 +259,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -367,7 +367,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
@@ -639,7 +639,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -679,7 +679,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1071,7 +1071,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -1292,7 +1292,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1904,7 +1904,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
mr AORIG, A
@@ -2117,7 +2117,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3061,7 +3061,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S
index 533f299..1777ba8 100644
--- a/kernel/power/trsm_kernel_RT.S
+++ b/kernel/power/trsm_kernel_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -256,7 +256,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -511,7 +511,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
@@ -1100,7 +1100,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
mr AORIG, A
@@ -1313,7 +1313,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -2106,7 +2106,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
@@ -2378,7 +2378,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -2418,7 +2418,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -2810,7 +2810,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -3031,7 +3031,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
diff --git a/kernel/power/trsm_kernel_cell_LN.S b/kernel/power/trsm_kernel_cell_LN.S
index 179db31..b5ed925 100644
--- a/kernel/power/trsm_kernel_cell_LN.S
+++ b/kernel/power/trsm_kernel_cell_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -233,7 +233,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -862,7 +862,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1166,7 +1166,7 @@ LL(11):
dcbtst CO2, PREC
dcbtst CO3, PREC
dcbtst CO4, PREC
-
+
srawi. r0, TEMP, 2
mtspr CTR, r0
#endif
@@ -1384,7 +1384,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1424,7 +1424,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1816,7 +1816,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -1895,7 +1895,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
#if defined(LN) || defined(RT)
mr AORIG, A
#else
@@ -2622,7 +2622,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3462,7 +3462,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S
index 06b3d9e..cdc6f75 100644
--- a/kernel/power/trsm_kernel_cell_LT.S
+++ b/kernel/power/trsm_kernel_cell_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -259,7 +259,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -367,7 +367,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
@@ -405,7 +405,7 @@ LL(11):
dcbtst CO2, PREC
dcbtst CO3, PREC
dcbtst CO4, PREC
-
+
srawi. r0, KK, 2
mtspr CTR, r0
mr BO, B
@@ -654,7 +654,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -694,7 +694,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1086,7 +1086,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -1307,7 +1307,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1919,7 +1919,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
mr AORIG, A
@@ -2132,7 +2132,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3076,7 +3076,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S
index 51e7bc4..731f52c 100644
--- a/kernel/power/trsm_kernel_cell_RT.S
+++ b/kernel/power/trsm_kernel_cell_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -233,7 +233,7 @@
#ifdef linux
#ifndef __64BIT__
- mr PREA, r10
+ mr PREA, r10
lwz PREB, 8 + STACKSIZE(SP)
lwz PREC, 12 + STACKSIZE(SP)
#else
@@ -492,7 +492,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
@@ -1081,7 +1081,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
mr AORIG, A
@@ -1294,7 +1294,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -2087,7 +2087,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
@@ -2374,7 +2374,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -2414,7 +2414,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -2806,7 +2806,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -3027,7 +3027,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S
index 32f4d0d..109dacb 100644
--- a/kernel/power/trsm_kernel_hummer_LN.S
+++ b/kernel/power/trsm_kernel_hummer_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define ALPHA 0
#define FZERO 8
@@ -70,7 +70,7 @@
#define BO r25
#define AO2 r26
#define BO2 r27
-
+
#define CO1 r28
#define CO2 r29
#define CO3 r30
@@ -122,7 +122,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -272,7 +272,7 @@
mtspr CTR, r0
ble .L44
#endif
-
+
LFPDUX A1, AO, INC4
LFPDUX B1, BO, INC4
LFPDUX B2, BO2, INC4
@@ -1774,7 +1774,7 @@
fxcsmadd f7, B6, A4, f7
LFPDUX A9, AO, INC4
fxcpmadd f11, B4, A4, f11
- nop
+ nop
fxcsmadd f15, B4, A4, f15
bdnz+ .L12
.align 4
@@ -4597,7 +4597,7 @@
mtspr CTR, r0
ble .L114
#endif
-
+
LFPDUX A1, AO, INC2
LFPDUX A2, AO, INC2
LFPDUX B1, BO, INC2
@@ -5428,7 +5428,7 @@
fsmfp f0, f4
fsmfp f1, f5
fsmfp f2, f6
- fsmfp f3, f7
+ fsmfp f3, f7
#endif
#ifdef LT
@@ -5528,7 +5528,7 @@
fsmfp f0, f4
fsmfp f1, f5
fsmfp f2, f6
- fsmfp f3, f7
+ fsmfp f3, f7
#endif
#ifdef RN
diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S
index 027fcf0..1ad062a 100644
--- a/kernel/power/trsm_kernel_hummer_LT.S
+++ b/kernel/power/trsm_kernel_hummer_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define ALPHA 0
#define FZERO 8
@@ -70,7 +70,7 @@
#define BO r25
#define AO2 r26
#define BO2 r27
-
+
#define CO1 r28
#define CO2 r29
#define CO3 r30
@@ -122,7 +122,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -484,7 +484,7 @@
fxcsmadd f7, B6, A4, f7
LFPDUX A9, AO, INC4
fxcpmadd f11, B4, A4, f11
- nop
+ nop
fxcsmadd f15, B4, A4, f15
bdnz+ .L12
.align 4
@@ -2465,7 +2465,7 @@
mtspr CTR, r0
ble .L44
#endif
-
+
LFPDUX A1, AO, INC4
LFPDUX B1, BO, INC4
LFPDUX B2, BO2, INC4
@@ -4667,7 +4667,7 @@
fsmfp f0, f4
fsmfp f1, f5
fsmfp f2, f6
- fsmfp f3, f7
+ fsmfp f3, f7
#endif
#ifdef LT
@@ -4767,7 +4767,7 @@
fsmfp f0, f4
fsmfp f1, f5
fsmfp f2, f6
- fsmfp f3, f7
+ fsmfp f3, f7
#endif
#ifdef RN
@@ -5230,7 +5230,7 @@
mtspr CTR, r0
ble .L114
#endif
-
+
LFPDUX A1, AO, INC2
LFPDUX A2, AO, INC2
LFPDUX B1, BO, INC2
diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S
index e0b5d21..94b3c0c 100644
--- a/kernel/power/trsm_kernel_hummer_RT.S
+++ b/kernel/power/trsm_kernel_hummer_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define ALPHA 0
#define FZERO 8
@@ -70,7 +70,7 @@
#define BO r25
#define AO2 r26
#define BO2 r27
-
+
#define CO1 r28
#define CO2 r29
#define CO3 r30
@@ -122,7 +122,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -527,7 +527,7 @@
fsmfp f0, f4
fsmfp f1, f5
fsmfp f2, f6
- fsmfp f3, f7
+ fsmfp f3, f7
#endif
#ifdef LT
@@ -627,7 +627,7 @@
fsmfp f0, f4
fsmfp f1, f5
fsmfp f2, f6
- fsmfp f3, f7
+ fsmfp f3, f7
#endif
#ifdef RN
@@ -1090,7 +1090,7 @@
mtspr CTR, r0
ble .L114
#endif
-
+
LFPDUX A1, AO, INC2
LFPDUX A2, AO, INC2
LFPDUX B1, BO, INC2
@@ -3355,7 +3355,7 @@
fxcsmadd f7, B6, A4, f7
LFPDUX A9, AO, INC4
fxcpmadd f11, B4, A4, f11
- nop
+ nop
fxcsmadd f15, B4, A4, f15
bdnz+ .L12
.align 4
@@ -5336,7 +5336,7 @@
mtspr CTR, r0
ble .L44
#endif
-
+
LFPDUX A1, AO, INC4
LFPDUX B1, BO, INC4
LFPDUX B2, BO2, INC4
diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S
index 60ba587..2f85cd1 100644
--- a/kernel/power/trsm_kernel_power6_LN.S
+++ b/kernel/power/trsm_kernel_power6_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -794,7 +794,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1406,7 +1406,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1446,7 +1446,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1838,7 +1838,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -1917,7 +1917,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
#if defined(LN) || defined(RT)
mr AORIG, A
#else
@@ -2644,7 +2644,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3484,7 +3484,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S
index 448b163..6b3d21b 100644
--- a/kernel/power/trsm_kernel_power6_LT.S
+++ b/kernel/power/trsm_kernel_power6_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -273,7 +273,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
@@ -650,7 +650,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -690,7 +690,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1082,7 +1082,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -1303,7 +1303,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1915,7 +1915,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
mr AORIG, A
@@ -2128,7 +2128,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3072,7 +3072,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S
index 1f36d17..f6b2e5c 100644
--- a/kernel/power/trsm_kernel_power6_RT.S
+++ b/kernel/power/trsm_kernel_power6_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -423,7 +423,7 @@ LL(78):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
@@ -1012,7 +1012,7 @@ LL(40):
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
mr AORIG, A
@@ -1225,7 +1225,7 @@ LL(48):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -2018,7 +2018,7 @@ LL(10):
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
@@ -2395,7 +2395,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -2435,7 +2435,7 @@ LL(18):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -2827,7 +2827,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -3048,7 +3048,7 @@ LL(28):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S
index 43354c6..265e79e 100644
--- a/kernel/power/trsm_kernel_ppc440_LN.S
+++ b/kernel/power/trsm_kernel_ppc440_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -786,7 +786,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1286,7 +1286,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1326,7 +1326,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1718,7 +1718,7 @@
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -1797,7 +1797,7 @@
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
#if defined(LN) || defined(RT)
mr AORIG, A
#else
@@ -2482,7 +2482,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3284,7 +3284,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S
index eb0d4e4..de7ff74 100644
--- a/kernel/power/trsm_kernel_ppc440_LT.S
+++ b/kernel/power/trsm_kernel_ppc440_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -284,7 +284,7 @@
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
@@ -548,7 +548,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -588,7 +588,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -980,7 +980,7 @@
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -1198,7 +1198,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1796,7 +1796,7 @@
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
mr AORIG, A
@@ -1989,7 +1989,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -2893,7 +2893,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S
index 54c59c2..e8d202d 100644
--- a/kernel/power/trsm_kernel_ppc440_RT.S
+++ b/kernel/power/trsm_kernel_ppc440_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -416,7 +416,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f2, f18, f2
@@ -985,7 +985,7 @@
fmr f5, f0
fmr f6, f0
fmr f7, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
mr AORIG, A
@@ -1178,7 +1178,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1949,7 +1949,7 @@
fmr f13, f0
fmr f14, f0
fmr f15, f0
-
+
srawi. I, M, 2
#if defined(LN) || defined(RT)
@@ -2213,7 +2213,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -2253,7 +2253,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -2645,7 +2645,7 @@
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 2 + BASE_SHIFT
add AORIG, AORIG, r0
@@ -2863,7 +2863,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
diff --git a/kernel/power/zamax.S b/kernel/power/zamax.S
index 6acd96d..2c1e3b7 100644
--- a/kernel/power/zamax.S
+++ b/kernel/power/zamax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
#define INCXM1 r9
diff --git a/kernel/power/zamax_cell.S b/kernel/power/zamax_cell.S
index 2af3d24..a693d5f 100644
--- a/kernel/power/zamax_cell.S
+++ b/kernel/power/zamax_cell.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
#define INCXM1 r9
diff --git a/kernel/power/zamax_hummer.S b/kernel/power/zamax_hummer.S
index 8431239..0f97053 100644
--- a/kernel/power/zamax_hummer.S
+++ b/kernel/power/zamax_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
@@ -86,7 +86,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/zamax_ppc440.S b/kernel/power/zamax_ppc440.S
index 17372bb..276ef42 100644
--- a/kernel/power/zamax_ppc440.S
+++ b/kernel/power/zamax_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREX r8
#define INC1 r9
@@ -86,7 +86,7 @@
sub X, X, INCX
li INC1, SIZE
-
+
cmpwi cr0, N, 0
ble- LL(9999)
cmpwi cr0, INCX, 0
diff --git a/kernel/power/zamin.S b/kernel/power/zamin.S
index 1ab8b6b..52168e4 100644
--- a/kernel/power/zamin.S
+++ b/kernel/power/zamin.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
#define INCXM1 r9
diff --git a/kernel/power/zamin_cell.S b/kernel/power/zamin_cell.S
index 6d32f60..45e4005 100644
--- a/kernel/power/zamin_cell.S
+++ b/kernel/power/zamin_cell.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
#define INCXM1 r9
diff --git a/kernel/power/zamin_hummer.S b/kernel/power/zamin_hummer.S
index 5ac1b89..ff685b6 100644
--- a/kernel/power/zamin_hummer.S
+++ b/kernel/power/zamin_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
@@ -86,7 +86,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/zamin_ppc440.S b/kernel/power/zamin_ppc440.S
index 9d70f76..60888a8 100644
--- a/kernel/power/zamin_ppc440.S
+++ b/kernel/power/zamin_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREX r8
#define INC1 r9
diff --git a/kernel/power/zasum.S b/kernel/power/zasum.S
index 14b58ce..e49011f 100644
--- a/kernel/power/zasum.S
+++ b/kernel/power/zasum.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCXM1 r9
#define PREA r8
diff --git a/kernel/power/zasum_cell.S b/kernel/power/zasum_cell.S
index 7389468..111285d 100644
--- a/kernel/power/zasum_cell.S
+++ b/kernel/power/zasum_cell.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define PREA r8
#define INCXM1 r9
diff --git a/kernel/power/zasum_hummer.S b/kernel/power/zasum_hummer.S
index f090e69..13b6970 100644
--- a/kernel/power/zasum_hummer.S
+++ b/kernel/power/zasum_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -73,7 +73,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
li r10, 0
stwu r10, -4(SP)
stwu r10, -4(SP)
diff --git a/kernel/power/zasum_ppc440.S b/kernel/power/zasum_ppc440.S
index 213c837..dd00c54 100644
--- a/kernel/power/zasum_ppc440.S
+++ b/kernel/power/zasum_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCXM1 r9
#define PREX r8
diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S
index 7eb591d..1acd729 100644
--- a/kernel/power/zaxpy.S
+++ b/kernel/power/zaxpy.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifdef linux
#ifndef __64BIT__
#define N r3
@@ -53,7 +53,7 @@
#else
#define N r3
#define X r8
-#define INCX r9
+#define INCX r9
#define Y r10
#define INCY r4
#define INCXM1 r5
@@ -152,9 +152,9 @@
#ifdef L1_DUALFETCH
li PREA, (L1_PREFETCHSIZE) / 2
#else
- li PREA, (L1_PREFETCHSIZE)
+ li PREA, (L1_PREFETCHSIZE)
#endif
-
+
cmpwi cr0, N, 0
ble- LL(999)
diff --git a/kernel/power/zaxpy_hummer.S b/kernel/power/zaxpy_hummer.S
index 41b3495..23e702e 100644
--- a/kernel/power/zaxpy_hummer.S
+++ b/kernel/power/zaxpy_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define Y r8
#define INCY r9
@@ -481,7 +481,7 @@ LL(117):
LL(999):
li r10, 16
subi SP, SP, 16
-
+
lfpdux f25, SP, r10
lfpdux f24, SP, r10
lfpdux f23, SP, r10
diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S
index 5100e94..1ac2324 100644
--- a/kernel/power/zaxpy_ppc440.S
+++ b/kernel/power/zaxpy_ppc440.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifdef linux
#ifndef __64BIT__
#define N r3
@@ -51,7 +51,7 @@
#else
#define N r3
#define X r8
-#define INCX r9
+#define INCX r9
#define Y r5
#define INCY r4
#define YY r6
@@ -96,7 +96,7 @@
PROFCODE
subi SP, SP, STACKSIZE
-
+
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
diff --git a/kernel/power/zcopy.S b/kernel/power/zcopy.S
index f5ed2f9..b755503 100644
--- a/kernel/power/zcopy.S
+++ b/kernel/power/zcopy.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PREA r8
@@ -69,7 +69,7 @@
#ifdef L1_DUALFETCH
li PREA, (L1_PREFETCHSIZE) / 2
#else
- li PREA, (L1_PREFETCHSIZE)
+ li PREA, (L1_PREFETCHSIZE)
#endif
cmpwi cr0, N, 0
diff --git a/kernel/power/zcopy_hummer.S b/kernel/power/zcopy_hummer.S
index 825b440..cbf4acd 100644
--- a/kernel/power/zcopy_hummer.S
+++ b/kernel/power/zcopy_hummer.S
@@ -38,12 +38,12 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
-#define INCY r7
+#define INCY r7
#define INCX2 r8
#define INCY2 r9
@@ -75,7 +75,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
slwi INCX, INCX, BASE_SHIFT
slwi INCY, INCY, BASE_SHIFT
add INCX2, INCX, INCX
@@ -277,7 +277,7 @@ LL(23):
LL(25):
andi. r0, N, 7
beq LL(29)
-
+
andi. r0, N, 4
beq LL(26)
@@ -416,7 +416,7 @@ LL(33):
LL(35):
andi. r0, N, 7
beq LL(999)
-
+
andi. r0, N, 4
beq LL(36)
@@ -645,7 +645,7 @@ LL(999):
lfpdux f15, SP, r10
lfpdux f14, SP, r10
-
+
addi SP, SP, 16
blr
diff --git a/kernel/power/zdot.S b/kernel/power/zdot.S
index dab7eaa..f6a68aa 100644
--- a/kernel/power/zdot.S
+++ b/kernel/power/zdot.S
@@ -38,19 +38,19 @@
#define ASSEMBLER
#include "common.h"
-
+
#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C)
#define RESULT r3
#define N r4
#define X r5
-#define INCX r6
+#define INCX r6
#define Y r7
#define INCY r8
#define PREA r9
#else
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PREA r8
@@ -127,7 +127,7 @@
#ifdef L1_DUALFETCH
li PREA, (L1_PREFETCHSIZE) / 2
#else
- li PREA, (L1_PREFETCHSIZE)
+ li PREA, (L1_PREFETCHSIZE)
#endif
cmpwi cr0, N, 0
diff --git a/kernel/power/zdot_cell.S b/kernel/power/zdot_cell.S
index 66b7dfa..1fe15df 100644
--- a/kernel/power/zdot_cell.S
+++ b/kernel/power/zdot_cell.S
@@ -38,19 +38,19 @@
#define ASSEMBLER
#include "common.h"
-
+
#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C)
#define RESULT r3
#define N r4
#define X r5
-#define INCX r6
+#define INCX r6
#define Y r7
#define INCY r8
#define PREA r9
#else
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PREA r8
@@ -286,7 +286,7 @@ LL(20):
addi Y, Y, 16 * SIZE
FMADD f2, f9, f16, f2
nop
-
+
FMADD f4, f10, f18, f4
FMADD f7, f10, f19, f7
FMADD f5, f11, f19, f5
diff --git a/kernel/power/zdot_hummer.S b/kernel/power/zdot_hummer.S
index 83027cf..fa5003c 100644
--- a/kernel/power/zdot_hummer.S
+++ b/kernel/power/zdot_hummer.S
@@ -38,18 +38,18 @@
#define ASSEMBLER
#include "common.h"
-
+
#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C)
#define RESULT r3
#define N r4
#define X r5
-#define INCX r6
+#define INCX r6
#define Y r7
#define INCY r8
#else
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#endif
@@ -97,7 +97,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
diff --git a/kernel/power/zdot_ppc440.S b/kernel/power/zdot_ppc440.S
index 3340e65..490418c 100644
--- a/kernel/power/zdot_ppc440.S
+++ b/kernel/power/zdot_ppc440.S
@@ -38,19 +38,19 @@
#define ASSEMBLER
#include "common.h"
-
+
#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C)
#define RESULT r3
#define N r4
#define X r5
-#define INCX r6
+#define INCX r6
#define Y r7
#define INCY r8
#define PRE r9
#else
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PRE r8
diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S
index c936a3d..4a9cbd8 100644
--- a/kernel/power/zgemm_beta.S
+++ b/kernel/power/zgemm_beta.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define C r10
@@ -86,11 +86,11 @@
#endif
#endif
-
+
slwi LDC, LDC, ZBASE_SHIFT
lfs f0, 16(SP)
-
+
fmr ALPHA_R, f1
fmr ALPHA_I, f2
@@ -138,7 +138,7 @@ LL(12):
addi CO1, CO1, 16 * SIZE
bdnz LL(12)
.align 4
-
+
LL(15):
andi. r0, M, 7
mtspr CTR, r0
@@ -211,7 +211,7 @@ LL(22):
dcbtst PRE, CO1
bdnz LL(22)
.align 4
-
+
LL(25):
andi. r0, M, 3
mtspr CTR, r0
diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S
index 5fef0da..3d66895 100644
--- a/kernel/power/zgemm_kernel.S
+++ b/kernel/power/zgemm_kernel.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -105,7 +105,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
PROLOGUE
@@ -335,7 +335,7 @@ LL(11):
LFD f21, 1 * SIZE(B)
LFD f22, 2 * SIZE(B)
LFD f23, 3 * SIZE(B)
-
+
#ifdef POWER5
LFD f28, 4 * SIZE(B)
LFD f29, 5 * SIZE(B)
@@ -564,7 +564,7 @@ LL(12):
LFD f30, 22 * SIZE(BO)
LFD f31, 23 * SIZE(BO)
#endif
-
+
addi AO, AO, 16 * SIZE
addi BO, BO, 16 * SIZE
@@ -831,7 +831,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S
index b55300e..2267e97 100644
--- a/kernel/power/zgemm_kernel_altivec.S
+++ b/kernel/power/zgemm_kernel_altivec.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -1624,7 +1624,7 @@ LL(98):
STFD f8, 0 * SIZE(CO1)
STFD f9, 1 * SIZE(CO1)
.align 4
-
+
LL(999):
mr SP, STACK
diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S
index 7b80e66..9a1407d 100644
--- a/kernel/power/zgemm_kernel_altivec_cell.S
+++ b/kernel/power/zgemm_kernel_altivec_cell.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -1779,7 +1779,7 @@ LL(98):
STFD f8, 0 * SIZE(CO1)
STFD f9, 1 * SIZE(CO1)
.align 4
-
+
LL(999):
mr SP, STACK
diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S
index f827348..4c774a1 100644
--- a/kernel/power/zgemm_kernel_altivec_g4.S
+++ b/kernel/power/zgemm_kernel_altivec_g4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -1678,7 +1678,7 @@ LL(98):
STFD f8, 0 * SIZE(CO1)
STFD f9, 1 * SIZE(CO1)
.align 4
-
+
LL(999):
mr SP, STACK
diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S
index f0d3204..5667b13 100644
--- a/kernel/power/zgemm_kernel_cell.S
+++ b/kernel/power/zgemm_kernel_cell.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -105,7 +105,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
#ifndef DOUBLE
@@ -778,7 +778,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S
index c652adf..af6f88e 100644
--- a/kernel/power/zgemm_kernel_g4.S
+++ b/kernel/power/zgemm_kernel_g4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -660,7 +660,7 @@
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S
index 7378950..991a643 100644
--- a/kernel/power/zgemm_kernel_hummer.S
+++ b/kernel/power/zgemm_kernel_hummer.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#undef ZERO
#define ALPHA 0
@@ -72,7 +72,7 @@
#define BO r25
#define AO2 r26
#define BO2 r27
-
+
#define CO1 r28
#define CO2 r29
#define ZERO r31
@@ -131,7 +131,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -201,7 +201,7 @@
#endif
addi AO, A, -4 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
@@ -1435,7 +1435,7 @@
#endif
addi AO, A, -2 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
@@ -2273,7 +2273,7 @@
#endif
addi AO, A, -4 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
@@ -2629,7 +2629,7 @@
#endif
FXCSMADD f13, B2, A2, f13
nop
-
+
FXCPMADD f2, B5, A8, f2
nop
FXCSMADD f6, B5, A8, f6
@@ -3576,7 +3576,7 @@
#endif
addi AO, A, -2 * SIZE
-
+
li r0, FZERO
lfpsx f0, SP, r0
diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S
index 716fa88..d7d6e2a 100644
--- a/kernel/power/zgemm_kernel_power3.S
+++ b/kernel/power/zgemm_kernel_power3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -99,7 +99,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
#ifndef DOUBLE
@@ -291,7 +291,7 @@ LL(KERNEL_MainSubHead):
LFD f21, 1 * SIZE(B)
LFD f22, 2 * SIZE(B)
LFD f23, 3 * SIZE(B)
-
+
LFD f24, 4 * SIZE(AO)
LFD f25, 5 * SIZE(AO)
LFD f26, 6 * SIZE(AO)
@@ -601,7 +601,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
-
+
addic. I, I, -1
bgt LL(KERNEL_MainSubHead)
.align 4
diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S
index 7f677df..3f79c05 100644
--- a/kernel/power/zgemm_kernel_power6.S
+++ b/kernel/power/zgemm_kernel_power6.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -107,7 +107,7 @@
#define PREA r30
#define PREC r31
-
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FMA1 FMADD
#define FMA2 FMADD
@@ -1056,7 +1056,7 @@ LL(18):
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -1445,7 +1445,7 @@ LL(28):
addi CO2, CO2, 2 * SIZE
addi CO3, CO3, 2 * SIZE
addi CO4, CO4, 2 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -2016,7 +2016,7 @@ LL(38):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -2271,7 +2271,7 @@ LL(48):
addi CO1, CO1, 2 * SIZE
addi CO2, CO2, 2 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -2644,7 +2644,7 @@ LL(58):
fmr f11, f0
addi CO1, CO1, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
@@ -2847,7 +2847,7 @@ LL(68):
STFD f1, 1 * SIZE(CO1)
addi CO1, CO1, 2 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S
index 2a80c97..075fa2b 100644
--- a/kernel/power/zgemm_kernel_ppc440.S
+++ b/kernel/power/zgemm_kernel_ppc440.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -723,7 +723,7 @@
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
-
+
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
diff --git a/kernel/power/zgemm_ncopy_hummer_2.S b/kernel/power/zgemm_ncopy_hummer_2.S
index 9a6f802..8a2ac93 100644
--- a/kernel/power/zgemm_ncopy_hummer_2.S
+++ b/kernel/power/zgemm_ncopy_hummer_2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -77,7 +77,7 @@
stfpdux f14, SP, r0
stfpdux f15, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
@@ -146,7 +146,7 @@ LL(12):
STFPDUX c16, B, INC2
bdnz LL(12)
.align 4
-
+
LL(15):
andi. r0, M, 7
ble LL(19)
@@ -227,7 +227,7 @@ LL(22):
STFPDUX c07, B, INC2
bdnz LL(22)
.align 4
-
+
LL(25):
andi. r0, M, 3
ble LL(99)
@@ -321,7 +321,7 @@ LL(112):
STFPDUX c15, B, INC2
bdnz LL(112)
.align 4
-
+
LL(115):
andi. r0, M, 3
ble LL(119)
@@ -404,7 +404,7 @@ LL(122):
STFPDUX c07, B, INC2
bdnz LL(122)
.align 4
-
+
LL(125):
andi. r0, M, 3
ble LL(999)
diff --git a/kernel/power/zgemm_ncopy_hummer_4.S b/kernel/power/zgemm_ncopy_hummer_4.S
index 0a64d0d..e3c4b71 100644
--- a/kernel/power/zgemm_ncopy_hummer_4.S
+++ b/kernel/power/zgemm_ncopy_hummer_4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -79,7 +79,7 @@
stfpdux f14, SP, r0
stfpdux f15, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
@@ -154,7 +154,7 @@ LL(12):
STFPDUX c16, B, INC2
bdnz LL(12)
.align 4
-
+
LL(15):
andi. r0, M, 3
ble LL(19)
@@ -237,7 +237,7 @@ LL(22):
STFPDUX c08, B, INC2
bdnz LL(22)
.align 4
-
+
LL(25):
andi. r0, M, 3
ble LL(30)
@@ -290,7 +290,7 @@ LL(32):
STFPDUX c04, B, INC2
bdnz LL(32)
.align 4
-
+
LL(35):
andi. r0, M, 3
ble LL(99)
@@ -406,7 +406,7 @@ LL(112):
STFPDUX c16, B, INC2
bdnz LL(112)
.align 4
-
+
LL(115):
andi. r0, M, 3
ble LL(119)
@@ -536,7 +536,7 @@ LL(122):
STFPDUX c15, B, INC2
bdnz LL(122)
.align 4
-
+
LL(125):
andi. r0, M, 3
ble LL(130)
@@ -614,7 +614,7 @@ LL(132):
STFPDUX c07, B, INC2
bdnz LL(132)
.align 4
-
+
LL(135):
andi. r0, M, 3
ble LL(999)
diff --git a/kernel/power/zgemm_tcopy_hummer_2.S b/kernel/power/zgemm_tcopy_hummer_2.S
index bc2a083..d5dea2f 100644
--- a/kernel/power/zgemm_tcopy_hummer_2.S
+++ b/kernel/power/zgemm_tcopy_hummer_2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -55,7 +55,7 @@
#define M4 r29
#define INC r30
#define INC2 r31
-
+
#define c01 f0
#define c02 f1
#define c03 f2
@@ -130,7 +130,7 @@ LL(12):
STFPDUX c04, B1, INC2
bdnz LL(12)
.align 4
-
+
LL(15):
andi. r0, N, 1
ble LL(19)
@@ -168,7 +168,7 @@ LL(22):
STFPDUX c02, B1, INC2
bdnz LL(22)
.align 4
-
+
LL(23):
andi. r0, N, 1
ble LL(99)
@@ -230,7 +230,7 @@ LL(112):
STFPDUX c07, B1, INC2
bdnz LL(112)
.align 4
-
+
LL(115):
andi. r0, N, 1
ble LL(119)
@@ -278,7 +278,7 @@ LL(122):
STFPDUX c03, B1, INC2
bdnz LL(122)
.align 4
-
+
LL(123):
andi. r0, N, 1
ble LL(999)
diff --git a/kernel/power/zgemm_tcopy_hummer_4.S b/kernel/power/zgemm_tcopy_hummer_4.S
index 7011dc2..aae4e73 100644
--- a/kernel/power/zgemm_tcopy_hummer_4.S
+++ b/kernel/power/zgemm_tcopy_hummer_4.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M r3
#define N r4
#define A r5
@@ -57,7 +57,7 @@
#define M4 r29
#define INC r30
#define INC2 r31
-
+
#define c01 f0
#define c02 f1
#define c03 f2
@@ -184,7 +184,7 @@ LL(12):
STFPDUX c16, B1, INC2
bdnz LL(12)
.align 4
-
+
LL(15):
andi. r0, N, 3
ble LL(19)
@@ -271,7 +271,7 @@ LL(22):
STFPDUX c08, B1, INC2
bdnz LL(22)
.align 4
-
+
LL(23):
andi. r0, N, 2
ble LL(24)
@@ -323,7 +323,7 @@ LL(32):
STFPDUX c04, B1, INC2
bdnz LL(32)
.align 4
-
+
LL(33):
andi. r0, N, 2
ble LL(34)
@@ -447,7 +447,7 @@ LL(112):
STFPDUX c16, B1, INC2
bdnz LL(112)
.align 4
-
+
LL(115):
andi. r0, N, 3
ble LL(119)
@@ -576,7 +576,7 @@ LL(122):
STFPDUX c15, B1, INC2
bdnz LL(122)
.align 4
-
+
LL(123):
andi. r0, N, 2
ble LL(124)
@@ -651,7 +651,7 @@ LL(132):
STFPDUX c07, B1, INC2
bdnz LL(132)
.align 4
-
+
LL(133):
andi. r0, N, 2
ble LL(134)
diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S
index 00ba966..ba4685d 100644
--- a/kernel/power/zgemv_n.S
+++ b/kernel/power/zgemv_n.S
@@ -82,7 +82,7 @@
#define INCY r7
#endif
#endif
-
+
#define I r11
#define J r12
@@ -2155,7 +2155,7 @@ LL(37):
add Y2, Y2, INCY
b LL(999)
.align 4
-
+
LL(100):
srawi. J, N, 2
ble LL(120)
diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S
index 690eb0d..31e7202 100644
--- a/kernel/power/zgemv_n_ppc440.S
+++ b/kernel/power/zgemv_n_ppc440.S
@@ -82,7 +82,7 @@
#define INCY r7
#endif
#endif
-
+
#define I r11
#define J r12
@@ -1241,7 +1241,7 @@ LL(37):
STFDU y01, 1 * SIZE(Y2)
STFDU y02, 1 * SIZE(Y2)
.align 4
-
+
LL(990):
cmpi cr0, 0, INCY, SIZE
beq LL(999)
diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S
index 057c04d..bd8ac40 100644
--- a/kernel/power/zgemv_t.S
+++ b/kernel/power/zgemv_t.S
@@ -257,7 +257,7 @@
stfd f1, ALPHA_R
stfd f2, ALPHA_I
-
+
mullw PLDA_M, LDA, N
li XP, P
subf PLDA_M, XP, PLDA_M
@@ -669,7 +669,7 @@ LL(MainKernel):
addi BO, BO, 16 * SIZE
bdnz LL(MainKernel)
- .align 4
+ .align 4
LL(MainKernelSkip):
FMADD f0, f16, f24, f0
@@ -984,7 +984,7 @@ LL(MainN3Kernel):
addi AO4, AO4, 2 * SIZE
bdnz LL(MainN3Kernel)
- .align 4
+ .align 4
LL(MainN3KernelSkip):
FMADD f0, f16, f24, f0
@@ -1159,7 +1159,7 @@ LL(FinishN1):
cmpwi cr0, J, 0
bgt LL(MainHead)
.align 4
-
+
LL(Remain):
andi. J, N, 3
ble LL(ISEnd)
@@ -1301,7 +1301,7 @@ LL(RemainKernel):
DCBT(AO1, PREA)
bdnz LL(RemainKernel)
- .align 4
+ .align 4
LL(RemainKernelSkip):
FMADD f0, f16, f24, f0
@@ -1393,7 +1393,7 @@ LL(RemainN3Kernel):
LFDU f25, 2 * SIZE(BO)
addi AO1, AO1, 2 * SIZE
bdnz LL(RemainN3Kernel)
- .align 4
+ .align 4
LL(RemainN3KernelSkip):
FMADD f0, f16, f24, f0
diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S
index edb5183..043b9e3 100644
--- a/kernel/power/zgemv_t_ppc440.S
+++ b/kernel/power/zgemv_t_ppc440.S
@@ -542,7 +542,7 @@ LL(12):
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b3, y8
bdnz LL(12)
- .align 4
+ .align 4
LL(13):
FMADD1 y1, a1, b1, y1
@@ -804,7 +804,7 @@ LL(19):
cmpwi cr0, J, 0
bgt LL(11)
.align 4
-
+
LL(20):
andi. J, N, 2
ble LL(30)
@@ -920,7 +920,7 @@ LL(22):
FMADD4 y4, a4, b3, y4
bdnz LL(22)
- .align 4
+ .align 4
LL(23):
FMADD1 y1, a1, b1, y1
@@ -1147,7 +1147,7 @@ LL(32):
LFDU a2, 1 * SIZE(AO1)
bdnz LL(32)
- .align 4
+ .align 4
LL(33):
FMADD1 y1, a1, b1, y1
diff --git a/kernel/power/zger.S b/kernel/power/zger.S
index 03d0bca..01cb907 100644
--- a/kernel/power/zger.S
+++ b/kernel/power/zger.S
@@ -342,7 +342,7 @@ LL(06):
addi X1, X1, 2 * SIZE
bdnz+ LL(06)
.align 4
-
+
LL(10):
srawi. J, N, 1
ble LL(20)
@@ -937,7 +937,7 @@ LL(19):
cmpi cr0, 0, J, 0
bgt LL(11)
.align 4
-
+
LL(20):
andi. J, N, 1
ble LL(999)
diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S
index ded25fd..60f379d 100644
--- a/kernel/power/znrm2.S
+++ b/kernel/power/znrm2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define NN r6
#define XX r7
diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S
index b6deb94..1d0c598 100644
--- a/kernel/power/znrm2_hummer.S
+++ b/kernel/power/znrm2_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define INCX2 r6
#define X2 r7
@@ -91,7 +91,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
-
+
stfpdux f16, SP, r10
stfpdux f17, SP, r10
stfpdux f18, SP, r10
@@ -309,7 +309,7 @@ LL(20):
fdiv ALPHA_R, ALPHA_R, ALPHA
lfpsx C1, SP, r10 # Zero clear
-
+
fpmr C2, C1
fpmr C3, C1
fpmr C4, C1
@@ -755,7 +755,7 @@ LL(120):
fdiv ALPHA_R, ALPHA_R, ALPHA
lfpsx C1, SP, r10 # Zero clear
-
+
fpmr C2, C1
fpmr C3, C1
fpmr C4, C1
diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S
index 3542279..778b805 100644
--- a/kernel/power/znrm2_ppc440.S
+++ b/kernel/power/znrm2_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define NN r6
#define XX r7
@@ -107,7 +107,7 @@
sub X, X, INCX
li INC1, SIZE
- li PRE, 3 * 16 * SIZE
+ li PRE, 3 * 16 * SIZE
cmpwi cr0, N, 0
ble- LL(999)
diff --git a/kernel/power/zrot.S b/kernel/power/zrot.S
index aad28af..3ec4277 100644
--- a/kernel/power/zrot.S
+++ b/kernel/power/zrot.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PREA r8
@@ -88,7 +88,7 @@
srawi. r0, N, 3
mtspr CTR, r0
beq- cr0, LL(50)
-
+
LFD f0, 0 * SIZE(X)
LFD f4, 1 * SIZE(X)
LFD f6, 2 * SIZE(X)
diff --git a/kernel/power/zrot_ppc440.S b/kernel/power/zrot_ppc440.S
index fe1a99d..abde97e 100644
--- a/kernel/power/zrot_ppc440.S
+++ b/kernel/power/zrot_ppc440.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r4
-#define INCX r5
+#define INCX r5
#define Y r6
#define INCY r7
#define PRE r8
diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S
index 7ffa80f..2eb7b0d 100644
--- a/kernel/power/zscal.S
+++ b/kernel/power/zscal.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define XX r4
#define PREA r5
@@ -66,7 +66,7 @@
#define FZERO f0
#define ALPHA_R f1
#define ALPHA_I f2
-
+
PROLOGUE
PROFCODE
@@ -80,7 +80,7 @@
#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE)
lwz INCX, 56(SP)
#endif
-
+
slwi INCX, INCX, ZBASE_SHIFT
li PREA, L1_PREFETCHSIZE
diff --git a/kernel/power/zscal_hummer.S b/kernel/power/zscal_hummer.S
index 6c559f3..56fd5d1 100644
--- a/kernel/power/zscal_hummer.S
+++ b/kernel/power/zscal_hummer.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define INCX2 r4
#define XX r5
@@ -78,7 +78,7 @@
stfpdux f15, SP, r10
stfpdux f16, SP, r10
stfpdux f17, SP, r10
-
+
li r10, 0
stwu r10, -4(SP)
stwu r10, -4(SP)
diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S
index 9f120ac..d0e4c9b 100644
--- a/kernel/power/zscal_ppc440.S
+++ b/kernel/power/zscal_ppc440.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N r3
#define XX r4
#define PRE r5
@@ -68,7 +68,7 @@
#define FZERO f0
#define ALPHA_R f1
#define ALPHA_I f2
-
+
PROLOGUE
PROFCODE
@@ -82,7 +82,7 @@
#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE)
lwz INCX, 56(SP)
#endif
-
+
slwi INCX, INCX, ZBASE_SHIFT
li INC1, SIZE
sub X, X, INCX
diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S
index 4c23c1d..048e8ac 100644
--- a/kernel/power/zswap.S
+++ b/kernel/power/zswap.S
@@ -38,12 +38,12 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifdef linux
#ifndef __64BIT__
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define Y r8
#define INCY r9
#define PREA r4
@@ -52,7 +52,7 @@
#else
#define N r3
#define X r8
-#define INCX r9
+#define INCX r9
#define Y r10
#define INCY r4
#define PREA r5
@@ -65,7 +65,7 @@
#if !defined(__64BIT__) && defined(DOUBLE)
#define N r3
#define X r10
-#define INCX r4
+#define INCX r4
#define Y r5
#define INCY r6
#define PREA r7
@@ -74,7 +74,7 @@
#else
#define N r3
#define X r8
-#define INCX r9
+#define INCX r9
#define Y r10
#define INCY r4
#define PREA r5
@@ -120,7 +120,7 @@
#if defined(linux) && defined(__64BIT__)
ld INCY, 112 + STACKSIZE(SP)
#endif
-
+
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld INCY, 112 + STACKSIZE(SP)
@@ -143,7 +143,7 @@
#ifdef L1_DUALFETCH
li PREA, (L1_PREFETCHSIZE) / 2
#else
- li PREA, (L1_PREFETCHSIZE)
+ li PREA, (L1_PREFETCHSIZE)
#endif
cmpwi cr0, N, 0
diff --git a/kernel/power/zswap_hummer.S b/kernel/power/zswap_hummer.S
index 335eaa1..3674cdc 100644
--- a/kernel/power/zswap_hummer.S
+++ b/kernel/power/zswap_hummer.S
@@ -41,9 +41,9 @@
#define N r3
#define X r6
-#define INCX r7
+#define INCX r7
#define Y r8
-#define INCY r9
+#define INCY r9
#define INCX2 r4
#define INCY2 r5
@@ -78,7 +78,7 @@
stfpdux f14, SP, r10
stfpdux f15, SP, r10
stfpdux f16, SP, r10
-
+
slwi INCX, INCX, BASE_SHIFT
slwi INCY, INCY, BASE_SHIFT
add INCX2, INCX, INCX
@@ -283,7 +283,7 @@ LL(23):
LL(25):
andi. r0, N, 3
beq LL(29)
-
+
andi. r0, N, 2
beq LL(27)
@@ -428,7 +428,7 @@ LL(33):
LL(35):
andi. r0, N, 3
beq LL(39)
-
+
andi. r0, N, 2
beq LL(37)
@@ -658,7 +658,7 @@ LL(999):
lfpdux f16, SP, r10
lfpdux f15, SP, r10
lfpdux f14, SP, r10
-
+
addi SP, SP, 16
blr
diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S
index 0dca84d..ad4a8cd 100644
--- a/kernel/power/zsymv_L.S
+++ b/kernel/power/zsymv_L.S
@@ -100,7 +100,7 @@
#define TEMP r22
#define PREA r24
#define IS r25
-
+
#define y01 f0
#define y02 f1
#define y03 f2
@@ -1455,7 +1455,7 @@ LL(18):
STFD y04, 3 * SIZE(YY)
ble LL(11)
.align 4
-
+
LL(20):
andi. TEMP, N, 1
ble LL(990)
@@ -1505,7 +1505,7 @@ LL(20):
STFD y01, 0 * SIZE(YY)
STFD y02, 1 * SIZE(YY)
.align 4
-
+
LL(990):
cmpwi cr0, INCY, 2 * SIZE
beq LL(999)
diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S
index dbf6ebb..4032b66 100644
--- a/kernel/power/zsymv_U.S
+++ b/kernel/power/zsymv_U.S
@@ -295,7 +295,7 @@
li PREA, PREFETCHSIZE_A * SIZE
sub IS, M, IS
-
+
cmpwi cr0, M, 0
ble- LL(999)
@@ -1393,7 +1393,7 @@ LL(18):
STFD y04, 3 * SIZE(YY)
ble LL(11)
.align 4
-
+
LL(20):
andi. TEMP, M, 1
ble LL(990)
@@ -1485,7 +1485,7 @@ LL(28):
STFD y01, 0 * SIZE(YY)
STFD y02, 1 * SIZE(YY)
.align 4
-
+
LL(990):
cmpwi cr0, INCY, 2 * SIZE
beq LL(999)
diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S
index e31a887..64fb968 100644
--- a/kernel/power/ztrsm_kernel_LN.S
+++ b/kernel/power/ztrsm_kernel_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -106,7 +106,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
PROLOGUE
@@ -1074,7 +1074,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1094,7 +1094,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1431,7 +1431,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S
index f7153b7..ae4615c 100644
--- a/kernel/power/ztrsm_kernel_LT.S
+++ b/kernel/power/ztrsm_kernel_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -106,7 +106,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
PROLOGUE
@@ -652,7 +652,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -672,7 +672,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1009,7 +1009,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S
index 55bc29b..f756dda 100644
--- a/kernel/power/ztrsm_kernel_RT.S
+++ b/kernel/power/ztrsm_kernel_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -106,7 +106,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
PROLOGUE
@@ -1049,7 +1049,7 @@ LL(49):
.align 4
-LL(30):
+LL(30):
srawi. J, N, 1
ble LL(999)
.align 4
@@ -1402,7 +1402,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1422,7 +1422,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1759,7 +1759,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S
index c284a0e..2427a4d 100644
--- a/kernel/power/ztrsm_kernel_cell_LN.S
+++ b/kernel/power/ztrsm_kernel_cell_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -106,7 +106,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
#ifndef DOUBLE
@@ -1038,7 +1038,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1058,7 +1058,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1395,7 +1395,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S
index ca80100..0d88ded 100644
--- a/kernel/power/ztrsm_kernel_cell_LT.S
+++ b/kernel/power/ztrsm_kernel_cell_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -106,7 +106,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
#ifndef DOUBLE
@@ -641,7 +641,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -661,7 +661,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -998,7 +998,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S
index f1139fd..84f2089 100644
--- a/kernel/power/ztrsm_kernel_cell_RT.S
+++ b/kernel/power/ztrsm_kernel_cell_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -106,7 +106,7 @@
#define PREA r30
#define PREC r31
#define PREB PREA
-
+
#ifndef NEEDPARAM
#ifndef DOUBLE
@@ -993,7 +993,7 @@ LL(49):
.align 4
-LL(30):
+LL(30):
srawi. J, N, 1
ble LL(999)
.align 4
@@ -1362,7 +1362,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1382,7 +1382,7 @@ LL(KERNEL_MainFinish):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1719,7 +1719,7 @@ LL(KERNEL_MainFinish):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S
index 9e9697d..bf3eafa 100644
--- a/kernel/power/ztrsm_kernel_hummer_LN.S
+++ b/kernel/power/ztrsm_kernel_hummer_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#undef ZERO
#define ALPHA 0
@@ -73,7 +73,7 @@
#define BO r25
#define AO2 r26
#define BO2 r27
-
+
#define CO1 r28
#define CO2 r29
#define ZERO r31
@@ -145,7 +145,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -194,7 +194,7 @@
li INCM7, -7 * SIZE
addi C, C, - 1 * SIZE
-
+
#ifdef LN
mullw r0, M, K
slwi r0, r0, ZBASE_SHIFT
@@ -255,7 +255,7 @@
li r0, FZERO
lfpsx f0, SP, r0
-
+
andi. I, M, 1
beq .L20
@@ -539,7 +539,7 @@
li r0, FZERO
lfpsx f0, SP, r0
.align 4
-
+
.L20:
andi. I, M, 2
beq .L30
diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S
index 6da6c72..865c85f 100644
--- a/kernel/power/ztrsm_kernel_hummer_LT.S
+++ b/kernel/power/ztrsm_kernel_hummer_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#undef ZERO
#define ALPHA 0
@@ -73,7 +73,7 @@
#define BO r25
#define AO2 r26
#define BO2 r27
-
+
#define CO1 r28
#define CO2 r29
#define ZERO r31
@@ -145,7 +145,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -194,7 +194,7 @@
li INCM7, -7 * SIZE
addi C, C, - 1 * SIZE
-
+
#ifdef LN
mullw r0, M, K
slwi r0, r0, ZBASE_SHIFT
diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S
index 8670cea..99868f9 100644
--- a/kernel/power/ztrsm_kernel_hummer_RT.S
+++ b/kernel/power/ztrsm_kernel_hummer_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#undef ZERO
#define ALPHA 0
@@ -73,7 +73,7 @@
#define BO r25
#define AO2 r26
#define BO2 r27
-
+
#define CO1 r28
#define CO2 r29
#define ZERO r31
@@ -145,7 +145,7 @@
stfpdux f29, SP, r0
stfpdux f30, SP, r0
stfpdux f31, SP, r0
-
+
stwu r31, -4(SP)
stwu r30, -4(SP)
stwu r29, -4(SP)
@@ -194,7 +194,7 @@
li INCM7, -7 * SIZE
addi C, C, - 1 * SIZE
-
+
#ifdef LN
mullw r0, M, K
slwi r0, r0, ZBASE_SHIFT
@@ -1266,7 +1266,7 @@
#endif
.align 4
-.L50:
+.L50:
srawi. J, N, 1
ble .L999
.align 4
diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S
index 7a3b286..42239bb 100644
--- a/kernel/power/ztrsm_kernel_power6_LN.S
+++ b/kernel/power/ztrsm_kernel_power6_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -103,7 +103,7 @@
#define PREA r30
#define PREC r31
-
+
#ifndef CONJ
#define FMA1 FMADD
#define FMA2 FMADD
@@ -580,7 +580,7 @@ LL(27):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f4, f18, f4
@@ -1049,7 +1049,7 @@ LL(27):
addi CO3, CO3, 2 * SIZE
addi CO4, CO4, 2 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 0 + ZBASE_SHIFT
add AORIG, AORIG, r0
@@ -1677,7 +1677,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f4, f18, f4
@@ -1697,7 +1697,7 @@ LL(18):
LFD f25, 9 * SIZE(BO)
LFD f26, 10 * SIZE(BO)
LFD f27, 11 * SIZE(BO)
-
+
FSUB f2, f24, f2
FSUB f3, f25, f3
FSUB f6, f26, f6
@@ -1724,7 +1724,7 @@ LL(18):
FSUB f1, f17, f1
FSUB f2, f18, f2
FSUB f3, f19, f3
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1739,7 +1739,7 @@ LL(18):
LFD f25, 9 * SIZE(AO)
LFD f26, 10 * SIZE(AO)
LFD f27, 11 * SIZE(AO)
-
+
FSUB f8, f24, f8
FSUB f9, f25, f9
FSUB f10, f26, f10
@@ -2574,7 +2574,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
@@ -3505,7 +3505,7 @@ LL(38):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -3525,7 +3525,7 @@ LL(38):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3862,7 +3862,7 @@ LL(38):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S
index b7c3441..dfae4d6 100644
--- a/kernel/power/ztrsm_kernel_power6_LT.S
+++ b/kernel/power/ztrsm_kernel_power6_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -103,7 +103,7 @@
#define PREA r30
#define PREC r31
-
+
#ifndef CONJ
#define FMA1 FMADD
#define FMA2 FMADD
@@ -914,7 +914,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f4, f18, f4
@@ -934,7 +934,7 @@ LL(18):
LFD f25, 9 * SIZE(BO)
LFD f26, 10 * SIZE(BO)
LFD f27, 11 * SIZE(BO)
-
+
FSUB f2, f24, f2
FSUB f3, f25, f3
FSUB f6, f26, f6
@@ -961,7 +961,7 @@ LL(18):
FSUB f1, f17, f1
FSUB f2, f18, f2
FSUB f3, f19, f3
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -976,7 +976,7 @@ LL(18):
LFD f25, 9 * SIZE(AO)
LFD f26, 10 * SIZE(AO)
LFD f27, 11 * SIZE(AO)
-
+
FSUB f8, f24, f8
FSUB f9, f25, f9
FSUB f10, f26, f10
@@ -1811,7 +1811,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
@@ -2107,7 +2107,7 @@ LL(27):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f4, f18, f4
@@ -2556,7 +2556,7 @@ LL(27):
addi CO3, CO3, 2 * SIZE
addi CO4, CO4, 2 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 0 + ZBASE_SHIFT
add AORIG, AORIG, r0
@@ -3060,7 +3060,7 @@ LL(38):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -3080,7 +3080,7 @@ LL(38):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3417,7 +3417,7 @@ LL(38):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S
index 069a73c..79f8b70 100644
--- a/kernel/power/ztrsm_kernel_power6_RT.S
+++ b/kernel/power/ztrsm_kernel_power6_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -103,7 +103,7 @@
#define PREA r30
#define PREC r31
-
+
#ifndef CONJ
#define FMA1 FMADD
#define FMA2 FMADD
@@ -1462,7 +1462,7 @@ LL(38):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1482,7 +1482,7 @@ LL(38):
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1819,7 +1819,7 @@ LL(38):
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
@@ -2945,7 +2945,7 @@ LL(18):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f4, f18, f4
@@ -2965,7 +2965,7 @@ LL(18):
LFD f25, 9 * SIZE(BO)
LFD f26, 10 * SIZE(BO)
LFD f27, 11 * SIZE(BO)
-
+
FSUB f2, f24, f2
FSUB f3, f25, f3
FSUB f6, f26, f6
@@ -2992,7 +2992,7 @@ LL(18):
FSUB f1, f17, f1
FSUB f2, f18, f2
FSUB f3, f19, f3
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -3007,7 +3007,7 @@ LL(18):
LFD f25, 9 * SIZE(AO)
LFD f26, 10 * SIZE(AO)
LFD f27, 11 * SIZE(AO)
-
+
FSUB f8, f24, f8
FSUB f9, f25, f9
FSUB f10, f26, f10
@@ -3842,7 +3842,7 @@ LL(18):
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
@@ -4138,7 +4138,7 @@ LL(27):
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
FSUB f0, f16, f0
FSUB f1, f17, f1
FSUB f4, f18, f4
@@ -4587,7 +4587,7 @@ LL(27):
addi CO3, CO3, 2 * SIZE
addi CO4, CO4, 2 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 0 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S
index fdcf5be..51db719 100644
--- a/kernel/power/ztrsm_kernel_ppc440_LN.S
+++ b/kernel/power/ztrsm_kernel_ppc440_LN.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -1017,7 +1017,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1037,7 +1037,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1374,7 +1374,7 @@
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S
index a9c98dd..b5e23b3 100644
--- a/kernel/power/ztrsm_kernel_ppc440_LT.S
+++ b/kernel/power/ztrsm_kernel_ppc440_LT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -595,7 +595,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -615,7 +615,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -952,7 +952,7 @@
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S
index c9b794e..2bb374d 100644
--- a/kernel/power/ztrsm_kernel_ppc440_RT.S
+++ b/kernel/power/ztrsm_kernel_ppc440_RT.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#ifndef __64BIT__
#define LOAD lwz
#else
@@ -975,7 +975,7 @@
.align 4
-.L30:
+.L30:
srawi. J, N, 1
ble .L999
.align 4
@@ -1325,7 +1325,7 @@
LFD f17, 1 * SIZE(BO)
LFD f18, 2 * SIZE(BO)
LFD f19, 3 * SIZE(BO)
-
+
LFD f20, 4 * SIZE(BO)
LFD f21, 5 * SIZE(BO)
LFD f22, 6 * SIZE(BO)
@@ -1345,7 +1345,7 @@
LFD f17, 1 * SIZE(AO)
LFD f18, 2 * SIZE(AO)
LFD f19, 3 * SIZE(AO)
-
+
LFD f20, 4 * SIZE(AO)
LFD f21, 5 * SIZE(AO)
LFD f22, 6 * SIZE(AO)
@@ -1682,7 +1682,7 @@
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
#endif
-
+
#ifdef RT
slwi r0, K, 1 + ZBASE_SHIFT
add AORIG, AORIG, r0
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index cf868f1..5086420 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -69,9 +69,9 @@ gotoblas_t TABLE_NAME = {
sgemv_nTS, sgemv_tTS, sger_kTS,
ssymv_LTS, ssymv_UTS,
- sgemm_kernelTS, sgemm_betaTS,
+ sgemm_kernelTS, sgemm_betaTS,
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
- sgemm_incopyTS, sgemm_itcopyTS,
+ sgemm_incopyTS, sgemm_itcopyTS,
#else
sgemm_oncopyTS, sgemm_otcopyTS,
#endif
@@ -97,7 +97,7 @@ gotoblas_t TABLE_NAME = {
strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
- ssymm_iutcopyTS, ssymm_iltcopyTS,
+ ssymm_iutcopyTS, ssymm_iltcopyTS,
#else
ssymm_outcopyTS, ssymm_oltcopyTS,
#endif
@@ -119,9 +119,9 @@ gotoblas_t TABLE_NAME = {
dgemv_nTS, dgemv_tTS, dger_kTS,
dsymv_LTS, dsymv_UTS,
- dgemm_kernelTS, dgemm_betaTS,
+ dgemm_kernelTS, dgemm_betaTS,
#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
- dgemm_incopyTS, dgemm_itcopyTS,
+ dgemm_incopyTS, dgemm_itcopyTS,
#else
dgemm_oncopyTS, dgemm_otcopyTS,
#endif
@@ -147,7 +147,7 @@ gotoblas_t TABLE_NAME = {
dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
- dsymm_iutcopyTS, dsymm_iltcopyTS,
+ dsymm_iutcopyTS, dsymm_iltcopyTS,
#else
dsymm_outcopyTS, dsymm_oltcopyTS,
#endif
@@ -171,9 +171,9 @@ gotoblas_t TABLE_NAME = {
qgemv_nTS, qgemv_tTS, qger_kTS,
qsymv_LTS, qsymv_UTS,
- qgemm_kernelTS, qgemm_betaTS,
+ qgemm_kernelTS, qgemm_betaTS,
#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
- qgemm_incopyTS, qgemm_itcopyTS,
+ qgemm_incopyTS, qgemm_itcopyTS,
#else
qgemm_oncopyTS, qgemm_otcopyTS,
#endif
@@ -199,7 +199,7 @@ gotoblas_t TABLE_NAME = {
qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
- qsymm_iutcopyTS, qsymm_iltcopyTS,
+ qsymm_iutcopyTS, qsymm_iltcopyTS,
#else
qsymm_outcopyTS, qsymm_oltcopyTS,
#endif
@@ -219,14 +219,14 @@ gotoblas_t TABLE_NAME = {
camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
cnrm2_kTS, casum_kTS, ccopy_kTS,
cdotu_kTS, cdotc_kTS, csrot_kTS,
- caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
+ caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
- cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS,
- cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS,
- cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS,
+ cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS,
+ cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS,
+ cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS,
csymv_LTS, csymv_UTS,
chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS,
-
+
cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS,
cgemm_betaTS,
@@ -236,10 +236,10 @@ gotoblas_t TABLE_NAME = {
cgemm_oncopyTS, cgemm_otcopyTS,
#endif
cgemm_oncopyTS, cgemm_otcopyTS,
-
+
ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS,
ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS,
-
+
#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
ctrsm_iunucopyTS, ctrsm_iunncopyTS, ctrsm_iutucopyTS, ctrsm_iutncopyTS,
ctrsm_ilnucopyTS, ctrsm_ilnncopyTS, ctrsm_iltucopyTS, ctrsm_iltncopyTS,
@@ -249,10 +249,10 @@ gotoblas_t TABLE_NAME = {
#endif
ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS,
ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS,
-
+
ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS,
ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS,
-
+
#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
ctrmm_iunucopyTS, ctrmm_iunncopyTS, ctrmm_iutucopyTS, ctrmm_iutncopyTS,
ctrmm_ilnucopyTS, ctrmm_ilnncopyTS, ctrmm_iltucopyTS, ctrmm_iltncopyTS,
@@ -262,7 +262,7 @@ gotoblas_t TABLE_NAME = {
#endif
ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS,
ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS,
-
+
#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
csymm_iutcopyTS, csymm_iltcopyTS,
#else
@@ -275,16 +275,16 @@ gotoblas_t TABLE_NAME = {
chemm_outcopyTS, chemm_oltcopyTS,
#endif
chemm_outcopyTS, chemm_oltcopyTS,
-
+
cgemm3m_kernelTS,
-
+
cgemm3m_incopybTS, cgemm3m_incopyrTS,
cgemm3m_incopyiTS, cgemm3m_itcopybTS,
cgemm3m_itcopyrTS, cgemm3m_itcopyiTS,
cgemm3m_oncopybTS, cgemm3m_oncopyrTS,
cgemm3m_oncopyiTS, cgemm3m_otcopybTS,
cgemm3m_otcopyrTS, cgemm3m_otcopyiTS,
-
+
csymm3m_iucopybTS, csymm3m_ilcopybTS,
csymm3m_iucopyrTS, csymm3m_ilcopyrTS,
csymm3m_iucopyiTS, csymm3m_ilcopyiTS,
@@ -294,7 +294,7 @@ gotoblas_t TABLE_NAME = {
chemm3m_iucopybTS, chemm3m_ilcopybTS,
chemm3m_iucopyrTS, chemm3m_ilcopyrTS,
- chemm3m_iucopyiTS, chemm3m_ilcopyiTS,
+ chemm3m_iucopyiTS, chemm3m_ilcopyiTS,
chemm3m_oucopybTS, chemm3m_olcopybTS,
chemm3m_oucopyrTS, chemm3m_olcopyrTS,
@@ -312,11 +312,11 @@ gotoblas_t TABLE_NAME = {
zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
znrm2_kTS, zasum_kTS, zcopy_kTS,
zdotu_kTS, zdotc_kTS, zdrot_kTS,
- zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
+ zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
- zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS,
- zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS,
- zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS,
+ zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS,
+ zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS,
+ zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS,
zsymv_LTS, zsymv_UTS,
zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS,
@@ -329,10 +329,10 @@ gotoblas_t TABLE_NAME = {
zgemm_oncopyTS, zgemm_otcopyTS,
#endif
zgemm_oncopyTS, zgemm_otcopyTS,
-
+
ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS,
ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS,
-
+
#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
ztrsm_iunucopyTS, ztrsm_iunncopyTS, ztrsm_iutucopyTS, ztrsm_iutncopyTS,
ztrsm_ilnucopyTS, ztrsm_ilnncopyTS, ztrsm_iltucopyTS, ztrsm_iltncopyTS,
@@ -342,10 +342,10 @@ gotoblas_t TABLE_NAME = {
#endif
ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS,
ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS,
-
+
ztrmm_kernel_RNTS, ztrmm_kernel_RTTS, ztrmm_kernel_RRTS, ztrmm_kernel_RCTS,
ztrmm_kernel_LNTS, ztrmm_kernel_LTTS, ztrmm_kernel_LRTS, ztrmm_kernel_LCTS,
-
+
#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
ztrmm_iunucopyTS, ztrmm_iunncopyTS, ztrmm_iutucopyTS, ztrmm_iutncopyTS,
ztrmm_ilnucopyTS, ztrmm_ilnncopyTS, ztrmm_iltucopyTS, ztrmm_iltncopyTS,
@@ -355,7 +355,7 @@ gotoblas_t TABLE_NAME = {
#endif
ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS,
ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS,
-
+
#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
zsymm_iutcopyTS, zsymm_iltcopyTS,
#else
@@ -368,16 +368,16 @@ gotoblas_t TABLE_NAME = {
zhemm_outcopyTS, zhemm_oltcopyTS,
#endif
zhemm_outcopyTS, zhemm_oltcopyTS,
-
+
zgemm3m_kernelTS,
-
+
zgemm3m_incopybTS, zgemm3m_incopyrTS,
zgemm3m_incopyiTS, zgemm3m_itcopybTS,
zgemm3m_itcopyrTS, zgemm3m_itcopyiTS,
zgemm3m_oncopybTS, zgemm3m_oncopyrTS,
zgemm3m_oncopyiTS, zgemm3m_otcopybTS,
zgemm3m_otcopyrTS, zgemm3m_otcopyiTS,
-
+
zsymm3m_iucopybTS, zsymm3m_ilcopybTS,
zsymm3m_iucopyrTS, zsymm3m_ilcopyrTS,
zsymm3m_iucopyiTS, zsymm3m_ilcopyiTS,
@@ -387,7 +387,7 @@ gotoblas_t TABLE_NAME = {
zhemm3m_iucopybTS, zhemm3m_ilcopybTS,
zhemm3m_iucopyrTS, zhemm3m_ilcopyrTS,
- zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS,
+ zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS,
zhemm3m_oucopybTS, zhemm3m_olcopybTS,
zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
@@ -407,11 +407,11 @@ gotoblas_t TABLE_NAME = {
xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
xnrm2_kTS, xasum_kTS, xcopy_kTS,
xdotu_kTS, xdotc_kTS, xqrot_kTS,
- xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,
+ xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,
- xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS,
- xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS,
- xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS,
+ xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS,
+ xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS,
+ xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS,
xsymv_LTS, xsymv_UTS,
xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS,
@@ -424,10 +424,10 @@ gotoblas_t TABLE_NAME = {
xgemm_oncopyTS, xgemm_otcopyTS,
#endif
xgemm_oncopyTS, xgemm_otcopyTS,
-
+
xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS,
xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS,
-
+
#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
xtrsm_iunucopyTS, xtrsm_iunncopyTS, xtrsm_iutucopyTS, xtrsm_iutncopyTS,
xtrsm_ilnucopyTS, xtrsm_ilnncopyTS, xtrsm_iltucopyTS, xtrsm_iltncopyTS,
@@ -437,10 +437,10 @@ gotoblas_t TABLE_NAME = {
#endif
xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS,
xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS,
-
+
xtrmm_kernel_RNTS, xtrmm_kernel_RTTS, xtrmm_kernel_RRTS, xtrmm_kernel_RCTS,
xtrmm_kernel_LNTS, xtrmm_kernel_LTTS, xtrmm_kernel_LRTS, xtrmm_kernel_LCTS,
-
+
#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
xtrmm_iunucopyTS, xtrmm_iunncopyTS, xtrmm_iutucopyTS, xtrmm_iutncopyTS,
xtrmm_ilnucopyTS, xtrmm_ilnncopyTS, xtrmm_iltucopyTS, xtrmm_iltncopyTS,
@@ -450,7 +450,7 @@ gotoblas_t TABLE_NAME = {
#endif
xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS,
xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS,
-
+
#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
xsymm_iutcopyTS, xsymm_iltcopyTS,
#else
@@ -463,16 +463,16 @@ gotoblas_t TABLE_NAME = {
xhemm_outcopyTS, xhemm_oltcopyTS,
#endif
xhemm_outcopyTS, xhemm_oltcopyTS,
-
+
xgemm3m_kernelTS,
-
+
xgemm3m_incopybTS, xgemm3m_incopyrTS,
xgemm3m_incopyiTS, xgemm3m_itcopybTS,
xgemm3m_itcopyrTS, xgemm3m_itcopyiTS,
xgemm3m_oncopybTS, xgemm3m_oncopyrTS,
xgemm3m_oncopyiTS, xgemm3m_otcopybTS,
xgemm3m_otcopyrTS, xgemm3m_otcopyiTS,
-
+
xsymm3m_iucopybTS, xsymm3m_ilcopybTS,
xsymm3m_iucopyrTS, xsymm3m_ilcopyrTS,
xsymm3m_iucopyiTS, xsymm3m_ilcopyiTS,
@@ -482,7 +482,7 @@ gotoblas_t TABLE_NAME = {
xhemm3m_iucopybTS, xhemm3m_ilcopybTS,
xhemm3m_iucopyrTS, xhemm3m_ilcopyrTS,
- xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS,
+ xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS,
xhemm3m_oucopybTS, xhemm3m_olcopybTS,
xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
@@ -500,6 +500,16 @@ gotoblas_t TABLE_NAME = {
SNUMOPT, DNUMOPT, QNUMOPT,
+ saxpby_kTS, daxpby_kTS, caxpby_kTS, zaxpby_kTS,
+
+ somatcopy_k_cnTS, somatcopy_k_ctTS, somatcopy_k_rnTS, somatcopy_k_rtTS,
+ domatcopy_k_cnTS, domatcopy_k_ctTS, domatcopy_k_rnTS, domatcopy_k_rtTS,
+ comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS,
+ comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS,
+ zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS,
+ zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS
+
+
};
#ifdef ARCH_X86
@@ -508,45 +518,45 @@ static int get_l2_size_old(void){
int info[15];
cpuid(2, &eax, &ebx, &ecx, &edx);
-
+
info[ 0] = BITMASK(eax, 8, 0xff);
info[ 1] = BITMASK(eax, 16, 0xff);
info[ 2] = BITMASK(eax, 24, 0xff);
-
+
info[ 3] = BITMASK(ebx, 0, 0xff);
info[ 4] = BITMASK(ebx, 8, 0xff);
info[ 5] = BITMASK(ebx, 16, 0xff);
info[ 6] = BITMASK(ebx, 24, 0xff);
-
+
info[ 7] = BITMASK(ecx, 0, 0xff);
info[ 8] = BITMASK(ecx, 8, 0xff);
info[ 9] = BITMASK(ecx, 16, 0xff);
info[10] = BITMASK(ecx, 24, 0xff);
-
+
info[11] = BITMASK(edx, 0, 0xff);
info[12] = BITMASK(edx, 8, 0xff);
info[13] = BITMASK(edx, 16, 0xff);
info[14] = BITMASK(edx, 24, 0xff);
-
+
for (i = 0; i < 15; i++){
-
+
switch (info[i]){
-
+
/* This table is from http://www.sandpile.org/ia32/cpuid.htm */
-
+
case 0x1a :
return 96;
-
+
case 0x39 :
case 0x3b :
case 0x41 :
case 0x79 :
case 0x81 :
return 128;
-
+
case 0x3a :
return 192;
-
+
case 0x21 :
case 0x3c :
case 0x42 :
@@ -554,10 +564,10 @@ static int get_l2_size_old(void){
case 0x7e :
case 0x82 :
return 256;
-
+
case 0x3d :
return 384;
-
+
case 0x3e :
case 0x43 :
case 0x7b :
@@ -565,14 +575,14 @@ static int get_l2_size_old(void){
case 0x83 :
case 0x86 :
return 512;
-
+
case 0x44 :
case 0x78 :
case 0x7c :
case 0x84 :
case 0x87 :
return 1024;
-
+
case 0x45 :
case 0x7d :
case 0x85 :
@@ -580,10 +590,10 @@ static int get_l2_size_old(void){
case 0x48 :
return 3184;
-
+
case 0x49 :
return 4096;
-
+
case 0x4e :
return 6144;
}
@@ -704,13 +714,13 @@ static void init_parameter(void) {
fprintf(stderr, "Core2\n");
#endif
- TABLE_NAME.sgemm_p = 92 * (l2 >> 9);
- TABLE_NAME.dgemm_p = 46 * (l2 >> 9);
- TABLE_NAME.cgemm_p = 46 * (l2 >> 9);
- TABLE_NAME.zgemm_p = 23 * (l2 >> 9);
+ TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8;
+ TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8;
+ TABLE_NAME.cgemm_p = 46 * (l2 >> 9) + 4;
+ TABLE_NAME.zgemm_p = 23 * (l2 >> 9) + 4;
#ifdef EXPRECISION
- TABLE_NAME.qgemm_p = 92 * (l2 >> 9);
- TABLE_NAME.xgemm_p = 46 * (l2 >> 9);
+ TABLE_NAME.qgemm_p = 92 * (l2 >> 9) + 8;
+ TABLE_NAME.xgemm_p = 46 * (l2 >> 9) + 4;
#endif
#endif
@@ -730,6 +740,23 @@ static void init_parameter(void) {
#endif
#endif
+#ifdef DUNNINGTON
+
+#ifdef DEBUG
+ fprintf(stderr, "Dunnington\n");
+#endif
+
+ TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8;
+ TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8;
+ TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4;
+ TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4;
+#ifdef EXPRECISION
+ TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8;
+ TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4;
+#endif
+#endif
+
+
#ifdef NEHALEM
#ifdef DEBUG
@@ -889,37 +916,37 @@ static void init_parameter(void) {
fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p);
#endif
- TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
- ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA
+ TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15);
- TABLE_NAME.dgemm_r = (((BUFFER_SIZE -
- ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA
+ TABLE_NAME.dgemm_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15);
#ifdef EXPRECISION
- TABLE_NAME.qgemm_r = (((BUFFER_SIZE -
- ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA
+ TABLE_NAME.qgemm_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15);
#endif
- TABLE_NAME.cgemm_r = (((BUFFER_SIZE -
- ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA
+ TABLE_NAME.cgemm_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15);
- TABLE_NAME.zgemm_r = (((BUFFER_SIZE -
- ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA
+ TABLE_NAME.zgemm_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15);
#ifdef EXPRECISION
- TABLE_NAME.xgemm_r = (((BUFFER_SIZE -
- ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA
- + TABLE_NAME.align) & ~TABLE_NAME.align)
+ TABLE_NAME.xgemm_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA
+ + TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15);
#endif
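
For context on the setparam-ref.c hunks above: the reflowed *gemm_r expressions partition the shared packing buffer. The packed A panel of p x q elements (plus offsetA) is carved out first, rounded up to the alignment mask, and whatever remains determines how many packed B columns fit, rounded down to a multiple of 16. A minimal C sketch of that arithmetic, using illustrative names (compute_gemm_r, buffer_size, p, q, elem_size, offset_a, align_mask) in place of BUFFER_SIZE and the TABLE_NAME fields, could look like:

#include <stdio.h>

/* Illustrative sketch only; mirrors the "(((BUFFER_SIZE - ...) / (q * size) - 15) & ~15"
 * expression from init_parameter(), with hypothetical parameter names. */
static long compute_gemm_r(long buffer_size, long p, long q,
                           long elem_size, long offset_a, long align_mask)
{
    /* Bytes reserved for the packed A panel, rounded up to the alignment boundary. */
    long a_panel = (p * q * elem_size + offset_a + align_mask) & ~align_mask;

    /* Remaining buffer divided into packed B columns of q elements each,
     * rounded down to a multiple of 16 (the "- 15) & ~15" step in the patch). */
    return ((buffer_size - a_panel) / (q * elem_size) - 15) & ~15;
}

int main(void)
{
    /* Hypothetical numbers, for illustration: 32 MB buffer, p=768, q=384, doubles. */
    printf("%ld\n", compute_gemm_r(32L << 20, 768, 384, 8, 0, 0x0fff));
    return 0;
}
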
diff --git a/kernel/sparc/KERNEL.sparc b/kernel/sparc/KERNEL.sparc
index fb6cc2b..2e8319c 100644
--- a/kernel/sparc/KERNEL.sparc
+++ b/kernel/sparc/KERNEL.sparc
@@ -5,8 +5,8 @@ SGEMMONCOPY = gemm_ncopy.S
SGEMMOTCOPY = gemm_tcopy.S
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
-SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX)
-SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX)
DGEMMKERNEL = gemm_kernel.S
DGEMMINCOPY =
DGEMMITCOPY =
diff --git a/kernel/sparc/axpy.S b/kernel/sparc/axpy.S
index 997f9e0..2ada917 100644
--- a/kernel/sparc/axpy.S
+++ b/kernel/sparc/axpy.S
@@ -499,5 +499,5 @@
.LL59:
return %i7 + 8
clr %o0
-
+
EPILOGUE
diff --git a/kernel/sparc/cabs.S b/kernel/sparc/cabs.S
index 119293e..d186fab 100644
--- a/kernel/sparc/cabs.S
+++ b/kernel/sparc/cabs.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
PROLOGUE
add %sp, -128, %sp
diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S
index 8063e23..04810f6 100644
--- a/kernel/sparc/dnrm2.S
+++ b/kernel/sparc/dnrm2.S
@@ -258,7 +258,7 @@
FCMP c1, fzero
fbe .LL99
nop
-
+
FMOV c1, fmax
add N, 1, N
FDIV fone, c1, fone
diff --git a/kernel/sparc/dot.S b/kernel/sparc/dot.S
index f89d5f9..103f087 100644
--- a/kernel/sparc/dot.S
+++ b/kernel/sparc/dot.S
@@ -108,11 +108,11 @@
FCLR(4)
FCLR(5)
#endif
-
+
cmp N, 0
ble .LL19
nop
-
+
sll INCX, BASE_SHIFT, INCX
sll INCY, BASE_SHIFT, INCY
@@ -257,7 +257,7 @@
return %i7 + 8
nop
-
+
.LL50:
sra N, 3, I
cmp I, 0
diff --git a/kernel/sparc/gemm_kernel_2x8.S b/kernel/sparc/gemm_kernel_2x8.S
index c0d257a..3d94476 100644
--- a/kernel/sparc/gemm_kernel_2x8.S
+++ b/kernel/sparc/gemm_kernel_2x8.S
@@ -1140,7 +1140,7 @@
cmp I, 0
ble,pn %icc, .LL29
nop
-
+
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
mov B, BO
#else
@@ -1414,7 +1414,7 @@
mov BO, B
.align 4
-.LL30:
+.LL30:
and N, 4, J
cmp J, 0
ble,pn %icc, .LL50
diff --git a/kernel/sparc/gemv_n.S b/kernel/sparc/gemv_n.S
index 649ef16..640a96b 100644
--- a/kernel/sparc/gemv_n.S
+++ b/kernel/sparc/gemv_n.S
@@ -46,12 +46,12 @@
#define A %i5
#define LDA %i2
#define X %i3
-#define INCX %i4
+#define INCX %i4
#else
#define A %i4
#define LDA %i5
#define X %i2
-#define INCX %i3
+#define INCX %i3
#endif
#define Y %l0
@@ -234,7 +234,7 @@
STF FZERO, [Y1 + 7 * SIZE]
bg,pn %icc, .LL01
add Y1, 8 * SIZE, Y1
-
+
.LL10:
sra N, 2, J
cmp J, 0
@@ -1314,7 +1314,7 @@
add Y1, INCY, Y1
STF y8, [Y1]
add Y1, INCY, Y1
-
+
deccc I
bg,pn %icc, .LL991
add BUFFER, 8 * SIZE, BUFFER
@@ -1356,7 +1356,7 @@
add Y1, INCY, Y1
STF y4, [Y1]
add Y1, INCY, Y1
-
+
.LL996:
andcc M, 2, I
ble,pn %icc, .LL997
@@ -1378,7 +1378,7 @@
add Y1, INCY, Y1
STF y2, [Y1]
add Y1, INCY, Y1
-
+
.LL997:
andcc M, 1, I
ble,pn %icc, .LL999
diff --git a/kernel/sparc/gemv_t.S b/kernel/sparc/gemv_t.S
index fad006a..fc001e4 100644
--- a/kernel/sparc/gemv_t.S
+++ b/kernel/sparc/gemv_t.S
@@ -48,12 +48,12 @@
#define A %i5
#define LDA %i2
#define X %i3
-#define INCX %i4
+#define INCX %i4
#else
#define A %i4
#define LDA %i5
#define X %i2
-#define INCX %i3
+#define INCX %i3
#endif
#define Y %l0
@@ -218,7 +218,7 @@
#else
FCLR(30)
#endif
-
+
clr IS
mov P, I
sll LDA, BASE_SHIFT, LDA
@@ -697,7 +697,7 @@
cmp IS, M
bl %icc, .LL10
add A, PNLDA, A
-
+
.LL999:
return %i7 + 8
clr %o0
diff --git a/kernel/sparc/ger.S b/kernel/sparc/ger.S
index 84cd525..70b5e22 100644
--- a/kernel/sparc/ger.S
+++ b/kernel/sparc/ger.S
@@ -46,12 +46,12 @@
#define X %i5
#define INCX %i2
#define Y %i3
-#define INCY %i4
+#define INCY %i4
#else
#define X %i4
#define INCX %i5
#define Y %i2
-#define INCY %i3
+#define INCY %i3
#endif
#define A %l0
@@ -251,7 +251,7 @@
deccc J
bg,pn %icc, .LL06
nop
-
+
.LL10:
mov N, J
cmp N, 0
diff --git a/kernel/sparc/imax.S b/kernel/sparc/imax.S
index c24e182..1a2b9c5 100644
--- a/kernel/sparc/imax.S
+++ b/kernel/sparc/imax.S
@@ -149,7 +149,7 @@
add I, -1, I
cmp I, 0
ble,pt %icc, .LL12
- nop
+ nop
#define PREFETCHSIZE 40
diff --git a/kernel/sparc/lsame.S b/kernel/sparc/lsame.S
index 778301f..c00b565 100644
--- a/kernel/sparc/lsame.S
+++ b/kernel/sparc/lsame.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define A %o0
#define B %o1
#define AA %o4
diff --git a/kernel/sparc/max.S b/kernel/sparc/max.S
index 1a4bc44..252bc19 100644
--- a/kernel/sparc/max.S
+++ b/kernel/sparc/max.S
@@ -135,7 +135,7 @@
add I, -1, I
cmp I, 0
ble,pt %icc, .LL12
- nop
+ nop
#define PREFETCHSIZE 40
diff --git a/kernel/sparc/rot.S b/kernel/sparc/rot.S
index f5c5770..40e26e9 100644
--- a/kernel/sparc/rot.S
+++ b/kernel/sparc/rot.S
@@ -150,7 +150,7 @@
cmp N, 0
ble .LL19
nop
-
+
sll INCX, BASE_SHIFT, INCX
sll INCY, BASE_SHIFT, INCY
@@ -480,7 +480,7 @@
.LL19:
return %i7 + 8
nop
-
+
.LL50:
mov X, XX
mov Y, YY
diff --git a/kernel/sparc/scal.S b/kernel/sparc/scal.S
index 1414a09..36d9ce2 100644
--- a/kernel/sparc/scal.S
+++ b/kernel/sparc/scal.S
@@ -119,7 +119,7 @@
#endif
FCLR(29)
-
+
FCMP ALPHA, FZERO
fbne .LL100
sll INCX, BASE_SHIFT, INCX
diff --git a/kernel/sparc/swap.S b/kernel/sparc/swap.S
index 1d7950c..580eff7 100644
--- a/kernel/sparc/swap.S
+++ b/kernel/sparc/swap.S
@@ -116,7 +116,7 @@
ldx [%sp+ STACK_START + 56], Y
ldx [%sp+ STACK_START + 64], INCY
#endif
-
+
sll INCX, BASE_SHIFT, INCX
sll INCY, BASE_SHIFT, INCY
diff --git a/kernel/sparc/trsm_kernel_LN_2x8.S b/kernel/sparc/trsm_kernel_LN_2x8.S
index a70f0e4..16e352d 100644
--- a/kernel/sparc/trsm_kernel_LN_2x8.S
+++ b/kernel/sparc/trsm_kernel_LN_2x8.S
@@ -2106,7 +2106,7 @@
nop
.align 4
-.LL30:
+.LL30:
and N, 4, J
cmp J, 0
ble,pn %icc, .LL50
diff --git a/kernel/sparc/trsm_kernel_LT_2x8.S b/kernel/sparc/trsm_kernel_LT_2x8.S
index 39015d7..425a478 100644
--- a/kernel/sparc/trsm_kernel_LT_2x8.S
+++ b/kernel/sparc/trsm_kernel_LT_2x8.S
@@ -2105,7 +2105,7 @@
nop
.align 4
-.LL30:
+.LL30:
and N, 4, J
cmp J, 0
ble,pn %icc, .LL50
diff --git a/kernel/sparc/trsm_kernel_RT.S b/kernel/sparc/trsm_kernel_RT.S
index 3e1a2b9..eaa6fb2 100644
--- a/kernel/sparc/trsm_kernel_RT.S
+++ b/kernel/sparc/trsm_kernel_RT.S
@@ -2211,7 +2211,7 @@
sub KK, 2, KK
#endif
-.LL200:
+.LL200:
sra N, 2, J
cmp J, 0
ble,pn %icc, .LL999
diff --git a/kernel/sparc/trsm_kernel_RT_2x8.S b/kernel/sparc/trsm_kernel_RT_2x8.S
index c9f68ab..a030741 100644
--- a/kernel/sparc/trsm_kernel_RT_2x8.S
+++ b/kernel/sparc/trsm_kernel_RT_2x8.S
@@ -217,7 +217,7 @@
.register %g2, #scratch
.register %g3, #scratch
-
+
PROLOGUE
SAVESP
nop
diff --git a/kernel/sparc/zamax.S b/kernel/sparc/zamax.S
index b156c5a..ac0966f 100644
--- a/kernel/sparc/zamax.S
+++ b/kernel/sparc/zamax.S
@@ -104,7 +104,7 @@
cmp N, 0
ble .LL20
nop
-
+
cmp INCX, 0
ble .LL20
sll INCX, ZBASE_SHIFT, INCX
diff --git a/kernel/sparc/zasum.S b/kernel/sparc/zasum.S
index 53bd3c0..580b689 100644
--- a/kernel/sparc/zasum.S
+++ b/kernel/sparc/zasum.S
@@ -84,7 +84,7 @@
FCLR(0)
sll INCX, ZBASE_SHIFT, INCX
-
+
FMOV c1, c2
FMOV c1, t1
FMOV c1, t2
diff --git a/kernel/sparc/zgemm_kernel.S b/kernel/sparc/zgemm_kernel.S
index b02c942..444d3a6 100644
--- a/kernel/sparc/zgemm_kernel.S
+++ b/kernel/sparc/zgemm_kernel.S
@@ -171,7 +171,7 @@
PROLOGUE
SAVESP
-
+
#ifndef __64BIT__
#ifdef DOUBLE
#define STACK_ALPHA [%sp + STACK_START + 24]
@@ -239,7 +239,7 @@
#else
FCLR(29)
#endif
-
+
#if defined(TRMMKERNEL) && !defined(LEFT)
neg OFFSET, KK
#endif
diff --git a/kernel/sparc/zgemm_kernel_1x4.S b/kernel/sparc/zgemm_kernel_1x4.S
index 03397fd..8b0c8bd 100644
--- a/kernel/sparc/zgemm_kernel_1x4.S
+++ b/kernel/sparc/zgemm_kernel_1x4.S
@@ -239,10 +239,10 @@
.register %g2, #scratch
.register %g3, #scratch
-
+
PROLOGUE
SAVESP
-
+
#ifndef __64BIT__
#ifdef DOUBLE
st %i3, [%sp + STACK_START + 16]
@@ -1123,7 +1123,7 @@
mov BO, B
.align 4
-.LL20:
+.LL20:
and N, 2, J
cmp J, 0
ble,pn %icc, .LL30
diff --git a/kernel/sparc/zgemv_n.S b/kernel/sparc/zgemv_n.S
index 46ff438..3d4ce63 100644
--- a/kernel/sparc/zgemv_n.S
+++ b/kernel/sparc/zgemv_n.S
@@ -50,7 +50,7 @@
#define A %i5
#define LDA %i2
#define X %i3
-#define INCX %i4
+#define INCX %i4
#define Y %l0
#define INCY %l1
@@ -255,7 +255,7 @@
STF FZERO, [Y1 + 7 * SIZE]
bg,pn %icc, .LL01
add Y1, 8 * SIZE, Y1
-
+
.LL20:
sra N, 1, J
cmp J, 0
@@ -1152,7 +1152,7 @@
add Y1, INCY, Y1
add BUFFER, 4 * SIZE, BUFFER
-
+
.LL996:
andcc M, 1, I
ble,pn %icc, .LL999
diff --git a/kernel/sparc/zgemv_t.S b/kernel/sparc/zgemv_t.S
index 2b4a64c..0007a30 100644
--- a/kernel/sparc/zgemv_t.S
+++ b/kernel/sparc/zgemv_t.S
@@ -46,7 +46,7 @@
#define A %i5
#define LDA %i2
#define X %i3
-#define INCX %i4
+#define INCX %i4
#define Y %l0
#define INCY %l1
@@ -1500,7 +1500,7 @@
add Y2, INCY, Y2
STF a3, [Y2 + 0 * SIZE]
STF a4, [Y2 + 1 * SIZE]
-
+
.LL300:
andcc N, 1, J
FCLR(0)
@@ -1729,7 +1729,7 @@
cmp IS, M
bl %icc, .LL10
add A, PNLDA, A
-
+
.LL999:
return %i7 + 8
clr %o0
diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S
index 28e9e07..065d227 100644
--- a/kernel/sparc/znrm2.S
+++ b/kernel/sparc/znrm2.S
@@ -255,7 +255,7 @@
FCMP c1, fzero
fbe .LL99
nop
-
+
FMOV c1, fmax
FDIV fone, c1, fone
diff --git a/kernel/sparc/zrot.S b/kernel/sparc/zrot.S
index ec274ca..a8609fe 100644
--- a/kernel/sparc/zrot.S
+++ b/kernel/sparc/zrot.S
@@ -149,7 +149,7 @@
cmp N, 0
ble .LL19
nop
-
+
sll INCX, ZBASE_SHIFT, INCX
sll INCY, ZBASE_SHIFT, INCY
@@ -490,7 +490,7 @@
.LL19:
return %i7 + 8
nop
-
+
.LL50:
mov X, XX
mov Y, YY
diff --git a/kernel/sparc/zscal.S b/kernel/sparc/zscal.S
index 5c6ade3..46bb6b2 100644
--- a/kernel/sparc/zscal.S
+++ b/kernel/sparc/zscal.S
@@ -170,7 +170,7 @@
#else
FCLR(24)
#endif
-
+
FCMP ALPHA_R, FZERO
fbne .LL100
sll INCX, ZBASE_SHIFT, INCX
diff --git a/kernel/sparc/zswap.S b/kernel/sparc/zswap.S
index 88ed221..70360d6 100644
--- a/kernel/sparc/zswap.S
+++ b/kernel/sparc/zswap.S
@@ -119,7 +119,7 @@
ldx [%sp + STACK_START + 64], Y
ldx [%sp + STACK_START + 72], INCY
#endif
-
+
sll INCX, ZBASE_SHIFT, INCX
sll INCY, ZBASE_SHIFT, INCY
diff --git a/kernel/sparc/ztrsm_kernel_LN.S b/kernel/sparc/ztrsm_kernel_LN.S
index 131284e..8d6f5e7 100644
--- a/kernel/sparc/ztrsm_kernel_LN.S
+++ b/kernel/sparc/ztrsm_kernel_LN.S
@@ -172,7 +172,7 @@
PROLOGUE
SAVESP
-
+
#ifndef __64BIT__
#ifdef DOUBLE
ld [%sp + STACK_START + 32], A
@@ -264,7 +264,7 @@
cmp I, 0
ble,pn %icc, .LL50
nop
-
+
#if defined(LT) || defined(RN)
sra KK, 2, L
@@ -1094,7 +1094,7 @@
cmp L, 0
ble,pn %icc, .LL29
nop
-
+
.LL26:
FADD2 c04, t1, c04
LDF [AO + 3 * SIZE], a4
diff --git a/kernel/sparc/ztrsm_kernel_LT.S b/kernel/sparc/ztrsm_kernel_LT.S
index 2a85698..cfd1c8c 100644
--- a/kernel/sparc/ztrsm_kernel_LT.S
+++ b/kernel/sparc/ztrsm_kernel_LT.S
@@ -172,7 +172,7 @@
PROLOGUE
SAVESP
-
+
#ifndef __64BIT__
#ifdef DOUBLE
ld [%sp + STACK_START + 32], A
@@ -668,7 +668,7 @@
cmp L, 0
ble,pn %icc, .LL29
nop
-
+
.LL26:
FADD2 c04, t1, c04
LDF [AO + 3 * SIZE], a4
@@ -1130,7 +1130,7 @@
FMOV FZERO, t1
ble,pn %icc, .LL99
FMOV FZERO, c04
-
+
#if defined(LT) || defined(RN)
sra KK, 2, L
diff --git a/kernel/sparc/ztrsm_kernel_LT_1x4.S b/kernel/sparc/ztrsm_kernel_LT_1x4.S
index f7d9e38..8b63143 100644
--- a/kernel/sparc/ztrsm_kernel_LT_1x4.S
+++ b/kernel/sparc/ztrsm_kernel_LT_1x4.S
@@ -66,7 +66,7 @@
#define TEMP1 %l6
#define TEMP2 %l7
#define AORIG %o7
-
+
#ifdef DOUBLE
#define c01 %f0
#define c02 %f2
@@ -223,10 +223,10 @@
.register %g2, #scratch
.register %g3, #scratch
-
+
PROLOGUE
SAVESP
-
+
#ifndef __64BIT__
#ifdef DOUBLE
ld [%sp + STACK_START + 32], A
@@ -1429,7 +1429,7 @@
nop
.align 4
-.LL20:
+.LL20:
and N, 2, J
cmp J, 0
ble,pn %icc, .LL30
diff --git a/kernel/sparc/ztrsm_kernel_RT.S b/kernel/sparc/ztrsm_kernel_RT.S
index 2949e48..5b36b58 100644
--- a/kernel/sparc/ztrsm_kernel_RT.S
+++ b/kernel/sparc/ztrsm_kernel_RT.S
@@ -172,7 +172,7 @@
PROLOGUE
SAVESP
-
+
#ifndef __64BIT__
#ifdef DOUBLE
ld [%sp + STACK_START + 32], A
@@ -1483,7 +1483,7 @@
cmp L, 0
ble,pn %icc, .LL29
nop
-
+
.LL26:
FADD2 c04, t1, c04
LDF [AO + 3 * SIZE], a4
@@ -1945,7 +1945,7 @@
FMOV FZERO, t1
ble,pn %icc, .LL99
FMOV FZERO, c04
-
+
#if defined(LT) || defined(RN)
sra KK, 2, L
diff --git a/kernel/sparc/ztrsm_kernel_RT_1x4.S b/kernel/sparc/ztrsm_kernel_RT_1x4.S
index 49d449a..668974b 100644
--- a/kernel/sparc/ztrsm_kernel_RT_1x4.S
+++ b/kernel/sparc/ztrsm_kernel_RT_1x4.S
@@ -66,7 +66,7 @@
#define TEMP1 %l6
#define TEMP2 %l7
#define AORIG %o7
-
+
#ifdef DOUBLE
#define c01 %f0
#define c02 %f2
@@ -224,10 +224,10 @@
.register %g2, #scratch
.register %g3, #scratch
-
+
PROLOGUE
SAVESP
-
+
#ifndef __64BIT__
#ifdef DOUBLE
ld [%sp + STACK_START + 32], A
diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL
index c1a1195..8b30355 100644
--- a/kernel/x86/KERNEL
+++ b/kernel/x86/KERNEL
@@ -1,263 +1,3 @@
-GEMVDEP = ../l2param.h
-
-ifdef HAVE_SSE
-
-ifndef SAMAXKERNEL
-SAMAXKERNEL = amax_sse.S
-endif
-
-ifndef CAMAXKERNEL
-CAMAXKERNEL = zamax_sse.S
-endif
-
-ifndef SAMINKERNEL
-SAMINKERNEL = amax_sse.S
-endif
-
-ifndef CAMINKERNEL
-CAMINKERNEL = zamax_sse.S
-endif
-
-ifndef ISAMAXKERNEL
-ISAMAXKERNEL = iamax_sse.S
-endif
-
-ifndef ICAMAXKERNEL
-ICAMAXKERNEL = izamax_sse.S
-endif
-
-ifndef ISAMINKERNEL
-ISAMINKERNEL = iamax_sse.S
-endif
-
-ifndef ICAMINKERNEL
-ICAMINKERNEL = izamax_sse.S
-endif
-
-ifndef ISMAXKERNEL
-ISMAXKERNEL = iamax_sse.S
-endif
-
-ifndef ISMINKERNEL
-ISMINKERNEL = iamax_sse.S
-endif
-
-ifndef SMAXKERNEL
-SMAXKERNEL = amax_sse.S
-endif
-
-ifndef SMINKERNEL
-SMINKERNEL = amax_sse.S
-endif
-
-ifndef SASUMKERNEL
-SASUMKERNEL = asum_sse.S
-endif
-
-ifndef CASUMKERNEL
-CASUMKERNEL = zasum_sse.S
-endif
-
-ifndef SDOTKERNEL
-SDOTKERNEL = ../arm/dot.c
-endif
-
-ifndef CDOTKERNEL
-CDOTKERNEL = zdot_sse.S
-endif
-
-ifndef SCOPYKERNEL
-SCOPYKERNEL = copy_sse.S
-endif
-
-ifndef CCOPYKERNEL
-CCOPYKERNEL = zcopy_sse.S
-endif
-
-ifndef SSACALKERNEL
-SSCALKERNEL = scal_sse.S
-endif
-
-ifndef CSACALKERNEL
-CSCALKERNEL = zscal_sse.S
-endif
-
-ifndef SAXPYKERNEL
-SAXPYKERNEL = axpy_sse.S
-endif
-
-ifndef CAXPYKERNEL
-CAXPYKERNEL = zaxpy_sse.S
-endif
-
-ifndef SROTKERNEL
-SROTKERNEL = rot_sse.S
-endif
-
-ifndef CROTKERNEL
-CROTKERNEL = zrot_sse.S
-endif
-
-ifndef SSWAPKERNEL
-SSWAPKERNEL = swap_sse.S
-endif
-
-ifndef CSWAPKERNEL
-CSWAPKERNEL = zswap_sse.S
-endif
-
-ifndef SGEMVNKERNEL
-SGEMVNKERNEL = ../arm/gemv_n.c
-endif
-
-ifndef SGEMVTKERNEL
-SGEMVTKERNEL = ../arm/gemv_t.c
-endif
-
-ifndef CGEMVNKERNEL
-CGEMVNKERNEL = zgemv_n_sse.S
-endif
-
-ifndef CGEMVTKERNEL
-CGEMVTKERNEL = zgemv_t_sse.S
-endif
-
-endif
-
-
-ifdef HAVE_SSE2
-
-ifndef DAMAXKERNEL
-DAMAXKERNEL = amax_sse2.S
-endif
-
-ifndef ZAMAXKERNEL
-ZAMAXKERNEL = zamax_sse2.S
-endif
-
-ifndef DAMINKERNEL
-DAMINKERNEL = amax_sse2.S
-endif
-
-ifndef ZAMINKERNEL
-ZAMINKERNEL = zamax_sse2.S
-endif
-
-ifndef IDAMAXKERNEL
-IDAMAXKERNEL = iamax_sse2.S
-endif
-
-ifndef IZAMAXKERNEL
-IZAMAXKERNEL = izamax_sse2.S
-endif
-
-ifndef IDAMINKERNEL
-IDAMINKERNEL = iamax_sse2.S
-endif
-
-ifndef IZAMINKERNEL
-IZAMINKERNEL = izamax_sse2.S
-endif
-
-ifndef IDMAXKERNEL
-IDMAXKERNEL = iamax_sse2.S
-endif
-
-ifndef IDMINKERNEL
-IDMINKERNEL = iamax_sse2.S
-endif
-
-ifndef DMAXKERNEL
-DMAXKERNEL = amax_sse2.S
-endif
-
-ifndef DMINKERNEL
-DMINKERNEL = amax_sse2.S
-endif
-
-ifndef DDOTKERNEL
-DDOTKERNEL = dot_sse2.S
-endif
-
-ifndef ZDOTKERNEL
-ZDOTKERNEL = zdot_sse2.S
-endif
-
-ifndef DCOPYKERNEL
-# DCOPYKERNEL = copy_sse2.S
-endif
-
-ifndef ZCOPYKERNEL
-ZCOPYKERNEL = zcopy_sse2.S
-endif
-
-ifndef DSACALKERNEL
-DSCALKERNEL = scal_sse2.S
-endif
-
-ifndef ZSACALKERNEL
-ZSCALKERNEL = zscal_sse2.S
-endif
-
-ifndef DASUMKERNEL
-DASUMKERNEL = asum_sse2.S
-endif
-
-ifndef ZASUMKERNEL
-ZASUMKERNEL = zasum_sse2.S
-endif
-
-ifndef DAXPYKERNEL
-DAXPYKERNEL = axpy_sse2.S
-endif
-
-ifndef ZAXPYKERNEL
-ZAXPYKERNEL = zaxpy_sse2.S
-endif
-
-ifndef SNRM2KERNEL
-SNRM2KERNEL = nrm2_sse.S
-endif
-
-ifndef CNRM2KERNEL
-CNRM2KERNEL = znrm2_sse.S
-endif
-
-ifndef DROTKERNEL
-DROTKERNEL = rot_sse2.S
-endif
-
-ifndef ZROTKERNEL
-ZROTKERNEL = zrot_sse2.S
-endif
-
-ifndef DSWAPKERNEL
-DSWAPKERNEL = swap_sse2.S
-endif
-
-ifndef ZSWAPKERNEL
-ZSWAPKERNEL = zswap_sse2.S
-endif
-
-ifndef DGEMVNKERNEL
-DGEMVNKERNEL = gemv_n_sse2.S
-endif
-
-ifndef DGEMVTKERNEL
-DGEMVTKERNEL = gemv_t_sse2.S
-endif
-
-ifndef ZGEMVNKERNEL
-ZGEMVNKERNEL = zgemv_n_sse2.S
-endif
-
-ifndef ZGEMVTKERNEL
-ZGEMVTKERNEL = zgemv_t_sse2.S
-endif
-
-endif
-
-
ifndef SAMINKERNEL
SAMINKERNEL = amax.S
endif
@@ -394,21 +134,41 @@ XGEMMITCOPYOBJ =
XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX)
XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX)
-SGEMM_BETA = gemm_beta.S
-DGEMM_BETA = gemm_beta.S
-QGEMM_BETA = ../generic/gemm_beta.c
-CGEMM_BETA = zgemm_beta.S
-ZGEMM_BETA = zgemm_beta.S
-XGEMM_BETA = ../generic/zgemm_beta.c
-
-QTRSMKERNEL_LN = qtrsm_kernel_LN_2x2.S
-QTRSMKERNEL_LT = qtrsm_kernel_LT_2x2.S
-QTRSMKERNEL_RN = qtrsm_kernel_LT_2x2.S
-QTRSMKERNEL_RT = qtrsm_kernel_RT_2x2.S
+QTRSMKERNEL_LN = qtrsm_kernel_LN_2x2.S
+QTRSMKERNEL_LT = qtrsm_kernel_LT_2x2.S
+QTRSMKERNEL_RN = qtrsm_kernel_LT_2x2.S
+QTRSMKERNEL_RT = qtrsm_kernel_RT_2x2.S
-XTRSMKERNEL_LN = xtrsm_kernel_LT_1x1.S
-XTRSMKERNEL_LT = xtrsm_kernel_LT_1x1.S
-XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S
-XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S
+XTRSMKERNEL_LN = xtrsm_kernel_LT_1x1.S
+XTRSMKERNEL_LT = xtrsm_kernel_LT_1x1.S
+XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S
+XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S
XGEMM3MKERNEL = xgemm3m_kernel_2x2.S
+
+
+# bug in zdot assembler kernel
+ifndef ZDOTKERNEL
+ZDOTKERNEL = ../arm/zdot.c
+endif
+
+DSDOTKERNEL = ../arm/dot.c
+
+# Bug in znrm2 assembler kernel
+ifndef ZNRM2KERNEL
+ZNRM2KERNEL = ../arm/znrm2.c
+endif
+
+# Bug in zgemv_t assembler kernel
+ifndef ZGEMVTKERNEL
+ZGEMVTKERNEL = ../arm/zgemv_t.c
+endif
+
+SGEMM_BETA = ../generic/gemm_beta.c
+DGEMM_BETA = ../generic/gemm_beta.c
+CGEMM_BETA = ../generic/zgemm_beta.c
+ZGEMM_BETA = ../generic/zgemm_beta.c
+
+QGEMM_BETA = ../generic/gemm_beta.c
+XGEMM_BETA = ../generic/zgemm_beta.c
+
diff --git a/kernel/x86/KERNEL.ATOM b/kernel/x86/KERNEL.ATOM
index b0f6733..7897cb9 100644
--- a/kernel/x86/KERNEL.ATOM
+++ b/kernel/x86/KERNEL.ATOM
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_penryn.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x2_atom.S
diff --git a/kernel/x86/KERNEL.BARCELONA b/kernel/x86/KERNEL.BARCELONA
index 231350a..d984f8f 100644
--- a/kernel/x86/KERNEL.BARCELONA
+++ b/kernel/x86/KERNEL.BARCELONA
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
@@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
+CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
diff --git a/kernel/x86/KERNEL.BOBCAT b/kernel/x86/KERNEL.BOBCAT
index 231350a..d984f8f 100644
--- a/kernel/x86/KERNEL.BOBCAT
+++ b/kernel/x86/KERNEL.BOBCAT
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
@@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
+CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
diff --git a/kernel/x86/KERNEL.BULLDOZER b/kernel/x86/KERNEL.BULLDOZER
index 231350a..d984f8f 100644
--- a/kernel/x86/KERNEL.BULLDOZER
+++ b/kernel/x86/KERNEL.BULLDOZER
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
@@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
+CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
diff --git a/kernel/x86/KERNEL.DUNNINGTON b/kernel/x86/KERNEL.DUNNINGTON
index 08e3543..f2b0f96 100644
--- a/kernel/x86/KERNEL.DUNNINGTON
+++ b/kernel/x86/KERNEL.DUNNINGTON
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_penryn.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_penryn.S
diff --git a/kernel/x86/KERNEL.OPTERON b/kernel/x86/KERNEL.OPTERON
index 7b8b137..c065bf7 100644
--- a/kernel/x86/KERNEL.OPTERON
+++ b/kernel/x86/KERNEL.OPTERON
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_sse.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_sse2.S
@@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_sse.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
+CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_sse2.S
diff --git a/kernel/x86/KERNEL.PENRYN b/kernel/x86/KERNEL.PENRYN
index 08e3543..f2b0f96 100644
--- a/kernel/x86/KERNEL.PENRYN
+++ b/kernel/x86/KERNEL.PENRYN
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_penryn.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_penryn.S
diff --git a/kernel/x86/KERNEL.PILEDRIVER b/kernel/x86/KERNEL.PILEDRIVER
index 231350a..d984f8f 100644
--- a/kernel/x86/KERNEL.PILEDRIVER
+++ b/kernel/x86/KERNEL.PILEDRIVER
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
@@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
+CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
diff --git a/kernel/x86/KERNEL.PRESCOTT b/kernel/x86/KERNEL.PRESCOTT
index 355e00f..b8e1e75 100644
--- a/kernel/x86/KERNEL.PRESCOTT
+++ b/kernel/x86/KERNEL.PRESCOTT
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_sse3.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_sse3.S
@@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_sse3.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
+CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S
diff --git a/kernel/x86/KERNEL.YONAH b/kernel/x86/KERNEL.YONAH
index 5b3ecae..5149f67 100644
--- a/kernel/x86/KERNEL.YONAH
+++ b/kernel/x86/KERNEL.YONAH
@@ -1,10 +1,10 @@
SGEMMKERNEL = gemm_kernel_4x4_sse3.S
-SGEMMINCOPY =
-SGEMMITCOPY =
+SGEMMINCOPY =
+SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_sse3.S
@@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_sse3.S
-CGEMMINCOPY =
-CGEMMITCOPY =
+CGEMMINCOPY =
+CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
+CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S
diff --git a/kernel/x86/amax.S b/kernel/x86/amax.S
index 01c2bd6..2a3404c 100644
--- a/kernel/x86/amax.S
+++ b/kernel/x86/amax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -92,7 +92,7 @@
FLD (X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
addl INCX, X
decl M
@@ -105,7 +105,7 @@
sarl $3, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -113,7 +113,7 @@
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -121,7 +121,7 @@
FLD 1 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -129,7 +129,7 @@
FLD 2 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -137,7 +137,7 @@
FLD 3 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -145,7 +145,7 @@
FLD 4 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -153,7 +153,7 @@
FLD 5 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -161,7 +161,7 @@
FLD 6 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -169,7 +169,7 @@
FLD 7 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -191,7 +191,7 @@
.L21:
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -208,12 +208,12 @@
sarl $3, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -222,7 +222,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -231,7 +231,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -240,7 +240,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -249,7 +249,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -258,7 +258,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -267,7 +267,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -276,7 +276,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -296,7 +296,7 @@
.L61:
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
diff --git a/kernel/x86/amax_sse.S b/kernel/x86/amax_sse.S
index 05d21a7..e988660 100644
--- a/kernel/x86/amax_sse.S
+++ b/kernel/x86/amax_sse.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -50,7 +50,7 @@
#define X %ecx
#define INCX %edx
#define I %eax
-
+
#ifdef USE_MIN
#define maxps minps
#define maxss minss
@@ -155,7 +155,7 @@
decl I
jle .L12
ALIGN_4
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -326,7 +326,7 @@
#endif
maxps %xmm4, %xmm0
addl $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L18:
testl $2, M
@@ -340,7 +340,7 @@
maxps %xmm4, %xmm1
addl $2 * SIZE, X
ALIGN_3
-
+
.L19:
testl $1, M
je .L998
@@ -358,7 +358,7 @@
sarl $3, I
jle .L45
ALIGN_4
-
+
.L41:
movss (X), %xmm4
addl INCX, X
@@ -451,7 +451,7 @@
andps %xmm3, %xmm7
#endif
maxss %xmm7, %xmm1
- ALIGN_3
+ ALIGN_3
.L46:
testl $2, M
@@ -471,7 +471,7 @@
#endif
maxss %xmm5, %xmm1
ALIGN_3
-
+
.L47:
testl $1, M
je .L998
@@ -493,7 +493,7 @@
shufps $1, %xmm0, %xmm0
maxss %xmm1, %xmm0
ALIGN_4
-
+
.L999:
subl $8, %esp
diff --git a/kernel/x86/amax_sse2.S b/kernel/x86/amax_sse2.S
index ad56244..e21927c 100644
--- a/kernel/x86/amax_sse2.S
+++ b/kernel/x86/amax_sse2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -50,7 +50,7 @@
#define X %ecx
#define INCX %edx
#define I %eax
-
+
#ifdef USE_MIN
#define maxpd minpd
#define maxsd minsd
@@ -128,7 +128,7 @@
decl I
jle .L12
ALIGN_4
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -286,7 +286,7 @@
maxpd %xmm5, %xmm1
addl $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L17:
testl $2, M
@@ -298,8 +298,8 @@
#endif
maxpd %xmm4, %xmm0
addl $2 * SIZE, X
- ALIGN_3
-
+ ALIGN_3
+
.L18:
testl $1, M
jle .L998
@@ -318,7 +318,7 @@
sarl $4, I
jle .L45
ALIGN_4
-
+
.L41:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -469,7 +469,7 @@
andps %xmm3, %xmm5
#endif
maxpd %xmm5, %xmm1
- ALIGN_3
+ ALIGN_3
.L47:
testl $2, M
@@ -484,7 +484,7 @@
#endif
maxpd %xmm6, %xmm0
ALIGN_3
-
+
.L48:
testl $1, M
je .L998
diff --git a/kernel/x86/asum.S b/kernel/x86/asum.S
index e1b0a6e..8c90f35 100644
--- a/kernel/x86/asum.S
+++ b/kernel/x86/asum.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -49,7 +49,7 @@
#define M %edx
#define X %ecx
#define INCX %esi
-
+
#define I %eax
#include "l1param.h"
@@ -91,7 +91,7 @@
sarl $3, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -153,7 +153,7 @@
sarl $3, I
jle .L60
ALIGN_4
-
+
.L50:
FLD (X)
addl INCX, X
diff --git a/kernel/x86/asum_sse.S b/kernel/x86/asum_sse.S
index 4506f29..fd2492c 100644
--- a/kernel/x86/asum_sse.S
+++ b/kernel/x86/asum_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
@@ -123,7 +123,7 @@
decl I
jle .L12
ALIGN_3
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -278,7 +278,7 @@
sarl $3, I
jle .L105
ALIGN_4
-
+
.L101:
movss (X), %xmm4
addl INCX, X
@@ -344,7 +344,7 @@
#ifndef HAVE_SSE3
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
-
+
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
addss %xmm1, %xmm0
diff --git a/kernel/x86/asum_sse2.S b/kernel/x86/asum_sse2.S
index cea3503..a522fdf 100644
--- a/kernel/x86/asum_sse2.S
+++ b/kernel/x86/asum_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
@@ -105,7 +105,7 @@
decl I
jle .L11
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -217,7 +217,7 @@
addpd %xmm5, %xmm1
addl $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L22:
testl $2, M
@@ -227,7 +227,7 @@
andps %xmm3, %xmm4
addpd %xmm4, %xmm0
addl $2 * SIZE, X
-
+
.L23:
testl $1, M
je .L999
@@ -246,7 +246,7 @@
sarl $3, I
jle .L60
ALIGN_4
-
+
.L50:
movsd -16 * SIZE(X), %xmm4
addl INCX, X
diff --git a/kernel/x86/axpy.S b/kernel/x86/axpy.S
index 7f3d99e..6d9da4e 100644
--- a/kernel/x86/axpy.S
+++ b/kernel/x86/axpy.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA 16 + STACK + ARGS(%esp)
#ifdef DOUBLE
@@ -153,7 +153,7 @@
#ifdef HAVE_3DNOW
prefetchw 24 * SIZE(Y)
#endif
-
+
addl $8 * SIZE, X
addl $8 * SIZE, Y
decl %eax
diff --git a/kernel/x86/axpy_sse.S b/kernel/x86/axpy_sse.S
index e06d901..590e9b1 100644
--- a/kernel/x86/axpy_sse.S
+++ b/kernel/x86/axpy_sse.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA 16 + STACK + ARGS(%esp)
#define STACK_X 20 + STACK + ARGS(%esp)
@@ -82,7 +82,7 @@
testl M, M
jle .L19
-
+
cmpl $SIZE, INCX
jne .L50
cmpl $SIZE, INCY
@@ -354,7 +354,7 @@
.L20:
#ifdef ALIGNED_ACCESS
-
+
testl $SIZE, X
jne .L30
diff --git a/kernel/x86/axpy_sse2.S b/kernel/x86/axpy_sse2.S
index 9b2d5d8..058747c 100644
--- a/kernel/x86/axpy_sse2.S
+++ b/kernel/x86/axpy_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA 16 + STACK + ARGS(%esp)
#define STACK_X 24 + STACK + ARGS(%esp)
@@ -55,7 +55,7 @@
#define INCX %ecx
#define INCY %edx
#define YY %ebp
-
+
#define ALPHA %xmm7
#include "l1param.h"
@@ -605,7 +605,7 @@
movsd -2 * SIZE(X), %xmm3
movhps -1 * SIZE(X), %xmm3
-
+
subl $-8 * SIZE, Y
subl $-8 * SIZE, X
decl %eax
diff --git a/kernel/x86/axpy_sse2_opteron.S b/kernel/x86/axpy_sse2_opteron.S
index fb22415..bc7e9ea 100644
--- a/kernel/x86/axpy_sse2_opteron.S
+++ b/kernel/x86/axpy_sse2_opteron.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#define STACK_X 24 + STACK + ARGS(%esp)
@@ -54,7 +54,7 @@
#define Y %edi
#define INCX %ecx
#define INCY %edx
-
+
#define PREFETCHSIZE 64
PROLOGUE
diff --git a/kernel/x86/copy.S b/kernel/x86/copy.S
index 721d5c5..cf4ab20 100644
--- a/kernel/x86/copy.S
+++ b/kernel/x86/copy.S
@@ -41,13 +41,13 @@
#define STACK 12
#define ARGS 0
-
+
#define M 4 + STACK + ARGS(%esp)
#define X 8 + STACK + ARGS(%esp)
#define INCX 12 + STACK + ARGS(%esp)
#define Y 16 + STACK + ARGS(%esp)
#define INCY 20 + STACK + ARGS(%esp)
-
+
PROLOGUE
pushl %edi
@@ -88,14 +88,14 @@
ALIGN_2
.L11:
- FLD 7 * SIZE(%ecx)
- FLD 6 * SIZE(%ecx)
- FLD 5 * SIZE(%ecx)
- FLD 4 * SIZE(%ecx)
- FLD 3 * SIZE(%ecx)
- FLD 2 * SIZE(%ecx)
- FLD 1 * SIZE(%ecx)
- FLD 0 * SIZE(%ecx)
+ FLD 7 * SIZE(%ecx)
+ FLD 6 * SIZE(%ecx)
+ FLD 5 * SIZE(%ecx)
+ FLD 4 * SIZE(%ecx)
+ FLD 3 * SIZE(%ecx)
+ FLD 2 * SIZE(%ecx)
+ FLD 1 * SIZE(%ecx)
+ FLD 0 * SIZE(%ecx)
FST 0 * SIZE(%edx)
FST 1 * SIZE(%edx)
diff --git a/kernel/x86/copy_sse.S b/kernel/x86/copy_sse.S
index 34902dc..c6d17b1 100644
--- a/kernel/x86/copy_sse.S
+++ b/kernel/x86/copy_sse.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86/copy_sse2.S b/kernel/x86/copy_sse2.S
index 11524aa..9a74fe9 100644
--- a/kernel/x86/copy_sse2.S
+++ b/kernel/x86/copy_sse2.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86/cpuid.S b/kernel/x86/cpuid.S
index 773b67d..7493391 100644
--- a/kernel/x86/cpuid.S
+++ b/kernel/x86/cpuid.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
PROLOGUE
PROFCODE
diff --git a/kernel/x86/dot.S b/kernel/x86/dot.S
index 5bd5d28..7f71783 100644
--- a/kernel/x86/dot.S
+++ b/kernel/x86/dot.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -79,8 +79,8 @@
movl (INCY),INCY
#endif
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
fldz
fldz
diff --git a/kernel/x86/dot_amd.S b/kernel/x86/dot_amd.S
index 75ad36e..35f0066 100644
--- a/kernel/x86/dot_amd.S
+++ b/kernel/x86/dot_amd.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -78,8 +78,8 @@
movl (INCY),INCY
#endif
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
fldz
fldz
diff --git a/kernel/x86/dot_sse.S b/kernel/x86/dot_sse.S
index 1811921..392ac49 100644
--- a/kernel/x86/dot_sse.S
+++ b/kernel/x86/dot_sse.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -75,8 +75,8 @@
movl (INCY),INCY # INCY
#endif
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
xorps %xmm0, %xmm0
xorps %xmm1, %xmm1
@@ -1297,13 +1297,13 @@
#elif defined(HAVE_SSE2)
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
-
+
PSHUFD2($1, %xmm0, %xmm1)
addss %xmm1, %xmm0
#else
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
-
+
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
addss %xmm1, %xmm0
diff --git a/kernel/x86/dot_sse2.S b/kernel/x86/dot_sse2.S
index f2053d2..9f5fa42 100644
--- a/kernel/x86/dot_sse2.S
+++ b/kernel/x86/dot_sse2.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -69,8 +69,8 @@
movl STACK_Y, Y
movl STACK_INCY, INCY
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
xorps %xmm0, %xmm0
xorps %xmm1, %xmm1
diff --git a/kernel/x86/dot_sse2_opteron.S b/kernel/x86/dot_sse2_opteron.S
index 7ac059f..0b9da6b 100644
--- a/kernel/x86/dot_sse2_opteron.S
+++ b/kernel/x86/dot_sse2_opteron.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -76,8 +76,8 @@
movl (INCY),INCY # INCY
#endif
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
diff --git a/kernel/x86/dot_sse_opteron.S b/kernel/x86/dot_sse_opteron.S
index fc63219..0d8dfc0 100644
--- a/kernel/x86/dot_sse_opteron.S
+++ b/kernel/x86/dot_sse_opteron.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -76,8 +76,8 @@
movl (INCY),INCY # INCY
#endif
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -392,7 +392,7 @@
#if !defined(HAVE_SSE3) || defined(__INTERIX)
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
-
+
pshufd $1, %xmm0, %xmm1
addss %xmm1, %xmm0
#else
diff --git a/kernel/x86/gemm_beta.S b/kernel/x86/gemm_beta.S
index b68dcf3..8592fe5 100644
--- a/kernel/x86/gemm_beta.S
+++ b/kernel/x86/gemm_beta.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#ifdef DOUBLE
@@ -169,7 +169,7 @@
FLD 4 * SIZE(%eax)
fmul %st(1),%st
FST 4 * SIZE(%eax)
-
+
FLD 5 * SIZE(%eax)
fmul %st(1),%st
FST 5 * SIZE(%eax)
diff --git a/kernel/x86/gemm_kernel_1x4.S b/kernel/x86/gemm_kernel_1x4.S
index e1ff4e8..8e248b8 100644
--- a/kernel/x86/gemm_kernel_1x4.S
+++ b/kernel/x86/gemm_kernel_1x4.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define I 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
@@ -70,7 +70,7 @@
#define BB %ebx
#define LDC %ebp
#define BX %esi
-
+
#define PREFETCHSIZE (8 * 5 + 4)
#define AOFFSET 1
@@ -334,7 +334,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl STACK_A, A
movl STACK_B, B
@@ -393,7 +393,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -529,7 +529,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl STACK_A, A
movl STACK_B, B
@@ -568,7 +568,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -733,7 +733,7 @@
#endif
leal (, LDC, 2), %eax
- addl %eax, C
+ addl %eax, C
movl B, STACK_B
ALIGN_4
@@ -747,7 +747,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl STACK_A, A
movl STACK_B, B
@@ -782,7 +782,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -892,7 +892,7 @@
addl $1, KK
#endif
- addl LDC, C
+ addl LDC, C
movl B, STACK_B
ALIGN_4
diff --git a/kernel/x86/gemm_kernel_2x2.S b/kernel/x86/gemm_kernel_2x2.S
index 1483bc4..f513f6d 100644
--- a/kernel/x86/gemm_kernel_2x2.S
+++ b/kernel/x86/gemm_kernel_2x2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
@@ -72,7 +72,7 @@
#else
#define REP rep
#endif
-
+
PROLOGUE
subl $ARGS, %esp # Generate Stack Frame
@@ -89,12 +89,12 @@
negl %eax
movl %eax, KK
#endif
-
+
movl N, %eax # j = (n >> 1) # MEMORY
movl LDC, %ebp # ldc # MEMORY
movl B, %ebx
- sarl $1, %eax
+ sarl $1, %eax
leal (, %ebp, SIZE), %ebp
leal 0(%ecx) , %ecx # NOP
movl %eax, J # j = (n >> 1) # MEMORY
@@ -106,7 +106,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl %ebx, BX
@@ -127,7 +127,7 @@
leal (, %eax, SIZE), %eax
leal (%edx, %eax, 2), %edx
leal (%ebx, %eax, 2), %ecx
-#endif
+#endif
#ifdef HAVE_SSE
movl BX, %eax
@@ -164,7 +164,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -327,7 +327,7 @@
ffreep %st(0)
ffreep %st(0)
- FLD ALPHA
+ FLD ALPHA
fmul %st, %st(4)
fmul %st, %st(1)
fmul %st, %st(2)
@@ -384,7 +384,7 @@
leal (, %eax, SIZE), %eax
leal (%edx, %eax, 1), %edx
leal (%ebx, %eax, 2), %ecx
-#endif
+#endif
fldz
fldz
@@ -395,7 +395,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -504,11 +504,11 @@
movl N, %eax # n # MEMORY
andl $1, %eax
je .End
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, %edi # c # MEMORY
movl A, %edx # a # MEMORY
@@ -517,7 +517,7 @@
sarl $1, %esi # m >> 1
je .L36
ALIGN_4
-
+
.L46:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
@@ -528,14 +528,14 @@
leal (, %eax, SIZE), %eax
leal (%edx, %eax, 2), %edx
leal (%ebx, %eax, 1), %ecx
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -648,14 +648,14 @@
leal (, %eax, SIZE), %eax
leal (%edx, %eax, 1), %edx
leal (%ebx, %eax, 1), %ecx
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_2x2_atom.S b/kernel/x86/gemm_kernel_2x2_atom.S
index f895412..14f0d20 100644
--- a/kernel/x86/gemm_kernel_2x2_atom.S
+++ b/kernel/x86/gemm_kernel_2x2_atom.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -84,7 +84,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -100,7 +100,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sall $BASE_SHIFT + 1, %eax
@@ -129,7 +129,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movl BX, %eax
prefetcht0 0 * SIZE(%eax)
@@ -151,7 +151,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -319,7 +319,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -332,7 +332,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -467,7 +467,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, CO1
addl LDC, C
@@ -490,7 +490,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 1), BB
-#endif
+#endif
movsd 0 * SIZE(BB), %xmm1
xorps %xmm0, %xmm0
@@ -504,7 +504,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -635,7 +635,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 1), BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -647,7 +647,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_2x4_3dnow.S b/kernel/x86/gemm_kernel_2x4_3dnow.S
index a86efda..207ae62 100644
--- a/kernel/x86/gemm_kernel_2x4_3dnow.S
+++ b/kernel/x86/gemm_kernel_2x4_3dnow.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
@@ -114,7 +114,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
movl OLD_B, %edi
movl OLD_C, %ebx
punpckldq %mm3, %mm3
-
+
movq %mm3, ALPHA
movl %ebx, C
@@ -143,13 +143,13 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
jle .L03
ALIGN_3
-
+
.L02:
movd 0 * SIZE(%edi), %mm0
movd 1 * SIZE(%edi), %mm1
@@ -239,7 +239,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
decl %eax
jne .L04
ALIGN_4
-
+
.L10:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -282,7 +282,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -621,7 +621,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -948,13 +948,13 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
jle .L33
ALIGN_3
-
+
.L32:
movd 0 * SIZE(%edi), %mm0
movd 1 * SIZE(%edi), %mm1
@@ -1012,7 +1012,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
decl %eax
jne .L34
ALIGN_4
-
+
.L40:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1051,7 +1051,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1279,7 +1279,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1495,13 +1495,13 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $3, %eax
jle .L63
ALIGN_3
-
+
.L62:
movd 0 * SIZE(%edi), %mm0
movd 1 * SIZE(%edi), %mm1
@@ -1554,7 +1554,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
decl %eax
jne .L64
ALIGN_4
-
+
.L70:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1592,7 +1592,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1763,7 +1763,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_2x4_barcelona.S b/kernel/x86/gemm_kernel_2x4_barcelona.S
index 1acdc16..04035c7 100644
--- a/kernel/x86/gemm_kernel_2x4_barcelona.S
+++ b/kernel/x86/gemm_kernel_2x4_barcelona.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -203,7 +203,7 @@
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -223,7 +223,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax
movl %eax, BX
@@ -247,7 +247,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
@@ -275,7 +275,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -492,7 +492,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 1), AO
leal (B, %eax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
@@ -508,7 +508,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -665,7 +665,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, CO # coffset = c
movl A, AO # aoffset = a
@@ -686,7 +686,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -701,7 +701,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -866,7 +866,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 1), AO
leal (B, %eax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -879,7 +879,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1005,7 +1005,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, CO # coffset = c
movl A, AO # aoffset = a
@@ -1026,7 +1026,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 1), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm0
pxor %xmm4, %xmm4
@@ -1041,7 +1041,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1167,7 +1167,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 1), AO
leal (B, %eax, 1), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -1180,7 +1180,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_2x4_core2.S b/kernel/x86/gemm_kernel_2x4_core2.S
index 9907131..bc2775e 100644
--- a/kernel/x86/gemm_kernel_2x4_core2.S
+++ b/kernel/x86/gemm_kernel_2x4_core2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -87,7 +87,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -106,7 +106,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl B, BX
@@ -160,7 +160,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -446,7 +446,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -617,7 +617,7 @@
addl $1, KK
#endif
ALIGN_4
-
+
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -639,7 +639,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -677,7 +677,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -875,7 +875,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1008,7 +1008,7 @@
addl $1, KK
#endif
ALIGN_4
-
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -1028,7 +1028,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -1063,7 +1063,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1219,7 +1219,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1305,7 +1305,7 @@
movsd %xmm4, 0 * SIZE(C1)
ALIGN_4
-
+
.L999:
popl %ebx
popl %esi
diff --git a/kernel/x86/gemm_kernel_2x4_penryn.S b/kernel/x86/gemm_kernel_2x4_penryn.S
index 0bdc918..b3bfa9a 100644
--- a/kernel/x86/gemm_kernel_2x4_penryn.S
+++ b/kernel/x86/gemm_kernel_2x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -116,7 +116,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -135,7 +135,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sall $BASE_SHIFT + 2, %eax
@@ -190,7 +190,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -501,7 +501,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -672,7 +672,7 @@
addl $1, KK
#endif
ALIGN_4
-
+
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -694,7 +694,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -732,7 +732,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -930,7 +930,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1063,7 +1063,7 @@
addl $1, KK
#endif
ALIGN_4
-
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -1083,7 +1083,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -1118,7 +1118,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1274,7 +1274,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1360,7 +1360,7 @@
movsd %xmm4, 0 * SIZE(C1)
ALIGN_4
-
+
.L999:
popl %ebx
popl %esi
diff --git a/kernel/x86/gemm_kernel_2x4_sse2.S b/kernel/x86/gemm_kernel_2x4_sse2.S
index be58235..c587fba 100644
--- a/kernel/x86/gemm_kernel_2x4_sse2.S
+++ b/kernel/x86/gemm_kernel_2x4_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
@@ -238,7 +238,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
leal (, LDC, SIZE), LDC
@@ -252,7 +252,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
movl K, %eax
@@ -260,7 +260,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
#define COPYPREFETCH 40
@@ -321,7 +321,7 @@
addl $4 * SIZE, %edi
ALIGN_4
-
+
.L10:
movl %edi, BX
@@ -344,7 +344,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movl BX, %eax
@@ -375,7 +375,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -390,7 +390,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -606,7 +606,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -624,7 +624,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -841,7 +841,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
movl K, %eax
@@ -849,7 +849,7 @@
sarl $2, %eax
jle .L35
ALIGN_4
-
+
.L32:
#ifdef PENTIUM4
#ifdef HAVE_SSE3
@@ -981,7 +981,7 @@
decl %eax
jne .L36
ALIGN_4
-
+
.L40:
movl C, %esi # coffset = c
movl A, AA # aoffset = a
@@ -1002,7 +1002,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1029,7 +1029,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1196,7 +1196,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1215,7 +1215,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1368,14 +1368,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L65
ALIGN_4
-
+
.L62:
#ifdef PENTIUM4
#ifdef HAVE_SSE3
@@ -1496,7 +1496,7 @@
decl %eax
jne .L66
ALIGN_4
-
+
.L70:
movl C, %esi # coffset = c
movl A, AA # aoffset = a
@@ -1517,7 +1517,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1542,7 +1542,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1670,7 +1670,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1689,7 +1689,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1780,7 +1780,7 @@
.L999:
movl OLD_STACK, %esp
EMMS
-
+
popl %ebx
popl %esi
popl %edi
diff --git a/kernel/x86/gemm_kernel_2x4_sse3.S b/kernel/x86/gemm_kernel_2x4_sse3.S
index e2732da..dc2ff05 100644
--- a/kernel/x86/gemm_kernel_2x4_sse3.S
+++ b/kernel/x86/gemm_kernel_2x4_sse3.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -213,7 +213,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -229,7 +229,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sall $BASE_SHIFT + 2, %eax
@@ -255,7 +255,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 4), BB
-#endif
+#endif
movl BX, %eax
prefetcht2 0 * SIZE(%eax)
@@ -284,7 +284,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -299,7 +299,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-
+
.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
@@ -741,7 +741,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 4), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -757,7 +757,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -968,7 +968,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl M, %ebx
sarl $1, %ebx # i = (m >> 2)
@@ -986,7 +986,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1012,7 +1012,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1179,7 +1179,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1195,7 +1195,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1348,7 +1348,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl M, %ebx
sarl $1, %ebx # i = (m >> 2)
@@ -1366,7 +1366,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 1), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1386,7 +1386,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1513,7 +1513,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 1), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1529,7 +1529,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_4x2_core2.S b/kernel/x86/gemm_kernel_4x2_core2.S
index 641b5fc..edaebcb 100644
--- a/kernel/x86/gemm_kernel_4x2_core2.S
+++ b/kernel/x86/gemm_kernel_4x2_core2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
@@ -121,7 +121,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
subl $-16 * SIZE, A
@@ -140,13 +140,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
jle .L05
ALIGN_4
-
+
.L02:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -193,7 +193,7 @@
decl %eax
jne .L06
ALIGN_4
-
+
.L10:
movl B, BX
@@ -239,7 +239,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -500,7 +500,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $2, %eax
@@ -664,7 +664,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -807,13 +807,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $3, %eax
jle .L45
ALIGN_4
-
+
.L42:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -855,7 +855,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
movl C, C1
movl A, AA
@@ -893,7 +893,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1060,7 +1060,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1199,7 +1199,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
@@ -1292,7 +1292,7 @@
.L999:
movl OLD_STACK, %esp
-
+
EMMS
popl %ebx
diff --git a/kernel/x86/gemm_kernel_4x2_sse2.S b/kernel/x86/gemm_kernel_4x2_sse2.S
index 2e67afa..ea93225 100644
--- a/kernel/x86/gemm_kernel_4x2_sse2.S
+++ b/kernel/x86/gemm_kernel_4x2_sse2.S
@@ -47,7 +47,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -57,7 +57,7 @@
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)
-
+
#define ALPHA 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
@@ -242,7 +242,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $BASE_SHIFT, LDC
@@ -250,12 +250,12 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -263,7 +263,7 @@
sarl $2, %eax
jle .L03
ALIGN_2
-
+
.L02:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -324,7 +324,7 @@
BRANCH
jne .L04
ALIGN_4
-
+
.L05:
movl B, BX
@@ -368,7 +368,7 @@
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
prefetchnta 3 * SIZE(%esi)
prefetchnta 3 * SIZE(%esi, LDC)
@@ -383,7 +383,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -401,7 +401,7 @@
je .L12
sall $3, %eax
.align 8
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -494,7 +494,7 @@
subl $64 * 8, %eax
BRANCH
jg .L1X
-
+
.L11:
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB
@@ -503,7 +503,7 @@
sarl $3, %eax
je .L12
-.L11:
+.L11:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -548,7 +548,7 @@
addl $4 * SIZE, BB # boffset1 += 8
subl $1, %eax
jg .L13
- ALIGN_4
+ ALIGN_4
.L14:
mulpd %xmm3, %xmm4
@@ -597,7 +597,7 @@
BRANCH
jg .L10
jmp .L30
- ALIGN_2
+ ALIGN_2
.L18x:
#ifndef TRMMKERNEL
@@ -683,14 +683,14 @@
pxor %xmm6, %xmm6
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $2, %eax
@@ -699,7 +699,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
mulpd %xmm0, %xmm2
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm2, %xmm4
@@ -861,14 +861,14 @@
pxor %xmm6, %xmm6
movsd 4 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -881,7 +881,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
mulsd %xmm0, %xmm2
mulsd 2 * SIZE(BB), %xmm0
addsd %xmm2, %xmm4
@@ -1001,12 +1001,12 @@
testl $1, %eax
jle .L999
ALIGN_2
-
+
.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -1015,7 +1015,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -1072,7 +1072,7 @@
decl %eax
jne .L104
ALIGN_4
-
+
.L105:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1112,7 +1112,7 @@
pxor %xmm6, %xmm6
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
@@ -1120,7 +1120,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1133,7 +1133,7 @@
sarl $3, %eax
je .L112
-.L111:
+.L111:
mulpd %xmm2, %xmm0
mulpd 2 * SIZE(AA), %xmm2
addpd %xmm0, %xmm4
@@ -1255,7 +1255,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
movl M, %ebx
@@ -1292,7 +1292,7 @@
pxor %xmm6, %xmm6
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
@@ -1300,7 +1300,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1313,7 +1313,7 @@
sarl $3, %eax
je .L132
-.L131:
+.L131:
mulpd %xmm0, %xmm2
movapd 2 * SIZE(AA), %xmm0
addpd %xmm2, %xmm4
@@ -1438,14 +1438,14 @@
pxor %xmm6, %xmm6
movapd 4 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
@@ -1454,7 +1454,7 @@
sarl $3, %eax
je .L152
-.L151:
+.L151:
mulsd %xmm0, %xmm2
movsd 1 * SIZE(AA), %xmm0
addsd %xmm2, %xmm4
diff --git a/kernel/x86/gemm_kernel_4x4_barcelona.S b/kernel/x86/gemm_kernel_4x4_barcelona.S
index f081aec..df11ba5 100644
--- a/kernel/x86/gemm_kernel_4x4_barcelona.S
+++ b/kernel/x86/gemm_kernel_4x4_barcelona.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -203,7 +203,7 @@
andl $-1024, %esp # align stack
STACK_TOUCHING
-
+
movl OLD_N, %eax
movl OLD_K, %ecx
movl OLD_A, %edx
@@ -230,7 +230,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
leal (, LDC, SIZE), LDC
@@ -242,7 +242,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -251,7 +251,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
@@ -307,7 +307,7 @@
addl $4 * SIZE, %edi
ALIGN_4
-
+
.L10:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -328,7 +328,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -351,7 +351,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -365,7 +365,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -563,7 +563,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -579,7 +579,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -809,7 +809,7 @@
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
leal (BB, %eax, 8), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -825,7 +825,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1047,14 +1047,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $2, %eax
jle .L45
ALIGN_4
-
+
.L42:
prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
@@ -1111,7 +1111,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1132,7 +1132,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1152,7 +1152,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1313,7 +1313,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1332,7 +1332,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1496,7 +1496,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1515,7 +1515,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1651,7 +1651,7 @@
addl $2, KK
#endif
leal (, LDC, 2), %eax
- addl %eax, C
+ addl %eax, C
ALIGN_4
.L80:
@@ -1661,7 +1661,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1669,7 +1669,7 @@
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
@@ -1722,7 +1722,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1743,7 +1743,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1762,7 +1762,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1889,7 +1889,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1908,7 +1908,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2034,7 +2034,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2053,7 +2053,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_4x4_penryn.S b/kernel/x86/gemm_kernel_4x4_penryn.S
index 2d51d97..e3f7384 100644
--- a/kernel/x86/gemm_kernel_4x4_penryn.S
+++ b/kernel/x86/gemm_kernel_4x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -116,7 +116,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -135,7 +135,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sall $BASE_SHIFT + 2, %eax
@@ -190,7 +190,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -483,7 +483,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -691,7 +691,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -854,7 +854,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -895,7 +895,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1091,7 +1091,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1251,7 +1251,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1397,7 +1397,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -1434,7 +1434,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1593,7 +1593,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1743,7 +1743,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_4x4_sse.S b/kernel/x86/gemm_kernel_4x4_sse.S
index b360a58..5503344 100644
--- a/kernel/x86/gemm_kernel_4x4_sse.S
+++ b/kernel/x86/gemm_kernel_4x4_sse.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -207,7 +207,7 @@
addps %xmm1, %xmm7; \
movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
#endif
-
+
#ifdef PENTIUM4
#define KERNEL1(address) \
mulps %xmm0, %xmm2; \
@@ -333,7 +333,7 @@
PROFCODE
EMMS
-
+
movl %esp, %esi # save old stack
subl $128 + LOCAL_BUFFER_SIZE, %esp
movl OLD_M, %ebx
@@ -367,7 +367,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
leal (, LDC, SIZE), LDC
@@ -379,7 +379,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -388,7 +388,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
#ifdef HAVE_SSE2
movss 0 * SIZE(%edi), %xmm0
@@ -524,7 +524,7 @@
#endif
addl $4 * SIZE, %edi
ALIGN_4
-
+
.L10:
movl %edi, BX
@@ -547,7 +547,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movl BX, %eax
@@ -607,7 +607,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -622,7 +622,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -841,7 +841,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -857,7 +857,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1099,7 +1099,7 @@
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
leal (BB, %eax, 8), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -1115,7 +1115,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1337,14 +1337,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $2, %eax
jle .L45
ALIGN_4
-
+
.L42:
prefetchnta 80 * SIZE(%edi)
@@ -1469,7 +1469,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1490,7 +1490,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1515,7 +1515,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1676,7 +1676,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1695,7 +1695,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1865,7 +1865,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1884,7 +1884,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2020,7 +2020,7 @@
addl $2, KK
#endif
leal (, LDC, 2), %eax
- addl %eax, C
+ addl %eax, C
ALIGN_4
.L80:
@@ -2030,14 +2030,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
prefetchnta 80 * SIZE(%edi)
@@ -2151,7 +2151,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -2172,7 +2172,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2195,7 +2195,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2322,7 +2322,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2341,7 +2341,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2470,7 +2470,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2489,7 +2489,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_4x4_sse3.S b/kernel/x86/gemm_kernel_4x4_sse3.S
index 78efab6..12581d9 100644
--- a/kernel/x86/gemm_kernel_4x4_sse3.S
+++ b/kernel/x86/gemm_kernel_4x4_sse3.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -247,7 +247,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
leal (, LDC, SIZE), LDC
@@ -259,7 +259,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -268,7 +268,7 @@
sarl $2, %eax
jle .L05
ALIGN_4
-
+
.L02:
movddup 0 * SIZE(%edi), %xmm0
movddup 2 * SIZE(%edi), %xmm1
@@ -316,7 +316,7 @@
decl %eax
jne .L06
ALIGN_4
-
+
.L10:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -337,7 +337,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -360,7 +360,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -375,7 +375,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -677,7 +677,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -691,7 +691,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -867,7 +867,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -881,7 +881,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1046,14 +1046,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L45
ALIGN_4
-
+
.L42:
movddup 0 * SIZE(%edi), %xmm0
movddup 2 * SIZE(%edi), %xmm1
@@ -1098,7 +1098,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1119,7 +1119,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1138,7 +1138,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1298,7 +1298,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1314,7 +1314,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1446,7 +1446,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1462,7 +1462,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1590,7 +1590,7 @@
addl $2, KK
#endif
leal (, LDC, 2), %eax
- addl %eax, C
+ addl %eax, C
ALIGN_4
.L80:
@@ -1600,14 +1600,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
movss 0 * SIZE(%edi), %xmm0
movss 1 * SIZE(%edi), %xmm1
@@ -1661,7 +1661,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1682,7 +1682,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 1), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1702,7 +1702,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1828,7 +1828,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 1), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1847,7 +1847,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1971,7 +1971,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1987,7 +1987,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/gemm_kernel_8x1_sse2.S b/kernel/x86/gemm_kernel_8x1_sse2.S
index 52a9ebc..fbeef0f 100644
--- a/kernel/x86/gemm_kernel_8x1_sse2.S
+++ b/kernel/x86/gemm_kernel_8x1_sse2.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -216,7 +216,7 @@
movl %eax, J
jle .L999
ALIGN_2
-
+
.L01:
/* Copying to Sub Buffer */
movl K, %eax
@@ -224,7 +224,7 @@
sarl $3, %eax
jle .L03
ALIGN_4
-
+
.L02:
prefetchnta 96 * SIZE(B)
@@ -279,7 +279,7 @@
decl %eax
jne .L04
ALIGN_4
-
+
.L05:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -306,7 +306,7 @@
andl $-8, %eax
leal (, %eax, 8), %eax
je .L12
-
+
KERNELMACRO(32 * 0) # 0
cmpl $64 * 1, %eax
jle .L11
@@ -372,7 +372,7 @@
#define PRE 40
-.L11:
+.L11:
mulpd %xmm0, %xmm1
movd (PRE + 0) * SIZE(AA), %mm0
addpd %xmm1, %xmm4
@@ -544,7 +544,7 @@
BRANCH
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L20:
movl M, %ebx
@@ -567,7 +567,7 @@
sarl $3, %eax
je .L22
-.L21:
+.L21:
movapd 0 * SIZE(BB), %xmm0
movapd 0 * SIZE(AA), %xmm1
mulpd %xmm0, %xmm1
@@ -690,7 +690,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
movapd 0 * SIZE(BB), %xmm0
movapd 0 * SIZE(AA), %xmm1
mulpd %xmm0, %xmm1
@@ -791,7 +791,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
movsd 0 * SIZE(AA), %xmm0
mulsd 0 * SIZE(BB), %xmm0
addsd %xmm0, %xmm4
diff --git a/kernel/x86/gemm_kernel_8x2_core2.S b/kernel/x86/gemm_kernel_8x2_core2.S
index 3fd8c56..6e2edc4 100644
--- a/kernel/x86/gemm_kernel_8x2_core2.S
+++ b/kernel/x86/gemm_kernel_8x2_core2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -114,7 +114,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
shufps $0, %xmm3, %xmm3
@@ -142,13 +142,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
jle .L05
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
movss -32 * SIZE(B), %xmm0
@@ -207,7 +207,7 @@
decl %eax
jne .L06
ALIGN_4
-
+
.L10:
movl C, C1
movl A, AA
@@ -244,7 +244,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -494,7 +494,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -662,7 +662,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -825,7 +825,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -977,13 +977,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $3, %eax
jle .L55
ALIGN_4
-
+
.L52:
movss -32 * SIZE(B), %xmm0
movss -31 * SIZE(B), %xmm1
@@ -1035,7 +1035,7 @@
decl %eax
jne .L56
ALIGN_4
-
+
.L60:
movl C, C1
movl A, AA
@@ -1073,7 +1073,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1240,7 +1240,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1379,7 +1379,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1513,7 +1513,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1610,7 +1610,7 @@
.L999:
movl OLD_STACK, %esp
-
+
EMMS
popl %ebx
diff --git a/kernel/x86/gemm_kernel_8x2_sse.S b/kernel/x86/gemm_kernel_8x2_sse.S
index c389764..f855263 100644
--- a/kernel/x86/gemm_kernel_8x2_sse.S
+++ b/kernel/x86/gemm_kernel_8x2_sse.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -218,7 +218,7 @@
addl $STACK_OFFSET, %esp
STACK_TOUCHING
-
+
movd STACK_M, %mm0
movl STACK_N, %eax
movd STACK_K, %mm1
@@ -247,7 +247,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
leal (, LDC, SIZE), LDC
@@ -256,12 +256,12 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
movl K, %eax
@@ -269,7 +269,7 @@
sarl $2, %eax
jle .L03
ALIGN_4
-
+
.L02:
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
@@ -299,7 +299,7 @@
movaps %xmm7, 28 * SIZE(%ecx)
prefetcht0 104 * SIZE(B)
-
+
addl $ 8 * SIZE, B
addl $32 * SIZE, %ecx
decl %eax
@@ -367,7 +367,7 @@
xorps %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
prefetchnta 7 * SIZE(%esi)
prefetchnta 7 * SIZE(%esi, %ebp)
@@ -377,7 +377,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -391,7 +391,7 @@
NOBRANCH
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -518,7 +518,7 @@
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
prefetchnta 8 * SIZE(%esi)
prefetchnta 8 * SIZE(%esi, %ebp)
@@ -528,7 +528,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -791,7 +791,7 @@
BRANCH
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L30:
movl M, %ebx
@@ -830,14 +830,14 @@
xorps %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -936,14 +936,14 @@
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1086,7 +1086,7 @@
#endif
addl $4 * SIZE, %esi
- ALIGN_2
+ ALIGN_2
.L50:
testl $2, %ebx
@@ -1122,14 +1122,14 @@
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1236,14 +1236,14 @@
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1385,7 +1385,7 @@
#endif
addl $2 * SIZE, %esi
- ALIGN_2
+ ALIGN_2
.L70:
testl $1, %ebx
@@ -1420,14 +1420,14 @@
xorps %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1526,14 +1526,14 @@
xorps %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1658,7 +1658,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addl $1, KK
#endif
- ALIGN_2
+ ALIGN_2
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1677,12 +1677,12 @@
testl $1, %eax
jle .L999
ALIGN_2
-
+
.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
movl K, %eax
@@ -1690,10 +1690,10 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
prefetchnta 96 * SIZE(B)
-
+
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
movss 2 * SIZE(B), %xmm2
@@ -1785,14 +1785,14 @@
xorps %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1892,14 +1892,14 @@
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2045,7 +2045,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
movl M, %ebx
@@ -2084,14 +2084,14 @@
xorps %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2168,14 +2168,14 @@
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2278,7 +2278,7 @@
addl $4, KK
#endif
addl $4 * SIZE, %esi
- ALIGN_2
+ ALIGN_2
.L150:
testl $2, %ebx
@@ -2313,14 +2313,14 @@
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2403,14 +2403,14 @@
xorps %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2518,7 +2518,7 @@
addl $2, KK
#endif
addl $2 * SIZE, %esi
- ALIGN_2
+ ALIGN_2
.L170:
testl $1, %ebx
@@ -2553,14 +2553,14 @@
xorps %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2637,14 +2637,14 @@
xorps %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2728,7 +2728,7 @@
addss 0 * SIZE(%esi), %xmm4
#endif
movss %xmm4, 0 * SIZE(%esi)
- ALIGN_2
+ ALIGN_2
.L999:
movl OLD_STACK, %esp
diff --git a/kernel/x86/gemm_ncopy_2.S b/kernel/x86/gemm_ncopy_2.S
index a2674c7..e828825 100644
--- a/kernel/x86/gemm_ncopy_2.S
+++ b/kernel/x86/gemm_ncopy_2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 8
-
+
#define J 0 + STACK(%esp)
#define M 4 + STACK + ARGS(%esp)
diff --git a/kernel/x86/gemm_ncopy_2_sse.S b/kernel/x86/gemm_ncopy_2_sse.S
index 1a8262c..7a6613d 100644
--- a/kernel/x86/gemm_ncopy_2_sse.S
+++ b/kernel/x86/gemm_ncopy_2_sse.S
@@ -46,7 +46,7 @@
#define STACK 16
#define ARGS 0
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define ARG_A 12 + STACK + ARGS(%esp)
@@ -60,7 +60,7 @@
#define A2 %edx
#define I %esi
#define J %edi
-
+
PROLOGUE
pushl %ebp
diff --git a/kernel/x86/gemm_ncopy_4_sse.S b/kernel/x86/gemm_ncopy_4_sse.S
index 3e919b2..4c26b95 100644
--- a/kernel/x86/gemm_ncopy_4_sse.S
+++ b/kernel/x86/gemm_ncopy_4_sse.S
@@ -46,7 +46,7 @@
#define STACK 16
#define ARGS 0
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define ARG_A 12 + STACK + ARGS(%esp)
@@ -60,7 +60,7 @@
#define A2 %edx
#define I %esi
#define J %edi
-
+
PROLOGUE
pushl %ebp
diff --git a/kernel/x86/gemm_tcopy_2.S b/kernel/x86/gemm_tcopy_2.S
index 61b7754..3d862b6 100644
--- a/kernel/x86/gemm_tcopy_2.S
+++ b/kernel/x86/gemm_tcopy_2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 8
-
+
#define J 0 + STACK(%esp)
#define BOFFSET2 4 + STACK(%esp)
@@ -60,7 +60,7 @@
pushl %ebx
PROFCODE
-
+
EMMS
movl A, %ebp
diff --git a/kernel/x86/gemm_tcopy_2_sse.S b/kernel/x86/gemm_tcopy_2_sse.S
index de5f4ff..3a5b7c6 100644
--- a/kernel/x86/gemm_tcopy_2_sse.S
+++ b/kernel/x86/gemm_tcopy_2_sse.S
@@ -46,7 +46,7 @@
#define STACK 16
#define ARGS 8
-
+
#define J 0 + STACK(%esp)
#define BOFFSET2 4 + STACK(%esp)
@@ -65,7 +65,7 @@
pushl %ebx
PROFCODE
-
+
movl A, %ebp
movl B, %edi
diff --git a/kernel/x86/gemm_tcopy_4_sse.S b/kernel/x86/gemm_tcopy_4_sse.S
index 4e1e2e6..bc84f69 100644
--- a/kernel/x86/gemm_tcopy_4_sse.S
+++ b/kernel/x86/gemm_tcopy_4_sse.S
@@ -46,7 +46,7 @@
#define STACK 16
#define ARGS 0
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define ARG_A 12 + STACK + ARGS(%esp)
@@ -60,7 +60,7 @@
#define A2 %edx
#define I %esi
#define J %edi
-
+
PROLOGUE
pushl %ebp
diff --git a/kernel/x86/gemv_n.S b/kernel/x86/gemv_n.S
index 652c0bb..53dfd4e 100644
--- a/kernel/x86/gemv_n.S
+++ b/kernel/x86/gemv_n.S
@@ -53,7 +53,7 @@
#define STACK 16
#define ARGS 16
-
+
#define PLDA_M 0 + STACK(%esp)
#define XP 4 + STACK(%esp)
#define MIN_N 8 + STACK(%esp)
@@ -190,7 +190,7 @@
ALIGN_2
.L48:
- movl A, %edx # a_offset = a
+ movl A, %edx # a_offset = a
fldz
addl $4 * SIZE, A # a += 4
fldz
@@ -261,7 +261,7 @@
FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(2) # ct1 += at1
-
+
FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(3) # ct2 += at1
@@ -279,7 +279,7 @@
FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(2) # ct1 += at1
-
+
FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(3) # ct2 += at1
@@ -306,7 +306,7 @@
FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(2) # ct1 += at1
-
+
FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(3) # ct2 += at1
diff --git a/kernel/x86/gemv_n_atom.S b/kernel/x86/gemv_n_atom.S
index e88409c..f300740 100644
--- a/kernel/x86/gemv_n_atom.S
+++ b/kernel/x86/gemv_n_atom.S
@@ -57,7 +57,7 @@
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
-
+
#define I %eax
#define J %ebx
@@ -93,7 +93,7 @@
jle .L999
movl BUFFER, Y1
-
+
pxor %xmm7, %xmm7
movl M, %eax
@@ -767,7 +767,7 @@
.L999:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
ret
diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S
index f3a388f..3c77a2a 100644
--- a/kernel/x86/gemv_n_sse.S
+++ b/kernel/x86/gemv_n_sse.S
@@ -105,7 +105,7 @@
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
-
+
#define I %eax
#define J %ebx
@@ -169,7 +169,7 @@
jle .L999
movl BUFFER, Y1
-
+
xorps %xmm7, %xmm7
movl M, %eax
@@ -697,7 +697,7 @@
.L999x:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
addl $ARGS,%esp
ret
diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S
index eeb3c25..2b2cc64 100644
--- a/kernel/x86/gemv_n_sse2.S
+++ b/kernel/x86/gemv_n_sse2.S
@@ -92,7 +92,7 @@
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
-
+
#define I %eax
#define J %ebx
@@ -157,7 +157,7 @@
jle .L999
movl BUFFER, Y1
-
+
pxor %xmm7, %xmm7
movl M, %eax
@@ -724,7 +724,7 @@
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
addl $ARGS,%esp
ret
diff --git a/kernel/x86/gemv_t.S b/kernel/x86/gemv_t.S
index 2eecd3f..0d2a251 100644
--- a/kernel/x86/gemv_t.S
+++ b/kernel/x86/gemv_t.S
@@ -49,7 +49,7 @@
#define STACK 16
#define ARGS 24
-
+
#define NLDA 0 + STACK(%esp)
#define XP 4 + STACK(%esp)
#define MIN_M 8 + STACK(%esp)
@@ -305,7 +305,7 @@
addl $4 * SIZE, %esi
#else
-
+
#if defined(HAS_PREFETCH)
prefetcht0 PRESIZE * SIZE(%ebx)
prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2)
diff --git a/kernel/x86/gemv_t_atom.S b/kernel/x86/gemv_t_atom.S
index a21416d..43ff0f0 100644
--- a/kernel/x86/gemv_t_atom.S
+++ b/kernel/x86/gemv_t_atom.S
@@ -57,7 +57,7 @@
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
-
+
#define I %eax
#define J %ebx
@@ -95,7 +95,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl M, I
sarl $3, I
jle .L05
@@ -365,7 +365,7 @@
addsd %xmm6, %xmm0
addsd %xmm7, %xmm1
-
+
addl $4 * SIZE, A1
addl $4 * SIZE, X
ALIGN_4
@@ -582,7 +582,7 @@
mulsd %xmm3, %xmm5
addsd %xmm4, %xmm0
addsd %xmm5, %xmm1
-
+
addl $2 * SIZE, A1
ALIGN_4
@@ -605,11 +605,11 @@
movsd %xmm0, (Y1)
ALIGN_4
-
+
.L999:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
ret
diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S
index 48193f1..2c927aa 100644
--- a/kernel/x86/gemv_t_sse.S
+++ b/kernel/x86/gemv_t_sse.S
@@ -105,7 +105,7 @@
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)
-
+
#define I %eax
#define J %ebx
@@ -139,10 +139,10 @@
sall $22,J # J=2^24*sizeof(float)=buffer size(16MB)
subl $8, J # Don't use last 8 float in the buffer.
subl J,MMM # MMM=MMM-J
- movl J,M
+ movl J,M
jge .L00t
ALIGN_4
-
+
movl MMM,%eax
addl J,%eax
jle .L999x
@@ -171,7 +171,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl M, I
sarl $3, I
jle .L05
@@ -423,7 +423,7 @@
mulps %xmm2, %xmm5
addps %xmm5, %xmm1
movaps %xmm3, %xmm2
-
+
addl $4 * SIZE, A1
ALIGN_4
@@ -446,7 +446,7 @@
mulps %xmm2, %xmm5
addps %xmm5, %xmm1
movhlps %xmm2, %xmm2
-
+
addl $2 * SIZE, A1
ALIGN_4
@@ -621,7 +621,7 @@
mulps %xmm2, %xmm4
addps %xmm4, %xmm0
movaps %xmm3, %xmm2
-
+
addl $4 * SIZE, A1
ALIGN_4
@@ -637,7 +637,7 @@
mulps %xmm2, %xmm4
addps %xmm4, %xmm0
movhlps %xmm2, %xmm2
-
+
addl $2 * SIZE, A1
ALIGN_4
@@ -673,7 +673,7 @@
movss %xmm0, (Y1)
ALIGN_4
-
+
.L999:
movl M,J
leal (,J,SIZE),%eax
@@ -687,7 +687,7 @@
.L999x:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
addl $ARGS,%esp
diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S
index 75ed89a..b94723a 100644
--- a/kernel/x86/gemv_t_sse2.S
+++ b/kernel/x86/gemv_t_sse2.S
@@ -128,10 +128,10 @@
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
subl $4, J # Don't use last 4 double in the buffer.
subl J,MMM # MMM=MMM-J
- movl J,M
+ movl J,M
jge .L00t
ALIGN_4
-
+
movl MMM,%eax
addl J,%eax
jle .L999x
@@ -161,7 +161,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl M, I
sarl $3, I
jle .L05
@@ -391,7 +391,7 @@
mulpd %xmm2, %xmm5
addpd %xmm5, %xmm1
movapd %xmm3, %xmm2
-
+
addl $2 * SIZE, A1
ALIGN_4
@@ -562,7 +562,7 @@
mulpd %xmm2, %xmm4
addpd %xmm4, %xmm0
movapd %xmm3, %xmm2
-
+
addl $2 * SIZE, A1
ALIGN_4
@@ -594,7 +594,7 @@
movlpd %xmm0, (Y1)
ALIGN_4
-
+
.L999:
movl M,J
leal (,J,SIZE),%eax
@@ -608,7 +608,7 @@
.L999x:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
addl $ARGS,%esp
diff --git a/kernel/x86/iamax.S b/kernel/x86/iamax.S
index 33204c0..1a73784 100644
--- a/kernel/x86/iamax.S
+++ b/kernel/x86/iamax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -101,7 +101,7 @@
FLD (X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
addl INCX, X
decl M
@@ -114,7 +114,7 @@
sarl $3, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -122,7 +122,7 @@
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -132,7 +132,7 @@
FLD 1 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -142,7 +142,7 @@
FLD 2 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -152,7 +152,7 @@
FLD 3 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -162,7 +162,7 @@
FLD 4 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -172,7 +172,7 @@
FLD 5 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -182,7 +182,7 @@
FLD 6 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -192,7 +192,7 @@
FLD 7 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -216,7 +216,7 @@
.L21:
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -235,12 +235,12 @@
sarl $3, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -251,7 +251,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -262,7 +262,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -273,7 +273,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -284,7 +284,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -295,7 +295,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -306,7 +306,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -317,7 +317,7 @@
FLD 0 * SIZE(X)
addl INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -339,7 +339,7 @@
.L61:
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi %st(1), %st
FMOV %st(1), %st(0)
diff --git a/kernel/x86/iamax_sse.S b/kernel/x86/iamax_sse.S
index 3b64ebd..dcd62bf 100644
--- a/kernel/x86/iamax_sse.S
+++ b/kernel/x86/iamax_sse.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -54,7 +54,7 @@
#define MM %ebp
#define XX %edi
#define TEMP %ebx
-
+
#ifdef USE_MIN
#define maxps minps
#define maxss minss
@@ -163,7 +163,7 @@
sarl $4, I
jle .L15
ALIGN_4
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -229,7 +229,7 @@
#endif
maxps %xmm4, %xmm2
addl $4 * SIZE, XX
- ALIGN_3
+ ALIGN_3
.L17:
testl $2, MM
@@ -242,7 +242,7 @@
#endif
maxps %xmm4, %xmm3
addl $2 * SIZE, XX
-
+
.L18:
testl $1, MM
je .L20
@@ -312,7 +312,7 @@
sarl $3, I
jle .L25
ALIGN_4
-
+
.L23:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -418,7 +418,7 @@
incl RET
comiss %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L26:
testl $2, MM
@@ -438,7 +438,7 @@
comiss %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L27:
incl RET
jmp .L999
@@ -450,7 +450,7 @@
sarl $4, I
jle .L35
ALIGN_4
-
+
.L31:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -524,7 +524,7 @@
#endif
maxps %xmm4, %xmm2
addl $4 * SIZE, XX
- ALIGN_3
+ ALIGN_3
.L37:
testl $2, MM
@@ -537,7 +537,7 @@
#endif
maxps %xmm4, %xmm3
addl $2 * SIZE, XX
-
+
.L38:
testl $1, MM
je .L40
@@ -569,7 +569,7 @@
sarl $3, I
jle .L45
ALIGN_4
-
+
.L43:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -677,7 +677,7 @@
incl RET
comiss %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L46:
testl $2, MM
@@ -697,7 +697,7 @@
comiss %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L47:
incl RET
jmp .L999
@@ -708,7 +708,7 @@
sarl $3, I
jle .L85
ALIGN_4
-
+
.L81:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -808,7 +808,7 @@
andps %xmm7, %xmm4
#endif
maxss %xmm4, %xmm3
- ALIGN_3
+ ALIGN_3
.L86:
testl $2, MM
@@ -828,7 +828,7 @@
#endif
maxss %xmm4, %xmm1
ALIGN_3
-
+
.L87:
testl $1, MM
je .L90
@@ -854,7 +854,7 @@
sarl $2, I
jle .L96
ALIGN_4
-
+
.L92:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -932,7 +932,7 @@
incl RET
comiss %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L96:
testl $2, MM
@@ -953,7 +953,7 @@
comiss %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L97:
incl RET
ALIGN_3
diff --git a/kernel/x86/iamax_sse2.S b/kernel/x86/iamax_sse2.S
index a0ddb26..caa6fc8 100644
--- a/kernel/x86/iamax_sse2.S
+++ b/kernel/x86/iamax_sse2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -54,7 +54,7 @@
#define MM %ebp
#define XX %edi
#define TEMP %ebx
-
+
#ifdef USE_MIN
#define maxpd minpd
#define maxsd minsd
@@ -137,7 +137,7 @@
sarl $4, I
jle .L15
ALIGN_4
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -249,7 +249,7 @@
#endif
maxpd %xmm4, %xmm1
addl $4 * SIZE, XX
- ALIGN_3
+ ALIGN_3
.L17:
testl $2, MM
@@ -261,7 +261,7 @@
#endif
maxpd %xmm4, %xmm2
addl $2 * SIZE, XX
-
+
.L18:
testl $1, MM
je .L20
@@ -306,7 +306,7 @@
sarl $3, I
jle .L25
ALIGN_4
-
+
.L22:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -425,7 +425,7 @@
incl RET
comisd %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L27:
testl $2, MM
@@ -445,7 +445,7 @@
comisd %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L28:
incl RET
jmp .L999
@@ -584,7 +584,7 @@
#endif
maxpd %xmm4, %xmm1
addl $4 * SIZE, XX
- ALIGN_3
+ ALIGN_3
.L57:
testl $2, MM
@@ -597,7 +597,7 @@
#endif
maxpd %xmm4, %xmm2
addl $2 * SIZE, XX
-
+
.L58:
testl $1, MM
je .L60
@@ -626,7 +626,7 @@
sarl $3, I
jle .L65
ALIGN_4
-
+
.L62:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -704,13 +704,13 @@
movsd 4 * SIZE(XX), %xmm1
movsd 5 * SIZE(XX), %xmm2
movsd 6 * SIZE(XX), %xmm3
-
+
#ifdef USE_ABS
andpd %xmm7, %xmm1
andpd %xmm7, %xmm2
andpd %xmm7, %xmm3
#endif
-
+
comisd %xmm0, %xmm1
je .L999
incl RET
@@ -750,7 +750,7 @@
incl RET
comisd %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L67:
testl $2, MM
@@ -770,7 +770,7 @@
comisd %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L68:
incl RET
jmp .L999
@@ -781,7 +781,7 @@
sarl $4, I
jle .L85
ALIGN_4
-
+
.L81:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -932,7 +932,7 @@
andpd %xmm7, %xmm4
#endif
maxpd %xmm4, %xmm1
- ALIGN_3
+ ALIGN_3
.L87:
testl $2, MM
@@ -947,7 +947,7 @@
#endif
maxpd %xmm4, %xmm2
ALIGN_3
-
+
.L88:
testl $1, MM
je .L90
@@ -976,7 +976,7 @@
sarl $3, I
jle .L95
ALIGN_4
-
+
.L92:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -1116,7 +1116,7 @@
incl RET
comisd %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L97:
testl $2, MM
@@ -1137,7 +1137,7 @@
comisd %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L98:
incl RET
ALIGN_3
diff --git a/kernel/x86/izamax.S b/kernel/x86/izamax.S
index 63bcaef..de324ad 100644
--- a/kernel/x86/izamax.S
+++ b/kernel/x86/izamax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -99,9 +99,9 @@
movl $1, RET
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
addl INCX, X
decl M
@@ -114,16 +114,16 @@
sarl $2, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -132,9 +132,9 @@
incl NUM
FLD 2 * SIZE(X)
- fabs
+ fabs
FLD 3 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -143,9 +143,9 @@
incl NUM
FLD 4 * SIZE(X)
- fabs
+ fabs
FLD 5 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -154,9 +154,9 @@
incl NUM
FLD 6 * SIZE(X)
- fabs
+ fabs
FLD 7 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -178,9 +178,9 @@
.L21:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -199,12 +199,12 @@
sarl $2, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addl INCX, X
faddp %st, %st(1)
fcomi %st(1), %st
@@ -214,9 +214,9 @@
incl NUM
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addl INCX, X
faddp %st, %st(1)
fcomi %st(1), %st
@@ -226,9 +226,9 @@
incl NUM
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addl INCX, X
faddp %st, %st(1)
fcomi %st(1), %st
@@ -238,9 +238,9 @@
incl NUM
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addl INCX, X
faddp %st, %st(1)
fcomi %st(1), %st
@@ -262,9 +262,9 @@
.L61:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
diff --git a/kernel/x86/izamax_sse.S b/kernel/x86/izamax_sse.S
index 95223fe..eed58be 100644
--- a/kernel/x86/izamax_sse.S
+++ b/kernel/x86/izamax_sse.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -54,12 +54,12 @@
#define MM %ebp
#define XX %edi
#define TEMP %ebx
-
+
#ifdef USE_MIN
#define maxps minps
#define maxss minss
#endif
-
+
#ifndef HAVE_SSE2
#define pxor xorps
#define movsd movlps
@@ -126,7 +126,7 @@
sarl $3, I
jle .L35
ALIGN_4
-
+
.L31:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -209,7 +209,7 @@
maxss %xmm1, %xmm0
maxss %xmm3, %xmm0
addl $4 * SIZE, XX
- ALIGN_3
+ ALIGN_3
.L37:
testl $1, MM
@@ -239,7 +239,7 @@
sarl $2, I
jle .L45
ALIGN_4
-
+
.L41:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -338,7 +338,7 @@
incl RET
comiss %xmm0, %xmm3
je .L999
- ALIGN_3
+ ALIGN_3
.L47:
incl RET
@@ -350,7 +350,7 @@
sarl $3, I
jle .L75
ALIGN_4
-
+
.L71:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -422,7 +422,7 @@
andps %xmm7, %xmm3
addps %xmm3, %xmm1
maxps %xmm1, %xmm0
- ALIGN_3
+ ALIGN_3
.L76:
testl $2, MM
@@ -443,7 +443,7 @@
maxss %xmm1, %xmm0
maxss %xmm3, %xmm0
ALIGN_3
-
+
.L77:
testl $1, MM
je .L80
@@ -472,7 +472,7 @@
sarl $2, I
jle .L85
ALIGN_4
-
+
.L81:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -580,7 +580,7 @@
incl RET
comiss %xmm0, %xmm3
je .L999
- ALIGN_3
+ ALIGN_3
.L87:
incl RET
diff --git a/kernel/x86/izamax_sse2.S b/kernel/x86/izamax_sse2.S
index 0392e1d..d9e7a8b 100644
--- a/kernel/x86/izamax_sse2.S
+++ b/kernel/x86/izamax_sse2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -54,7 +54,7 @@
#define MM %ebp
#define XX %edi
#define TEMP %ebx
-
+
#ifdef USE_MIN
#define maxpd minpd
#define maxsd minsd
@@ -209,7 +209,7 @@
maxpd %xmm1, %xmm0
addl $4 * SIZE, XX
- ALIGN_3
+ ALIGN_3
.L27:
testl $1, MM
@@ -341,7 +341,7 @@
incl RET
comisd %xmm0, %xmm3
je .L999
- ALIGN_3
+ ALIGN_3
.L36:
incl RET
@@ -353,7 +353,7 @@
sarl $3, I
jle .L65
ALIGN_4
-
+
.L61:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -462,7 +462,7 @@
andpd %xmm7, %xmm2
addpd %xmm2, %xmm1
maxpd %xmm1, %xmm0
- ALIGN_3
+ ALIGN_3
.L67:
testl $1, MM
@@ -603,7 +603,7 @@
incl RET
comisd %xmm0, %xmm3
je .L999
- ALIGN_3
+ ALIGN_3
.L76:
incl RET
diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S
index c098249..7a14da8 100644
--- a/kernel/x86/nrm2.S
+++ b/kernel/x86/nrm2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -49,7 +49,7 @@
#define M %edx
#define X %ecx
#define INCX %esi
-
+
#define I %eax
#include "l1param.h"
@@ -91,7 +91,7 @@
sarl $3, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -153,7 +153,7 @@
sarl $3, I
jle .L60
ALIGN_4
-
+
.L50:
FLD (X)
addl INCX, X
diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S
index e704609..0f174c4 100644
--- a/kernel/x86/nrm2_sse.S
+++ b/kernel/x86/nrm2_sse.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -49,7 +49,7 @@
#define M %edx
#define X %ecx
#define INCX %esi
-
+
#define I %eax
#include "l1param.h"
@@ -79,7 +79,7 @@
testl $SIZE, X
je .L05
-
+
movss -32 * SIZE(X), %xmm0
cvtss2sd %xmm0, %xmm0
mulsd %xmm0, %xmm0
@@ -93,7 +93,7 @@
movl M, I
sarl $4, I
jle .L13
-
+
movsd -32 * SIZE(X), %xmm4
movsd -30 * SIZE(X), %xmm5
movsd -28 * SIZE(X), %xmm6
@@ -267,7 +267,7 @@
sarl $3, I
jle .L44
ALIGN_4
-
+
.L41:
movss (X), %xmm4
addl INCX, X
diff --git a/kernel/x86/qaxpy.S b/kernel/x86/qaxpy.S
index 0497ea3..6298e40 100644
--- a/kernel/x86/qaxpy.S
+++ b/kernel/x86/qaxpy.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA 16 + STACK + ARGS(%esp)
#define STACK_X 32 + STACK + ARGS(%esp)
@@ -154,7 +154,7 @@
#ifdef HAVE_3DNOW
prefetchw 24 * SIZE(Y)
#endif
-
+
addl $8 * SIZE, X
addl $8 * SIZE, Y
decl %eax
diff --git a/kernel/x86/qdot.S b/kernel/x86/qdot.S
index ce5ff29..21665ce 100644
--- a/kernel/x86/qdot.S
+++ b/kernel/x86/qdot.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86/qgemm_kernel_2x2.S b/kernel/x86/qgemm_kernel_2x2.S
index a2852f2..55748b1 100644
--- a/kernel/x86/qgemm_kernel_2x2.S
+++ b/kernel/x86/qgemm_kernel_2x2.S
@@ -50,7 +50,7 @@
#define PREFETCHSIZE (5 + 4 * 10)
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -90,13 +90,13 @@
negl %eax
movl %eax, KK
#endif
-
+
movl ARG_LDC, LDC
movl ARG_B, B
addl $8 * SIZE, A
addl $8 * SIZE, B
-
+
sall $BASE_SHIFT, LDC
movl N, %eax
@@ -109,7 +109,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl A, AO
@@ -132,7 +132,7 @@
sall $BASE_SHIFT, %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 2), BO
-#endif
+#endif
fldz
fldz
@@ -152,7 +152,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -178,7 +178,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -196,7 +196,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -216,7 +216,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -234,7 +234,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -270,7 +270,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -351,7 +351,7 @@
sall $BASE_SHIFT, %eax
leal (AO, %eax, 1), AO
leal ( B, %eax, 2), BO
-#endif
+#endif
fldz
fldz
@@ -361,7 +361,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -500,13 +500,13 @@
.L30:
movl N, %eax
- testl $1, %eax
+ testl $1, %eax
je .L999
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl A, AO
@@ -528,7 +528,7 @@
sall $BASE_SHIFT, %eax
leal (AO, %eax, 2), AO
leal ( B, %eax, 1), BO
-#endif
+#endif
fldz
fldz
@@ -544,7 +544,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -685,7 +685,7 @@
sall $BASE_SHIFT, %eax
leal (AO, %eax, 1), AO
leal ( B, %eax, 1), BO
-#endif
+#endif
fldz
@@ -694,7 +694,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/qgemv_n.S b/kernel/x86/qgemv_n.S
index e33bce2..1d1ca47 100644
--- a/kernel/x86/qgemv_n.S
+++ b/kernel/x86/qgemv_n.S
@@ -53,7 +53,7 @@
#define STACK 16
#define ARGS 16
-
+
#define PLDA_M 0 + STACK(%esp)
#define XP 4 + STACK(%esp)
#define MIN_N 8 + STACK(%esp)
@@ -184,7 +184,7 @@
ALIGN_2
.L48:
- movl A, %edx # a_offset = a
+ movl A, %edx # a_offset = a
fldz
addl $4 * SIZE, A # a += 4
fldz
@@ -255,7 +255,7 @@
FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(2) # ct1 += at1
-
+
FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(3) # ct2 += at1
@@ -274,7 +274,7 @@
FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(2) # ct1 += at1
-
+
FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(3) # ct2 += at1
@@ -302,7 +302,7 @@
FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(2) # ct1 += at1
-
+
FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(3) # ct2 += at1
diff --git a/kernel/x86/qgemv_t.S b/kernel/x86/qgemv_t.S
index ff2ba80..f5a77fc 100644
--- a/kernel/x86/qgemv_t.S
+++ b/kernel/x86/qgemv_t.S
@@ -49,7 +49,7 @@
#define STACK 16
#define ARGS 24
-
+
#define NLDA 0 + STACK(%esp)
#define XP 4 + STACK(%esp)
#define MIN_M 8 + STACK(%esp)
@@ -299,7 +299,7 @@
addl $4 * SIZE, %esi
#else
-
+
#if defined(HAS_PREFETCH)
prefetcht0 PRESIZE * SIZE(%ebx)
prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2)
diff --git a/kernel/x86/qtrsm_kernel_LN_2x2.S b/kernel/x86/qtrsm_kernel_LN_2x2.S
index 37c268b..749dec4 100644
--- a/kernel/x86/qtrsm_kernel_LN_2x2.S
+++ b/kernel/x86/qtrsm_kernel_LN_2x2.S
@@ -50,7 +50,7 @@
#define PREFETCHSIZE (5 + 4 * 10)
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -117,7 +117,7 @@
movl OFFSET, %eax
negl %eax
movl %eax, KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -159,7 +159,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -186,7 +186,7 @@
leal (B, %eax, 2), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
@@ -409,7 +409,7 @@
leal (B, %eax, 2), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
@@ -446,7 +446,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -464,7 +464,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -484,7 +484,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -502,7 +502,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -539,7 +539,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -758,7 +758,7 @@
.L30:
movl N, %eax
- testl $1, %eax
+ testl $1, %eax
je .L999
#if defined(LT) || defined(RN)
@@ -786,7 +786,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -813,7 +813,7 @@
leal (B, %eax, 1), BO
#else
movl B, BO
-#endif
+#endif
fldz
@@ -988,7 +988,7 @@
leal (B, %eax, 1), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
diff --git a/kernel/x86/qtrsm_kernel_LT_2x2.S b/kernel/x86/qtrsm_kernel_LT_2x2.S
index 157e12d..10c3986 100644
--- a/kernel/x86/qtrsm_kernel_LT_2x2.S
+++ b/kernel/x86/qtrsm_kernel_LT_2x2.S
@@ -50,7 +50,7 @@
#define PREFETCHSIZE (5 + 4 * 10)
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define AORIG 8 + STACK(%esp)
@@ -115,7 +115,7 @@
movl OFFSET, %eax
negl %eax
movl %eax, KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -157,7 +157,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -184,7 +184,7 @@
leal (B, %eax, 2), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
@@ -221,7 +221,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -239,7 +239,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -259,7 +259,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -277,7 +277,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -314,7 +314,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -529,7 +529,7 @@
leal (B, %eax, 2), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
@@ -756,7 +756,7 @@
.L30:
movl N, %eax
- testl $1, %eax
+ testl $1, %eax
je .L999
#if defined(LT) || defined(RN)
@@ -784,7 +784,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -811,7 +811,7 @@
leal (B, %eax, 1), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
@@ -1044,7 +1044,7 @@
leal (B, %eax, 1), BO
#else
movl B, BO
-#endif
+#endif
fldz
diff --git a/kernel/x86/qtrsm_kernel_RT_2x2.S b/kernel/x86/qtrsm_kernel_RT_2x2.S
index a0a4daf..3a00076 100644
--- a/kernel/x86/qtrsm_kernel_RT_2x2.S
+++ b/kernel/x86/qtrsm_kernel_RT_2x2.S
@@ -50,7 +50,7 @@
#define PREFETCHSIZE (5 + 4 * 10)
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -117,7 +117,7 @@
movl OFFSET, %eax
negl %eax
movl %eax, KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -126,7 +126,7 @@
#endif
movl N, %eax
- testl $1, %eax
+ testl $1, %eax
je .L30
#if defined(LT) || defined(RN)
@@ -154,7 +154,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -181,7 +181,7 @@
leal (B, %eax, 1), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
@@ -414,7 +414,7 @@
leal (B, %eax, 1), BO
#else
movl B, BO
-#endif
+#endif
fldz
@@ -623,7 +623,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -650,7 +650,7 @@
leal (B, %eax, 2), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
@@ -687,7 +687,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -705,7 +705,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -725,7 +725,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -743,7 +743,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -780,7 +780,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -995,7 +995,7 @@
leal (B, %eax, 2), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
diff --git a/kernel/x86/rot.S b/kernel/x86/rot.S
index 111266a..8448bee 100644
--- a/kernel/x86/rot.S
+++ b/kernel/x86/rot.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -110,7 +110,7 @@
sarl $2, I
jle .L15
ALIGN_4
-
+
.L10:
#ifdef PENTIUM4
PREFETCH (PREFETCH_SIZE + 0) * SIZE(X)
@@ -248,7 +248,7 @@
sarl $2, I
jle .L55
ALIGN_4
-
+
.L51:
FLD 0 * SIZE(X)
FLD 0 * SIZE(Y)
diff --git a/kernel/x86/rot_sse.S b/kernel/x86/rot_sse.S
index af9f12f..9495bcd 100644
--- a/kernel/x86/rot_sse.S
+++ b/kernel/x86/rot_sse.S
@@ -76,8 +76,8 @@
movl STACK_Y, Y
movl STACK_INCY, INCY
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
movss STACK_C, C
movss STACK_S, S
@@ -434,7 +434,7 @@
movaps %xmm0, 12 * SIZE(X)
movlps %xmm2, 12 * SIZE(Y)
movhps %xmm2, 14 * SIZE(Y)
-
+
addl $16 * SIZE, X
addl $16 * SIZE, Y
ALIGN_3
diff --git a/kernel/x86/rot_sse2.S b/kernel/x86/rot_sse2.S
index e9c5ba1..83931de 100644
--- a/kernel/x86/rot_sse2.S
+++ b/kernel/x86/rot_sse2.S
@@ -76,8 +76,8 @@
movl STACK_Y, Y
movl STACK_INCY, INCY
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
movsd STACK_C, C
movsd STACK_S, S
diff --git a/kernel/x86/scal_sse.S b/kernel/x86/scal_sse.S
index 48edfc5..53eba77 100644
--- a/kernel/x86/scal_sse.S
+++ b/kernel/x86/scal_sse.S
@@ -280,7 +280,7 @@
movaps %xmm0, %xmm4
mulps -20 * SIZE(X), %xmm4
- decl I
+ decl I
jle .L112
ALIGN_4
@@ -353,13 +353,13 @@
movaps %xmm4, -4 * SIZE(X)
#else
-
+
movaps -32 * SIZE(X), %xmm1
movaps -28 * SIZE(X), %xmm2
movaps -24 * SIZE(X), %xmm3
movaps -20 * SIZE(X), %xmm4
- decl I
+ decl I
jle .L112
ALIGN_4
diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S
index 35b7913..a278ecb 100644
--- a/kernel/x86/scal_sse2.S
+++ b/kernel/x86/scal_sse2.S
@@ -77,7 +77,7 @@
comisd %xmm0, %xmm1
jne .L100 # Alpha != ZERO
jp .L100 # For Alpha = NaN
-
+
/* Alpha == ZERO */
cmpl $SIZE, INCX
jne .L50
@@ -264,7 +264,7 @@
movaps %xmm0, %xmm4
mulpd -10 * SIZE(X), %xmm4
- decl I
+ decl I
jle .L112
ALIGN_4
@@ -342,7 +342,7 @@
movaps -12 * SIZE(X), %xmm3
movaps -10 * SIZE(X), %xmm4
- decl I
+ decl I
jle .L112
ALIGN_4
diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S
index d32c1a3..54b00b3 100644
--- a/kernel/x86/swap.S
+++ b/kernel/x86/swap.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define N 4 + STACK + ARGS(%esp)
#ifdef XDOUBLE
#define X 32 + STACK + ARGS(%esp)
diff --git a/kernel/x86/swap_sse.S b/kernel/x86/swap_sse.S
index 39c0d2f..e6cd4ad 100644
--- a/kernel/x86/swap_sse.S
+++ b/kernel/x86/swap_sse.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 20 + STACK + ARGS(%esp)
#define STACK_INCX 24 + STACK + ARGS(%esp)
@@ -53,7 +53,7 @@
#define Y %edi
#define INCX %ebx
#define INCY %ecx
-
+
#include "l1param.h"
PROLOGUE
@@ -80,7 +80,7 @@
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
-
+
cmpl $3, M
jle .L16
@@ -302,7 +302,7 @@
.L20:
movaps -33 * SIZE(X), %xmm0
movaps -32 * SIZE(Y), %xmm1
-
+
movss %xmm1, -32 * SIZE(X)
PSHUFD2($0x39, %xmm1, %xmm3)
movlps %xmm3, -31 * SIZE(X)
@@ -778,7 +778,7 @@
.L40:
movaps -35 * SIZE(X), %xmm0
movaps -32 * SIZE(Y), %xmm1
-
+
movss %xmm1, -32 * SIZE(X)
subl $3, M
diff --git a/kernel/x86/swap_sse2.S b/kernel/x86/swap_sse2.S
index b880812..9a3576c 100644
--- a/kernel/x86/swap_sse2.S
+++ b/kernel/x86/swap_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 24 + STACK + ARGS(%esp)
#define STACK_INCX 28 + STACK + ARGS(%esp)
@@ -96,7 +96,7 @@
.L10:
subl $-16 * SIZE, X
subl $-16 * SIZE, Y
-
+
testl $SIZE, X
jne .L20
diff --git a/kernel/x86/trsm_kernel_LN_2x2.S b/kernel/x86/trsm_kernel_LN_2x2.S
index d1c741b..587739c 100644
--- a/kernel/x86/trsm_kernel_LN_2x2.S
+++ b/kernel/x86/trsm_kernel_LN_2x2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -72,7 +72,7 @@
#else
#define REP rep
#endif
-
+
#define AA %edx
#define BB %ecx
@@ -112,7 +112,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -121,7 +121,7 @@
#endif
movl N, %eax # j = (n >> 1) # MEMORY
- sarl $1, %eax
+ sarl $1, %eax
movl %eax, J # j = (n >> 1) # MEMORY
je .L8
ALIGN_4
@@ -153,7 +153,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -178,7 +178,7 @@
leal (%ebx, %eax, 2), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
@@ -371,7 +371,7 @@
leal (%ebx, %eax, 2), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
@@ -741,7 +741,7 @@
movl N, %eax # n # MEMORY
andl $1, %eax
je .End
-
+
#if defined(LT) || defined(RN)
movl A, AA
#else
@@ -767,7 +767,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -792,7 +792,7 @@
leal (%ebx, %eax, 1), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
@@ -905,7 +905,7 @@
sarl $1, %esi # m >> 1
je .L99
ALIGN_4
-
+
.L46:
#ifdef LN
movl K, %eax
@@ -921,7 +921,7 @@
leal (%ebx, %eax, 1), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
diff --git a/kernel/x86/trsm_kernel_LN_2x2_atom.S b/kernel/x86/trsm_kernel_LN_2x2_atom.S
index 846a848..7624fde 100644
--- a/kernel/x86/trsm_kernel_LN_2x2_atom.S
+++ b/kernel/x86/trsm_kernel_LN_2x2_atom.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -83,7 +83,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -146,7 +146,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -175,7 +175,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -395,7 +395,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -724,7 +724,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -753,7 +753,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -922,7 +922,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(BB), %xmm1
xorps %xmm0, %xmm0
diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S
index 16ba9a0..0b475af 100644
--- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -95,7 +95,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -161,7 +161,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -190,7 +190,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
movhps -15 * SIZE(AA), %xmm0
@@ -495,7 +495,7 @@
#endif
ALIGN_4
-.L20:
+.L20:
movl M, %ebx
sarl $1, %ebx
jle .L29
@@ -521,7 +521,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
leal (CO1, LDC, 2), %eax
@@ -1006,7 +1006,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1035,7 +1035,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
movhps -15 * SIZE(AA), %xmm0
@@ -1278,7 +1278,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1611,7 +1611,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1640,7 +1640,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
movhps -15 * SIZE(AA), %xmm0
@@ -1827,7 +1827,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S
index b1dea62..8038081 100644
--- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S
+++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
@@ -204,7 +204,7 @@
PROFCODE
EMMS
-
+
movl %esp, %esi # save old stack
subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
@@ -256,7 +256,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -275,7 +275,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -291,7 +291,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 4), B
leal (BB, %eax, 8), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -307,7 +307,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
#define COPYPREFETCH 40
@@ -373,7 +373,7 @@
addl $4 * SIZE, B
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movl A, AA
@@ -414,7 +414,7 @@
movl KK, %eax
sall $3 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -787,7 +787,7 @@
movl KK, %eax
sall $3 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -824,7 +824,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -1266,7 +1266,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -1282,7 +1282,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1298,7 +1298,7 @@
sarl $2, %eax
jle .L35
ALIGN_4
-
+
.L32:
#define COPYPREFETCH 40
@@ -1363,7 +1363,7 @@
decl %eax
jne .L36
ALIGN_4
-
+
.L40:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1404,7 +1404,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1665,7 +1665,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1987,7 +1987,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -2003,7 +2003,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2019,7 +2019,7 @@
sarl $3, %eax
jle .L65
ALIGN_4
-
+
.L62:
#define COPYPREFETCH 40
@@ -2081,7 +2081,7 @@
decl %eax
jne .L66
ALIGN_4
-
+
.L70:
#if defined(LT) || defined(RN)
movl A, AA
@@ -2120,7 +2120,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -2326,7 +2326,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse3.S b/kernel/x86/trsm_kernel_LN_2x4_sse3.S
index 5ab4ab3..5b4c19b 100644
--- a/kernel/x86/trsm_kernel_LN_2x4_sse3.S
+++ b/kernel/x86/trsm_kernel_LN_2x4_sse3.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -95,7 +95,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -158,7 +158,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -187,7 +187,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -534,7 +534,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -982,7 +982,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1011,7 +1011,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1255,7 +1255,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1565,7 +1565,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1594,7 +1594,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
movhpd 1 * SIZE(AA), %xmm0
@@ -1797,7 +1797,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
diff --git a/kernel/x86/trsm_kernel_LN_4x2_core2.S b/kernel/x86/trsm_kernel_LN_4x2_core2.S
index d974fa6..94942b6 100644
--- a/kernel/x86/trsm_kernel_LN_4x2_core2.S
+++ b/kernel/x86/trsm_kernel_LN_4x2_core2.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -55,7 +55,7 @@
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)
-
+
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
@@ -141,7 +141,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -154,14 +154,14 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal 16 * SIZE + BUFFER, BB
@@ -177,7 +177,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -193,7 +193,7 @@
sarl $2, %eax
jle .L03
ALIGN_2
-
+
.L02:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -243,7 +243,7 @@
decl %eax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movl A, AA
@@ -285,7 +285,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -305,7 +305,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
mulsd %xmm0, %xmm1
mulsd -14 * SIZE(BB), %xmm0
addsd %xmm1, %xmm4
@@ -517,7 +517,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -537,7 +537,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
mulpd %xmm0, %xmm1
mulpd -14 * SIZE(BB), %xmm0
addpd %xmm1, %xmm4
@@ -789,7 +789,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1205,7 +1205,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L99:
#ifdef LN
@@ -1238,13 +1238,13 @@
testl $1, %eax
jle .L999
ALIGN_2
-
+
.L101:
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal 16 * SIZE + BUFFER, BB
@@ -1260,7 +1260,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1276,7 +1276,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -1324,7 +1324,7 @@
decl %eax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1364,7 +1364,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1382,7 +1382,7 @@
sarl $3, %eax
je .L152
-.L151:
+.L151:
mulsd %xmm0, %xmm1
movsd -15 * SIZE(AA), %xmm0
addsd %xmm1, %xmm4
@@ -1549,7 +1549,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1567,7 +1567,7 @@
sarl $3, %eax
je .L132
-.L131:
+.L131:
mulpd %xmm0, %xmm1
movapd -14 * SIZE(AA), %xmm0
addpd %xmm1, %xmm4
@@ -1713,7 +1713,7 @@
movddup %xmm0, %xmm1
unpckhpd %xmm0, %xmm0
-
+
movapd %xmm1, -16 * SIZE(BB)
movapd %xmm0, -14 * SIZE(BB)
#else
@@ -1773,7 +1773,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1793,7 +1793,7 @@
sarl $3, %eax
je .L112
-.L111:
+.L111:
mulpd %xmm1, %xmm0
mulpd -14 * SIZE(AA), %xmm1
addpd %xmm0, %xmm4
@@ -2059,7 +2059,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L159:
#ifdef LN
diff --git a/kernel/x86/trsm_kernel_LN_4x2_sse2.S b/kernel/x86/trsm_kernel_LN_4x2_sse2.S
index a1fb8a1..12625cc 100644
--- a/kernel/x86/trsm_kernel_LN_4x2_sse2.S
+++ b/kernel/x86/trsm_kernel_LN_4x2_sse2.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -55,7 +55,7 @@
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)
-
+
#define ALPHA 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
@@ -257,7 +257,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -270,14 +270,14 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -293,7 +293,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -309,7 +309,7 @@
sarl $2, %eax
jle .L03
ALIGN_2
-
+
.L02:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -373,7 +373,7 @@
decl %eax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movl A, AA
@@ -415,7 +415,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -435,7 +435,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
mulsd %xmm0, %xmm2
mulsd 2 * SIZE(BB), %xmm0
addsd %xmm2, %xmm4
@@ -648,7 +648,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -668,7 +668,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
mulpd %xmm0, %xmm2
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm2, %xmm4
@@ -938,7 +938,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -956,7 +956,7 @@
prefetcht2 4 * SIZE(%esi)
prefetcht2 4 * SIZE(%esi, LDC)
#endif
-
+
#if defined(LT) || defined(RN)
movl KK, %eax
#else
@@ -969,7 +969,7 @@
NOBRANCH
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -1062,7 +1062,7 @@
subl $64 * 8, %eax
BRANCH
jg .L1X
-
+
.L11:
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB
@@ -1071,7 +1071,7 @@
sarl $3, %eax
je .L12
-.L11:
+.L11:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -1117,7 +1117,7 @@
addl $4 * SIZE, BB # boffset1 += 8
subl $1, %eax
jg .L13
- ALIGN_4
+ ALIGN_4
.L14:
#if defined(LN) || defined(RT)
@@ -1382,7 +1382,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L99:
#ifdef LN
@@ -1415,14 +1415,14 @@
testl $1, %eax
jle .L999
ALIGN_2
-
+
.L101:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1438,7 +1438,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1454,7 +1454,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -1516,7 +1516,7 @@
decl %eax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1571,7 +1571,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1582,7 +1582,7 @@
sarl $3, %eax
je .L152
-.L151:
+.L151:
mulsd %xmm0, %xmm2
movsd 1 * SIZE(AA), %xmm0
addsd %xmm2, %xmm4
@@ -1752,7 +1752,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1763,7 +1763,7 @@
sarl $3, %eax
je .L132
-.L131:
+.L131:
mulpd %xmm0, %xmm2
movapd 2 * SIZE(AA), %xmm0
addpd %xmm2, %xmm4
@@ -1965,7 +1965,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -1985,7 +1985,7 @@
sarl $3, %eax
je .L112
-.L111:
+.L111:
mulpd %xmm2, %xmm0
mulpd 2 * SIZE(AA), %xmm2
addpd %xmm0, %xmm4
@@ -2252,7 +2252,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L159:
#ifdef LN
diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S
index 03f8e3d..e98854f 100644
--- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -100,7 +100,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -165,7 +165,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -193,7 +193,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
@@ -499,7 +499,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movaps -32 * SIZE(AA), %xmm0
@@ -880,7 +880,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
leal (CO1, LDC, 2), %eax
@@ -1451,7 +1451,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1479,7 +1479,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
@@ -1711,7 +1711,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm3, %xmm3
@@ -1978,7 +1978,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -2382,7 +2382,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2410,7 +2410,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
@@ -2575,7 +2575,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm3, %xmm3
@@ -2806,7 +2806,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S
index 5259e11..95bfb8e 100644
--- a/kernel/x86/trsm_kernel_LN_4x4_sse.S
+++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -268,7 +268,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -286,7 +286,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -302,7 +302,7 @@
sall $2 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -318,7 +318,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -373,7 +373,7 @@
addl $4 * SIZE, B
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movl A, AA
@@ -413,7 +413,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -803,7 +803,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef movsd
xorps %xmm0, %xmm0
@@ -1257,7 +1257,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -1693,7 +1693,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1709,7 +1709,7 @@
sall $1 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1725,7 +1725,7 @@
sarl $2, %eax
jle .L45
ALIGN_4
-
+
.L42:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -1784,7 +1784,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1824,7 +1824,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2080,7 +2080,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2402,7 +2402,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2802,7 +2802,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -2818,7 +2818,7 @@
sall $BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2834,7 +2834,7 @@
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -2890,7 +2890,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movl A, AA
@@ -2928,7 +2928,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -3118,7 +3118,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -3363,7 +3363,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
diff --git a/kernel/x86/trsm_kernel_LN_8x2_sse.S b/kernel/x86/trsm_kernel_LN_8x2_sse.S
index 16a2c2f..12b09e1 100644
--- a/kernel/x86/trsm_kernel_LN_8x2_sse.S
+++ b/kernel/x86/trsm_kernel_LN_8x2_sse.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -153,7 +153,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -173,13 +173,13 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -195,7 +195,7 @@
sall $1 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -211,7 +211,7 @@
sarl $2, %eax
jle .L03
ALIGN_4
-
+
.L02:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -337,7 +337,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -576,7 +576,7 @@
sall $BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L30:
testl $2, M
@@ -601,7 +601,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -801,7 +801,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
movss 1 * SIZE(B), %xmm6
@@ -813,14 +813,14 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
#endif
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
movss 2 * SIZE(B), %xmm6
@@ -833,7 +833,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
#endif
@@ -911,7 +911,7 @@
sall $1 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L50:
testl $4, M
@@ -936,7 +936,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1186,7 +1186,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
movss 1 * SIZE(B), %xmm6
@@ -1198,14 +1198,14 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
#endif
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
movss 2 * SIZE(B), %xmm6
@@ -1218,7 +1218,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
#endif
@@ -1323,7 +1323,7 @@
sall $2 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L70:
movl M, %ebx
@@ -1351,7 +1351,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1874,7 +1874,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
@@ -1890,7 +1890,7 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
mulps %xmm6, %xmm3
#endif
@@ -1898,7 +1898,7 @@
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
mulps %xmm6, %xmm3
@@ -1914,7 +1914,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
#endif
@@ -2079,7 +2079,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L99:
#ifdef LN
@@ -2110,12 +2110,12 @@
.L100:
testl $1, N
jle .L999
-
+
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -2131,7 +2131,7 @@
sall $BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2147,7 +2147,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -2262,7 +2262,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -2447,7 +2447,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -2667,7 +2667,7 @@
sall $1 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L150:
testl $4, M
@@ -2692,7 +2692,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -2996,7 +2996,7 @@
sall $2 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L170:
movl M, %ebx
@@ -3024,7 +3024,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -3462,7 +3462,7 @@
#if defined(RN) || defined(RT)
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
#endif
@@ -3470,7 +3470,7 @@
#if defined(LN) || defined(LT)
shufps $0x88, %xmm3, %xmm2
shufps $0x88, %xmm7, %xmm5
-
+
movlps %xmm2, 0 * SIZE(B)
movhps %xmm2, 2 * SIZE(B)
movlps %xmm5, 4 * SIZE(B)
@@ -3570,7 +3570,7 @@
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L179:
#ifdef LN
diff --git a/kernel/x86/trsm_kernel_LT_1x4.S b/kernel/x86/trsm_kernel_LT_1x4.S
index 5670746..5210f85 100644
--- a/kernel/x86/trsm_kernel_LT_1x4.S
+++ b/kernel/x86/trsm_kernel_LT_1x4.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 32
-
+
#define J 0 + STACK(%esp)
#define I 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
@@ -111,7 +111,7 @@
movl OFFSET, %eax
negl %eax
movl %eax, KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -167,7 +167,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -186,7 +186,7 @@
jle .L13
ALIGN_4
-.L12:
+.L12:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -220,7 +220,7 @@
leal (B_ORIG, %eax, 4), B
#else
movl B_ORIG, B
-#endif
+#endif
leal (%edi, LDC, 2), %eax
@@ -679,7 +679,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -698,7 +698,7 @@
jle .L23
ALIGN_4
-.L22:
+.L22:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -728,7 +728,7 @@
leal (B_ORIG, %eax, 2), B
#else
movl B_ORIG, B
-#endif
+#endif
fldz
fldz
@@ -1022,7 +1022,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1041,7 +1041,7 @@
jle .L33
ALIGN_4
-.L32:
+.L32:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -1071,7 +1071,7 @@
leal (B_ORIG, %eax, 1), B
#else
movl B_ORIG, B
-#endif
+#endif
fldz
fldz
diff --git a/kernel/x86/trsm_kernel_LT_2x2.S b/kernel/x86/trsm_kernel_LT_2x2.S
index d21909d..ff29a3b 100644
--- a/kernel/x86/trsm_kernel_LT_2x2.S
+++ b/kernel/x86/trsm_kernel_LT_2x2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -72,7 +72,7 @@
#else
#define REP rep
#endif
-
+
#define AA %edx
#define BB %ecx
@@ -114,7 +114,7 @@
movl OFFSET, %eax
negl %eax
movl %eax, KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -123,7 +123,7 @@
#endif
movl N, %eax # j = (n >> 1) # MEMORY
- sarl $1, %eax
+ sarl $1, %eax
movl %eax, J # j = (n >> 1) # MEMORY
je .L8
ALIGN_4
@@ -155,7 +155,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -182,7 +182,7 @@
leal (%ebx, %eax, 2), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
@@ -535,7 +535,7 @@
leal (%ebx, %eax, 2), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
@@ -733,7 +733,7 @@
movl N, %eax # n # MEMORY
andl $1, %eax
je .End
-
+
#if defined(LT) || defined(RN)
movl A, AA
#else
@@ -759,7 +759,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -770,7 +770,7 @@
sarl $1, %esi # m >> 1
je .L36
ALIGN_4
-
+
.L46:
#ifdef LN
movl K, %eax
@@ -786,7 +786,7 @@
leal (%ebx, %eax, 1), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
@@ -981,7 +981,7 @@
leal (%ebx, %eax, 1), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
diff --git a/kernel/x86/trsm_kernel_LT_2x2_atom.S b/kernel/x86/trsm_kernel_LT_2x2_atom.S
index 3835005..139e412 100644
--- a/kernel/x86/trsm_kernel_LT_2x2_atom.S
+++ b/kernel/x86/trsm_kernel_LT_2x2_atom.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -83,7 +83,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -146,7 +146,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -178,7 +178,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -478,7 +478,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -724,7 +724,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -756,7 +756,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(BB), %xmm1
xorps %xmm0, %xmm0
@@ -971,7 +971,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S
index 65a6cf0..086852c 100644
--- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -95,7 +95,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -161,7 +161,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -193,7 +193,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
leal (CO1, LDC, 2), %eax
@@ -647,7 +647,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1005,7 +1005,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1037,7 +1037,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1344,7 +1344,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1608,7 +1608,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1640,7 +1640,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1881,7 +1881,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S
index ba03221..01ff86c 100644
--- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S
+++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
@@ -256,7 +256,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -275,7 +275,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -291,7 +291,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 4), B
leal (BB, %eax, 8), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -307,7 +307,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
#define COPYPREFETCH 40
@@ -373,7 +373,7 @@
addl $4 * SIZE, B
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movl A, AA
@@ -417,7 +417,7 @@
movl KK, %eax
sall $3 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -454,7 +454,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -885,7 +885,7 @@
movl KK, %eax
sall $3 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1266,7 +1266,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -1282,7 +1282,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1298,7 +1298,7 @@
sarl $2, %eax
jle .L35
ALIGN_4
-
+
.L32:
#define COPYPREFETCH 40
@@ -1363,7 +1363,7 @@
decl %eax
jne .L36
ALIGN_4
-
+
.L40:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1407,7 +1407,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1721,7 +1721,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1987,7 +1987,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -2003,7 +2003,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2019,7 +2019,7 @@
sarl $3, %eax
jle .L65
ALIGN_4
-
+
.L62:
#define COPYPREFETCH 40
@@ -2081,7 +2081,7 @@
decl %eax
jne .L66
ALIGN_4
-
+
.L70:
#if defined(LT) || defined(RN)
movl A, AA
@@ -2123,7 +2123,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -2369,7 +2369,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -2573,7 +2573,7 @@
.L999:
movl OLD_STACK, %esp
EMMS
-
+
popl %ebx
popl %esi
popl %edi
diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse3.S b/kernel/x86/trsm_kernel_LT_2x4_sse3.S
index 487f059..b276166 100644
--- a/kernel/x86/trsm_kernel_LT_2x4_sse3.S
+++ b/kernel/x86/trsm_kernel_LT_2x4_sse3.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -95,7 +95,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -158,7 +158,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -190,7 +190,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -607,7 +607,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -982,7 +982,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1014,7 +1014,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1298,7 +1298,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1565,7 +1565,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1597,7 +1597,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1823,7 +1823,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
movhpd 1 * SIZE(AA), %xmm0
diff --git a/kernel/x86/trsm_kernel_LT_4x2_core2.S b/kernel/x86/trsm_kernel_LT_4x2_core2.S
index dba627f..1c08745 100644
--- a/kernel/x86/trsm_kernel_LT_4x2_core2.S
+++ b/kernel/x86/trsm_kernel_LT_4x2_core2.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -55,7 +55,7 @@
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)
-
+
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
@@ -141,7 +141,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -154,14 +154,14 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal 16 * SIZE + BUFFER, BB
@@ -177,7 +177,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -193,7 +193,7 @@
sarl $2, %eax
jle .L03
ALIGN_2
-
+
.L02:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -243,7 +243,7 @@
decl %eax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movl A, AA
@@ -287,7 +287,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -703,7 +703,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L30:
movl M, %ebx
@@ -729,7 +729,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -749,7 +749,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
mulpd %xmm0, %xmm1
mulpd -14 * SIZE(BB), %xmm0
addpd %xmm1, %xmm4
@@ -999,7 +999,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1019,7 +1019,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
mulsd %xmm0, %xmm1
mulsd -14 * SIZE(BB), %xmm0
addsd %xmm1, %xmm4
@@ -1238,13 +1238,13 @@
testl $1, %eax
jle .L999
ALIGN_2
-
+
.L101:
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal 16 * SIZE + BUFFER, BB
@@ -1260,7 +1260,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1276,7 +1276,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -1324,7 +1324,7 @@
decl %eax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1366,7 +1366,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1386,7 +1386,7 @@
sarl $3, %eax
je .L112
-.L111:
+.L111:
mulpd %xmm1, %xmm0
mulpd -14 * SIZE(AA), %xmm1
addpd %xmm0, %xmm4
@@ -1652,7 +1652,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
movl M, %ebx
@@ -1678,7 +1678,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1696,7 +1696,7 @@
sarl $3, %eax
je .L132
-.L131:
+.L131:
mulpd %xmm0, %xmm1
movapd -14 * SIZE(AA), %xmm0
addpd %xmm1, %xmm4
@@ -1842,7 +1842,7 @@
movddup %xmm0, %xmm1
unpckhpd %xmm0, %xmm0
-
+
movapd %xmm1, -16 * SIZE(BB)
movapd %xmm0, -14 * SIZE(BB)
#else
@@ -1900,7 +1900,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1918,7 +1918,7 @@
sarl $3, %eax
je .L152
-.L151:
+.L151:
mulsd %xmm0, %xmm1
movsd -15 * SIZE(AA), %xmm0
addsd %xmm1, %xmm4
diff --git a/kernel/x86/trsm_kernel_LT_4x2_sse2.S b/kernel/x86/trsm_kernel_LT_4x2_sse2.S
index 626d75a..dd21b3e 100644
--- a/kernel/x86/trsm_kernel_LT_4x2_sse2.S
+++ b/kernel/x86/trsm_kernel_LT_4x2_sse2.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -55,7 +55,7 @@
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)
-
+
#define ALPHA 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
@@ -256,7 +256,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -269,14 +269,14 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -292,7 +292,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -308,7 +308,7 @@
sarl $2, %eax
jle .L03
ALIGN_2
-
+
.L02:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -372,7 +372,7 @@
decl %eax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movl A, AA
@@ -416,7 +416,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -442,7 +442,7 @@
NOBRANCH
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -535,7 +535,7 @@
subl $64 * 8, %eax
BRANCH
jg .L1X
-
+
.L11:
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB
@@ -544,7 +544,7 @@
sarl $3, %eax
je .L12
-.L11:
+.L11:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -589,7 +589,7 @@
addl $4 * SIZE, BB # boffset1 += 8
subl $1, %eax
jg .L13
- ALIGN_4
+ ALIGN_4
.L14:
#if defined(LN) || defined(RT)
@@ -854,7 +854,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L30:
movl M, %ebx
@@ -880,7 +880,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -900,7 +900,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
mulpd %xmm0, %xmm2
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm2, %xmm4
@@ -1168,7 +1168,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -1188,7 +1188,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
mulsd %xmm0, %xmm2
mulsd 2 * SIZE(BB), %xmm0
addsd %xmm2, %xmm4
@@ -1408,14 +1408,14 @@
testl $1, %eax
jle .L999
ALIGN_2
-
+
.L101:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1431,7 +1431,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1447,7 +1447,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -1509,7 +1509,7 @@
decl %eax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1551,7 +1551,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -1571,7 +1571,7 @@
sarl $3, %eax
je .L112
-.L111:
+.L111:
mulpd %xmm2, %xmm0
mulpd 2 * SIZE(AA), %xmm2
addpd %xmm0, %xmm4
@@ -1838,7 +1838,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
movl M, %ebx
@@ -1873,7 +1873,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1884,7 +1884,7 @@
sarl $3, %eax
je .L132
-.L131:
+.L131:
mulpd %xmm0, %xmm2
movapd 2 * SIZE(AA), %xmm0
addpd %xmm2, %xmm4
@@ -2093,7 +2093,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -2104,7 +2104,7 @@
sarl $3, %eax
je .L152
-.L151:
+.L151:
mulsd %xmm0, %xmm2
movsd 1 * SIZE(AA), %xmm0
addsd %xmm2, %xmm4
diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S
index d27880b..2dd8ad0 100644
--- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S
+++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -100,7 +100,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -165,7 +165,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -197,7 +197,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
leal (CO1, LDC, 2), %eax
@@ -737,7 +737,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movaps -32 * SIZE(AA), %xmm0
@@ -1114,7 +1114,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
@@ -1451,7 +1451,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1483,7 +1483,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1861,7 +1861,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm3, %xmm3
@@ -2124,7 +2124,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
@@ -2382,7 +2382,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2414,7 +2414,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -2728,7 +2728,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm3, %xmm3
@@ -2955,7 +2955,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S
index 4f7f330..d54dcf2 100644
--- a/kernel/x86/trsm_kernel_LT_4x4_sse.S
+++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -268,7 +268,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -286,7 +286,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -302,7 +302,7 @@
sall $2 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -318,7 +318,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -373,7 +373,7 @@
addl $4 * SIZE, B
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movl A, AA
@@ -417,7 +417,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -842,7 +842,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef movsd
xorps %xmm0, %xmm0
@@ -1292,7 +1292,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -1693,7 +1693,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1709,7 +1709,7 @@
sall $1 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1725,7 +1725,7 @@
sarl $2, %eax
jle .L45
ALIGN_4
-
+
.L42:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -1784,7 +1784,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1828,7 +1828,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2220,7 +2220,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2538,7 +2538,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2801,7 +2801,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -2817,7 +2817,7 @@
sall $BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2833,7 +2833,7 @@
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -2889,7 +2889,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movl A, AA
@@ -2931,7 +2931,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -3250,7 +3250,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -3491,7 +3491,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
diff --git a/kernel/x86/trsm_kernel_LT_8x2_sse.S b/kernel/x86/trsm_kernel_LT_8x2_sse.S
index 5d59698..b184f78 100644
--- a/kernel/x86/trsm_kernel_LT_8x2_sse.S
+++ b/kernel/x86/trsm_kernel_LT_8x2_sse.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -153,7 +153,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -173,13 +173,13 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -195,7 +195,7 @@
sall $1 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -211,7 +211,7 @@
sarl $2, %eax
jle .L03
ALIGN_4
-
+
.L02:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -340,7 +340,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -863,7 +863,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
@@ -879,7 +879,7 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
mulps %xmm6, %xmm3
#endif
@@ -887,7 +887,7 @@
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
mulps %xmm6, %xmm3
@@ -903,7 +903,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
#endif
@@ -1068,7 +1068,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L30:
testl $4, M
@@ -1093,7 +1093,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1343,7 +1343,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
movss 1 * SIZE(B), %xmm6
@@ -1355,14 +1355,14 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
#endif
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
movss 2 * SIZE(B), %xmm6
@@ -1375,7 +1375,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
#endif
@@ -1480,7 +1480,7 @@
sall $2 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L50:
testl $2, M
@@ -1505,7 +1505,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1705,7 +1705,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
movss 1 * SIZE(B), %xmm6
@@ -1717,14 +1717,14 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
#endif
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
movss 2 * SIZE(B), %xmm6
@@ -1737,7 +1737,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
#endif
@@ -1815,7 +1815,7 @@
sall $1 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L70:
testl $1, M
@@ -1840,7 +1840,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -2079,7 +2079,7 @@
sall $BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L99:
#ifdef LN
@@ -2110,12 +2110,12 @@
.L100:
testl $1, N
jle .L999
-
+
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -2131,7 +2131,7 @@
sall $BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2147,7 +2147,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -2266,7 +2266,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -2704,7 +2704,7 @@
#if defined(RN) || defined(RT)
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
#endif
@@ -2712,7 +2712,7 @@
#if defined(LN) || defined(LT)
shufps $0x88, %xmm3, %xmm2
shufps $0x88, %xmm7, %xmm5
-
+
movlps %xmm2, 0 * SIZE(B)
movhps %xmm2, 2 * SIZE(B)
movlps %xmm5, 4 * SIZE(B)
@@ -2812,7 +2812,7 @@
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
testl $4, M
@@ -2837,7 +2837,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -3141,7 +3141,7 @@
sall $2 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L150:
testl $2, M
@@ -3166,7 +3166,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -3386,7 +3386,7 @@
sall $1 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L170:
testl $1, M
@@ -3410,7 +3410,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
diff --git a/kernel/x86/trsm_kernel_RT_1x4.S b/kernel/x86/trsm_kernel_RT_1x4.S
index b7f17e2..09cb00c 100644
--- a/kernel/x86/trsm_kernel_RT_1x4.S
+++ b/kernel/x86/trsm_kernel_RT_1x4.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 32
-
+
#define J 0 + STACK(%esp)
#define I 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
@@ -111,7 +111,7 @@
movl OFFSET, %eax
negl %eax
movl %eax, KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -165,7 +165,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -184,7 +184,7 @@
jle .L33
ALIGN_4
-.L32:
+.L32:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -214,7 +214,7 @@
leal (B_ORIG, %eax, 1), B
#else
movl B_ORIG, B
-#endif
+#endif
fldz
fldz
@@ -414,7 +414,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -433,7 +433,7 @@
jle .L23
ALIGN_4
-.L22:
+.L22:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -463,7 +463,7 @@
leal (B_ORIG, %eax, 2), B
#else
movl B_ORIG, B
-#endif
+#endif
fldz
fldz
@@ -759,7 +759,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -778,7 +778,7 @@
jle .L13
ALIGN_4
-.L12:
+.L12:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -812,7 +812,7 @@
leal (B_ORIG, %eax, 4), B
#else
movl B_ORIG, B
-#endif
+#endif
leal (%edi, LDC, 2), %eax
diff --git a/kernel/x86/trsm_kernel_RT_2x2.S b/kernel/x86/trsm_kernel_RT_2x2.S
index 8603446..8288d83 100644
--- a/kernel/x86/trsm_kernel_RT_2x2.S
+++ b/kernel/x86/trsm_kernel_RT_2x2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -72,7 +72,7 @@
#else
#define REP rep
#endif
-
+
#define AA %edx
#define BB %ecx
@@ -112,7 +112,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -123,7 +123,7 @@
movl N, %eax # n # MEMORY
andl $1, %eax
je .L8
-
+
#if defined(LT) || defined(RN)
movl A, AA
#else
@@ -149,7 +149,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -160,7 +160,7 @@
sarl $1, %esi # m >> 1
je .L36
ALIGN_4
-
+
.L46:
#ifdef LN
movl K, %eax
@@ -176,7 +176,7 @@
leal (%ebx, %eax, 1), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
@@ -371,7 +371,7 @@
leal (%ebx, %eax, 1), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
@@ -485,7 +485,7 @@
.L8:
movl N, %eax # j = (n >> 1) # MEMORY
- sarl $1, %eax
+ sarl $1, %eax
movl %eax, J # j = (n >> 1) # MEMORY
je .End
ALIGN_4
@@ -517,7 +517,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -544,7 +544,7 @@
leal (%ebx, %eax, 2), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
@@ -897,7 +897,7 @@
leal (%ebx, %eax, 2), BB
#else
movl %ebx, BB
-#endif
+#endif
fldz
fldz
diff --git a/kernel/x86/trsm_kernel_RT_2x2_atom.S b/kernel/x86/trsm_kernel_RT_2x2_atom.S
index 97af198..b3eaf56 100644
--- a/kernel/x86/trsm_kernel_RT_2x2_atom.S
+++ b/kernel/x86/trsm_kernel_RT_2x2_atom.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -83,7 +83,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -140,7 +140,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -172,7 +172,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(BB), %xmm1
xorps %xmm0, %xmm0
@@ -387,7 +387,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -584,7 +584,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -616,7 +616,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -916,7 +916,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S
index ff8231e..154276f 100644
--- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S
+++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -95,7 +95,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -155,7 +155,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -187,7 +187,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
movhps -15 * SIZE(AA), %xmm0
@@ -430,7 +430,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
movhps -15 * SIZE(AA), %xmm0
@@ -610,7 +610,7 @@
#endif
ALIGN_4
-.L30:
+.L30:
testl $2, N
je .L60
@@ -641,7 +641,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -673,7 +673,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -980,7 +980,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1250,7 +1250,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1282,7 +1282,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
leal (CO1, LDC, 2), %eax
@@ -1736,7 +1736,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S
index b6d9ca4..c43a0f1 100644
--- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S
+++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
@@ -256,7 +256,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -271,7 +271,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -287,7 +287,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -303,7 +303,7 @@
sarl $3, %eax
jle .L65
ALIGN_4
-
+
.L62:
#define COPYPREFETCH 40
@@ -365,7 +365,7 @@
decl %eax
jne .L66
ALIGN_4
-
+
.L70:
#if defined(LT) || defined(RN)
movl A, AA
@@ -407,7 +407,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -653,7 +653,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -863,7 +863,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -879,7 +879,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -895,7 +895,7 @@
sarl $2, %eax
jle .L35
ALIGN_4
-
+
.L32:
#define COPYPREFETCH 40
@@ -960,7 +960,7 @@
decl %eax
jne .L36
ALIGN_4
-
+
.L40:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1004,7 +1004,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1318,7 +1318,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1588,7 +1588,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -1604,7 +1604,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 4), B
leal (BB, %eax, 8), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1620,7 +1620,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
#define COPYPREFETCH 40
@@ -1686,7 +1686,7 @@
addl $4 * SIZE, B
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1730,7 +1730,7 @@
movl KK, %eax
sall $3 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1767,7 +1767,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -2199,7 +2199,7 @@
movl KK, %eax
sall $3 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -2576,7 +2576,7 @@
.L999:
movl OLD_STACK, %esp
EMMS
-
+
popl %ebx
popl %esi
popl %edi
diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse3.S b/kernel/x86/trsm_kernel_RT_2x4_sse3.S
index 6be1d86..792c327 100644
--- a/kernel/x86/trsm_kernel_RT_2x4_sse3.S
+++ b/kernel/x86/trsm_kernel_RT_2x4_sse3.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -95,7 +95,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -152,7 +152,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -184,7 +184,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -410,7 +410,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
movhpd 1 * SIZE(AA), %xmm0
@@ -636,7 +636,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -668,7 +668,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -952,7 +952,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1225,7 +1225,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1257,7 +1257,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1674,7 +1674,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
diff --git a/kernel/x86/trsm_kernel_RT_4x2_core2.S b/kernel/x86/trsm_kernel_RT_4x2_core2.S
index 866eddf..781876b 100644
--- a/kernel/x86/trsm_kernel_RT_4x2_core2.S
+++ b/kernel/x86/trsm_kernel_RT_4x2_core2.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -55,7 +55,7 @@
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)
-
+
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
@@ -141,7 +141,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -153,13 +153,13 @@
testl $1, %eax
jle .L100
ALIGN_2
-
+
.L101:
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal 16 * SIZE + BUFFER, BB
@@ -175,7 +175,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -191,7 +191,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -239,7 +239,7 @@
decl %eax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movl A, AA
@@ -281,7 +281,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -301,7 +301,7 @@
sarl $3, %eax
je .L112
-.L111:
+.L111:
mulpd %xmm1, %xmm0
mulpd -14 * SIZE(AA), %xmm1
addpd %xmm0, %xmm4
@@ -567,7 +567,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
movl M, %ebx
@@ -593,7 +593,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -611,7 +611,7 @@
sarl $3, %eax
je .L132
-.L131:
+.L131:
mulpd %xmm0, %xmm1
movapd -14 * SIZE(AA), %xmm0
addpd %xmm1, %xmm4
@@ -757,7 +757,7 @@
movddup %xmm0, %xmm1
unpckhpd %xmm0, %xmm0
-
+
movapd %xmm1, -16 * SIZE(BB)
movapd %xmm0, -14 * SIZE(BB)
#else
@@ -815,7 +815,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -833,7 +833,7 @@
sarl $3, %eax
je .L152
-.L151:
+.L151:
mulsd %xmm0, %xmm1
movsd -15 * SIZE(AA), %xmm0
addsd %xmm1, %xmm4
@@ -1005,14 +1005,14 @@
movl %eax, J
jle .L999
ALIGN_2
-
+
.L01:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal 16 * SIZE + BUFFER, BB
@@ -1028,7 +1028,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1044,7 +1044,7 @@
sarl $2, %eax
jle .L03
ALIGN_2
-
+
.L02:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -1094,7 +1094,7 @@
decl %eax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1138,7 +1138,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1554,7 +1554,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L30:
movl M, %ebx
@@ -1580,7 +1580,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1600,7 +1600,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
mulpd %xmm0, %xmm1
mulpd -14 * SIZE(BB), %xmm0
addpd %xmm1, %xmm4
@@ -1850,7 +1850,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1870,7 +1870,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
mulsd %xmm0, %xmm1
mulsd -14 * SIZE(BB), %xmm0
addsd %xmm1, %xmm4
diff --git a/kernel/x86/trsm_kernel_RT_4x2_sse2.S b/kernel/x86/trsm_kernel_RT_4x2_sse2.S
index 68b52ba..6c3b342 100644
--- a/kernel/x86/trsm_kernel_RT_4x2_sse2.S
+++ b/kernel/x86/trsm_kernel_RT_4x2_sse2.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -55,7 +55,7 @@
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)
-
+
#define ALPHA 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
@@ -216,7 +216,7 @@
addl $STACK_OFFSET, %esp
STACK_TOUCHING
-
+
movd STACK_M, %mm0
movl STACK_N, %eax
movd STACK_K, %mm1
@@ -257,7 +257,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -269,14 +269,14 @@
testl $1, %eax
jle .L100
ALIGN_2
-
+
.L101:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -292,7 +292,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 1), B
leal (BB, %eax, 2), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -308,7 +308,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -370,7 +370,7 @@
decl %eax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movl A, AA
@@ -412,7 +412,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -432,7 +432,7 @@
sarl $3, %eax
je .L112
-.L111:
+.L111:
mulpd %xmm2, %xmm0
mulpd 2 * SIZE(AA), %xmm2
addpd %xmm0, %xmm4
@@ -699,7 +699,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
movl M, %ebx
@@ -734,7 +734,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -745,7 +745,7 @@
sarl $3, %eax
je .L132
-.L131:
+.L131:
mulpd %xmm0, %xmm2
movapd 2 * SIZE(AA), %xmm0
addpd %xmm2, %xmm4
@@ -954,7 +954,7 @@
movl KK, %eax
sall $0 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -965,7 +965,7 @@
sarl $3, %eax
je .L152
-.L151:
+.L151:
mulsd %xmm0, %xmm2
movsd 1 * SIZE(AA), %xmm0
addsd %xmm2, %xmm4
@@ -1131,14 +1131,14 @@
movl %eax, J
jle .L999
ALIGN_2
-
+
.L01:
/* Copying to Sub Buffer */
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1154,7 +1154,7 @@
leal (, %eax, SIZE), %eax
leal (B, %eax, 2), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1170,7 +1170,7 @@
sarl $2, %eax
jle .L03
ALIGN_2
-
+
.L02:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -1234,7 +1234,7 @@
decl %eax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1278,7 +1278,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -1304,7 +1304,7 @@
NOBRANCH
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -1406,7 +1406,7 @@
sarl $3, %eax
je .L12
-.L11:
+.L11:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -1452,7 +1452,7 @@
addl $4 * SIZE, BB # boffset1 += 8
subl $1, %eax
jg .L13
- ALIGN_4
+ ALIGN_4
.L14:
#if defined(LN) || defined(RT)
@@ -1717,7 +1717,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L30:
movl M, %ebx
@@ -1743,7 +1743,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -1763,7 +1763,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
mulpd %xmm0, %xmm2
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm2, %xmm4
@@ -2031,7 +2031,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -2051,7 +2051,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
mulsd %xmm0, %xmm2
mulsd 2 * SIZE(BB), %xmm0
addsd %xmm2, %xmm4
diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S
index 8578665..acdcd6e 100644
--- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S
+++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -100,7 +100,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
leal (, LDC, SIZE), LDC
@@ -160,7 +160,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -192,7 +192,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -506,7 +506,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm3, %xmm3
@@ -733,7 +733,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
@@ -925,7 +925,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -957,7 +957,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1335,7 +1335,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm3, %xmm3
@@ -1598,7 +1598,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
@@ -1861,7 +1861,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1893,7 +1893,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
leal (CO1, LDC, 2), %eax
@@ -2433,7 +2433,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movaps -32 * SIZE(AA), %xmm0
@@ -2810,7 +2810,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
pxor %xmm4, %xmm4
movsd -32 * SIZE(AA), %xmm0
diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S
index 40afac5..743516e 100644
--- a/kernel/x86/trsm_kernel_RT_4x4_sse.S
+++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -268,7 +268,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -283,7 +283,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -299,7 +299,7 @@
sall $BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -315,7 +315,7 @@
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -371,7 +371,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movl A, AA
@@ -413,7 +413,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -728,7 +728,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -969,7 +969,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1165,7 +1165,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1181,7 +1181,7 @@
sall $1 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1197,7 +1197,7 @@
sarl $2, %eax
jle .L45
ALIGN_4
-
+
.L42:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -1253,7 +1253,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movl A, AA
@@ -1297,7 +1297,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1689,7 +1689,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2007,7 +2007,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -2273,7 +2273,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -2289,7 +2289,7 @@
sall $2 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -2305,7 +2305,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -2360,7 +2360,7 @@
addl $4 * SIZE, B
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movl A, AA
@@ -2404,7 +2404,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -2829,7 +2829,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef movsd
xorps %xmm0, %xmm0
@@ -3279,7 +3279,7 @@
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
diff --git a/kernel/x86/trsm_kernel_RT_8x2_sse.S b/kernel/x86/trsm_kernel_RT_8x2_sse.S
index 6bc1d21..cea034e 100644
--- a/kernel/x86/trsm_kernel_RT_8x2_sse.S
+++ b/kernel/x86/trsm_kernel_RT_8x2_sse.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -153,7 +153,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -170,12 +170,12 @@
testl $1, N
jle .L100
-
+
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -191,7 +191,7 @@
sall $BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -207,7 +207,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -326,7 +326,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -764,7 +764,7 @@
#if defined(RN) || defined(RT)
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
#endif
@@ -772,7 +772,7 @@
#if defined(LN) || defined(LT)
shufps $0x88, %xmm3, %xmm2
shufps $0x88, %xmm7, %xmm5
-
+
movlps %xmm2, 0 * SIZE(B)
movhps %xmm2, 2 * SIZE(B)
movlps %xmm5, 4 * SIZE(B)
@@ -872,7 +872,7 @@
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
testl $4, M
@@ -897,7 +897,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1201,7 +1201,7 @@
sall $2 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L150:
testl $2, M
@@ -1226,7 +1226,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1446,7 +1446,7 @@
sall $1 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L170:
testl $1, M
@@ -1470,7 +1470,7 @@
movl KK, %eax
sall $BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1658,13 +1658,13 @@
movl %eax, J
jle .L999
ALIGN_2
-
+
.L01:
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -1680,7 +1680,7 @@
sall $1 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1696,7 +1696,7 @@
sarl $2, %eax
jle .L03
ALIGN_4
-
+
.L02:
movsd 0 * SIZE(B), %xmm3
movhps 2 * SIZE(B), %xmm3
@@ -1825,7 +1825,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -2348,7 +2348,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
@@ -2364,7 +2364,7 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
mulps %xmm6, %xmm3
#endif
@@ -2372,7 +2372,7 @@
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
mulps %xmm6, %xmm3
@@ -2388,7 +2388,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
mulps %xmm6, %xmm1
#endif
@@ -2553,7 +2553,7 @@
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L30:
testl $4, M
@@ -2578,7 +2578,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -2828,7 +2828,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
movss 1 * SIZE(B), %xmm6
@@ -2840,14 +2840,14 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
#endif
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
movss 2 * SIZE(B), %xmm6
@@ -2860,7 +2860,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
#endif
@@ -2965,7 +2965,7 @@
sall $2 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L50:
testl $2, M
@@ -2990,7 +2990,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -3190,7 +3190,7 @@
#ifdef RN
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
movss 1 * SIZE(B), %xmm6
@@ -3202,14 +3202,14 @@
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
#endif
#ifdef RT
movss 3 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm2
movss 2 * SIZE(B), %xmm6
@@ -3222,7 +3222,7 @@
movss 0 * SIZE(B), %xmm6
shufps $0x00, %xmm6, %xmm6
-
+
mulps %xmm6, %xmm0
#endif
@@ -3300,7 +3300,7 @@
sall $1 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L70:
testl $1, M
@@ -3325,7 +3325,7 @@
movl KK, %eax
sall $1 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -3567,7 +3567,7 @@
sall $BASE_SHIFT, %eax
addl %eax, AORIG
#endif
- ALIGN_2
+ ALIGN_2
.L99:
#ifdef LN
diff --git a/kernel/x86/xaxpy.S b/kernel/x86/xaxpy.S
index 554aa0c..99eadab 100644
--- a/kernel/x86/xaxpy.S
+++ b/kernel/x86/xaxpy.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esp)
#define STACK_ALPHA_I 32 + STACK + ARGS(%esp)
@@ -87,7 +87,7 @@
sall $ZBASE_SHIFT, INCX
sall $ZBASE_SHIFT, INCY
-
+
testl M, M
jle .L40
diff --git a/kernel/x86/xdot.S b/kernel/x86/xdot.S
index 9297632..9f77206 100644
--- a/kernel/x86/xdot.S
+++ b/kernel/x86/xdot.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#if defined(F_INTERFACE) && defined(RETURN_BY_STACK)
#define RESULT 4 + STACK + ARGS(%esp)
#define STACK_N 8 + STACK + ARGS(%esp)
diff --git a/kernel/x86/xgemm3m_kernel_2x2.S b/kernel/x86/xgemm3m_kernel_2x2.S
index b844875..c53825d 100644
--- a/kernel/x86/xgemm3m_kernel_2x2.S
+++ b/kernel/x86/xgemm3m_kernel_2x2.S
@@ -50,7 +50,7 @@
#define PREFETCHSIZE (5 + 4 * 10)
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -90,13 +90,13 @@
negl %eax
movl %eax, KK
#endif
-
+
movl ARG_LDC, LDC
movl ARG_B, B
addl $8 * SIZE, A
addl $8 * SIZE, B
-
+
sall $ZBASE_SHIFT, LDC
movl N, %eax
@@ -109,7 +109,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl A, AO
@@ -132,7 +132,7 @@
sall $BASE_SHIFT, %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 2), BO
-#endif
+#endif
fldz
fldz
@@ -152,7 +152,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -178,7 +178,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -196,7 +196,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -216,7 +216,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -234,7 +234,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -270,7 +270,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -359,7 +359,7 @@
sall $BASE_SHIFT, %eax
leal (AO, %eax, 1), AO
leal ( B, %eax, 2), BO
-#endif
+#endif
fldz
fldz
@@ -369,7 +369,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -502,13 +502,13 @@
.L30:
movl N, %eax
- testl $1, %eax
+ testl $1, %eax
je .L999
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl A, AO
@@ -530,7 +530,7 @@
sall $BASE_SHIFT, %eax
leal (AO, %eax, 2), AO
leal ( B, %eax, 1), BO
-#endif
+#endif
fldz
fldz
@@ -546,7 +546,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -683,7 +683,7 @@
sall $BASE_SHIFT, %eax
leal (AO, %eax, 1), AO
leal ( B, %eax, 1), BO
-#endif
+#endif
fldz
@@ -692,7 +692,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/xgemm_kernel_1x1.S b/kernel/x86/xgemm_kernel_1x1.S
index b401bd2..1e2c4a1 100644
--- a/kernel/x86/xgemm_kernel_1x1.S
+++ b/kernel/x86/xgemm_kernel_1x1.S
@@ -50,7 +50,7 @@
#define PREFETCHSIZE (5 + 4 * 10)
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -113,18 +113,18 @@
negl %eax
movl %eax, KK
#endif
-
+
movl ARG_LDC, LDC
movl ARG_B, B
addl $8 * SIZE, A
addl $8 * SIZE, B
-
+
sall $ZBASE_SHIFT, LDC
cmpl $0, M
jle .L999
-
+
movl N, %eax
movl %eax, J
testl %eax, %eax
@@ -135,7 +135,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl A, AO
@@ -155,7 +155,7 @@
sall $ZBASE_SHIFT, %eax
leal (AO, %eax, 1), AO
leal (B, %eax, 1), BO
-#endif
+#endif
fldz
fldz
@@ -173,7 +173,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -199,7 +199,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -217,7 +217,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -237,7 +237,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -255,7 +255,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -291,7 +291,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -336,7 +336,7 @@
FST 1 * SIZE(CO)
FST 0 * SIZE(CO)
#endif
-
+
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
diff --git a/kernel/x86/xgemv_n.S b/kernel/x86/xgemv_n.S
index 32447ba..1a96e83 100644
--- a/kernel/x86/xgemv_n.S
+++ b/kernel/x86/xgemv_n.S
@@ -53,7 +53,7 @@
#define STACK 16
#define ARGS 16
-
+
#define PLDA_M 0 + STACK(%esp)
#define XP 4 + STACK(%esp)
#define MIN_N 8 + STACK(%esp)
diff --git a/kernel/x86/xgemv_t.S b/kernel/x86/xgemv_t.S
index 1397a10..a9c8dbc 100644
--- a/kernel/x86/xgemv_t.S
+++ b/kernel/x86/xgemv_t.S
@@ -49,7 +49,7 @@
#define STACK 16
#define ARGS 24
-
+
#define NLDA 0 + STACK(%esp)
#define XP 4 + STACK(%esp)
#define MIN_M 8 + STACK(%esp)
diff --git a/kernel/x86/xtrsm_kernel_LT_1x1.S b/kernel/x86/xtrsm_kernel_LT_1x1.S
index e05266f..2dcad56 100644
--- a/kernel/x86/xtrsm_kernel_LT_1x1.S
+++ b/kernel/x86/xtrsm_kernel_LT_1x1.S
@@ -50,7 +50,7 @@
#define PREFETCHSIZE (5 + 4 * 10)
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -111,7 +111,7 @@
addl $8 * SIZE, A
addl $8 * SIZE, B
-
+
#ifdef LN
movl M, %eax
sall $ZBASE_SHIFT, %eax
@@ -135,7 +135,7 @@
movl OFFSET, %eax
negl %eax
movl %eax, KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -145,7 +145,7 @@
cmpl $0, M
jle .L999
-
+
movl N, %eax
movl %eax, J
testl %eax, %eax
@@ -178,7 +178,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -203,7 +203,7 @@
leal (B, %eax, 1), BO
#else
movl B, BO
-#endif
+#endif
fldz
fldz
@@ -238,7 +238,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -256,7 +256,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -276,7 +276,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -294,7 +294,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -331,7 +331,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -350,7 +350,7 @@
.L18:
faddp %st, %st(3)
faddp %st, %st(1)
-
+
fxch %st(1)
#if defined(LN) || defined(RT)
@@ -430,7 +430,7 @@
FST 0 * SIZE(CO)
FST 1 * SIZE(CO)
-
+
#ifndef LN
addl $2 * SIZE, CO
#endif
diff --git a/kernel/x86/zamax.S b/kernel/x86/zamax.S
index 3056c1e..8af8823 100644
--- a/kernel/x86/zamax.S
+++ b/kernel/x86/zamax.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -91,9 +91,9 @@
fstp %st(0)
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
addl INCX, X
decl M
@@ -106,43 +106,43 @@
sarl $2, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
fstp %st(1)
FLD 2 * SIZE(X)
- fabs
+ fabs
FLD 3 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
fstp %st(1)
FLD 4 * SIZE(X)
- fabs
+ fabs
FLD 5 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
fstp %st(1)
FLD 6 * SIZE(X)
- fabs
+ fabs
FLD 7 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -163,9 +163,9 @@
.L21:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
@@ -182,12 +182,12 @@
sarl $2, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addl INCX, X
faddp %st, %st(1)
fcomi %st(1), %st
@@ -196,9 +196,9 @@
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addl INCX, X
faddp %st, %st(1)
fcomi %st(1), %st
@@ -207,9 +207,9 @@
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addl INCX, X
faddp %st, %st(1)
fcomi %st(1), %st
@@ -218,9 +218,9 @@
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addl INCX, X
faddp %st, %st(1)
fcomi %st(1), %st
@@ -240,9 +240,9 @@
.L61:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi %st(1), %st
FMOV %st(1), %st(0)
diff --git a/kernel/x86/zamax_sse.S b/kernel/x86/zamax_sse.S
index 60dd25b..49e1c9c 100644
--- a/kernel/x86/zamax_sse.S
+++ b/kernel/x86/zamax_sse.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -54,12 +54,12 @@
#define MM %ebp
#define XX %edi
#define TEMP %ebx
-
+
#ifdef USE_MIN
#define maxps minps
#define maxss minss
#endif
-
+
#ifndef HAVE_SSE2
#define pxor xorps
#define movsd movlps
@@ -124,7 +124,7 @@
sarl $3, I
jle .L35
ALIGN_4
-
+
.L31:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -216,7 +216,7 @@
maxss %xmm1, %xmm0
maxss %xmm3, %xmm0
addl $4 * SIZE, XX
- ALIGN_3
+ ALIGN_3
.L37:
testl $1, MM
@@ -247,7 +247,7 @@
sarl $3, I
jle .L75
ALIGN_4
-
+
.L71:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -325,7 +325,7 @@
#endif
addps %xmm3, %xmm1
maxps %xmm1, %xmm0
- ALIGN_3
+ ALIGN_3
.L76:
testl $2, MM
@@ -349,7 +349,7 @@
maxss %xmm1, %xmm0
maxss %xmm3, %xmm0
ALIGN_3
-
+
.L77:
testl $1, MM
je .L80
diff --git a/kernel/x86/zamax_sse2.S b/kernel/x86/zamax_sse2.S
index 50adffb..83f5cb8 100644
--- a/kernel/x86/zamax_sse2.S
+++ b/kernel/x86/zamax_sse2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -54,7 +54,7 @@
#define MM %ebp
#define XX %edi
#define TEMP %ebx
-
+
#ifdef USE_MIN
#define maxpd minpd
#define maxsd minsd
@@ -204,7 +204,7 @@
maxpd %xmm1, %xmm0
addl $4 * SIZE, XX
- ALIGN_3
+ ALIGN_3
.L27:
testl $1, MM
@@ -230,7 +230,7 @@
sarl $3, I
jle .L65
ALIGN_4
-
+
.L61:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
@@ -339,7 +339,7 @@
andpd %xmm7, %xmm2
addpd %xmm2, %xmm1
maxpd %xmm1, %xmm0
- ALIGN_3
+ ALIGN_3
.L67:
testl $1, MM
diff --git a/kernel/x86/zasum.S b/kernel/x86/zasum.S
index 84b8f60..1361205 100644
--- a/kernel/x86/zasum.S
+++ b/kernel/x86/zasum.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -49,7 +49,7 @@
#define M %edx
#define X %ecx
#define INCX %esi
-
+
#define I %eax
#include "l1param.h"
@@ -92,7 +92,7 @@
sarl $2, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -157,7 +157,7 @@
sarl $2, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
fabs
diff --git a/kernel/x86/zasum_sse.S b/kernel/x86/zasum_sse.S
index ff8230c..dee096b 100644
--- a/kernel/x86/zasum_sse.S
+++ b/kernel/x86/zasum_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
@@ -52,7 +52,7 @@
#define INCX %ebx
#include "l1param.h"
-
+
PROLOGUE
PROFCODE
@@ -79,7 +79,7 @@
movss STACK_M, %xmm3
shufps $0, %xmm3, %xmm3
#endif
-
+
sall $ZBASE_SHIFT, INCX
cmpl $2 * SIZE, INCX
@@ -124,7 +124,7 @@
decl I
jle .L12
ALIGN_3
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -277,7 +277,7 @@
sarl $2, I
jle .L105
ALIGN_4
-
+
.L101:
movsd (X), %xmm4
addl INCX, X
@@ -322,18 +322,18 @@
#ifndef HAVE_SSE3
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
-
+
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
addss %xmm1, %xmm0
#else
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
-#endif
+#endif
movss %xmm0, STACK_M
flds STACK_M
-
+
popl %ebx
popl %esi
ret
diff --git a/kernel/x86/zasum_sse2.S b/kernel/x86/zasum_sse2.S
index b7dbc15..0c73491 100644
--- a/kernel/x86/zasum_sse2.S
+++ b/kernel/x86/zasum_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
@@ -111,7 +111,7 @@
decl I
jle .L11
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -224,7 +224,7 @@
addpd %xmm5, %xmm1
addl $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L22:
testl $2, M
@@ -234,7 +234,7 @@
andps %xmm3, %xmm4
addpd %xmm4, %xmm0
addl $2 * SIZE, X
-
+
.L23:
testl $1, M
je .L999
@@ -253,7 +253,7 @@
sarl $2, I
jle .L60
ALIGN_4
-
+
.L50:
movsd 0 * SIZE(X), %xmm4
movhps 1 * SIZE(X), %xmm4
diff --git a/kernel/x86/zaxpy.S b/kernel/x86/zaxpy.S
index 0894f5d..b79ad79 100644
--- a/kernel/x86/zaxpy.S
+++ b/kernel/x86/zaxpy.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define STACK_ALPHA_R 16 + STACK + ARGS(%esp)
diff --git a/kernel/x86/zaxpy_sse.S b/kernel/x86/zaxpy_sse.S
index 9c94cec..3f67a0f 100644
--- a/kernel/x86/zaxpy_sse.S
+++ b/kernel/x86/zaxpy_sse.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esp)
#define STACK_ALPHA_I 20 + STACK + ARGS(%esp)
@@ -97,7 +97,7 @@
movss STACK_M, %xmm5
shufps $0x11, %xmm5, %xmm5
#endif
-
+
shufps $0, ALPHA_R, ALPHA_R
shufps $0, ALPHA_I, ALPHA_I
@@ -3125,7 +3125,7 @@
addps %xmm1, %xmm4
movsd %xmm4, (Y)
-
+
decl %eax
jg .L201
diff --git a/kernel/x86/zaxpy_sse2.S b/kernel/x86/zaxpy_sse2.S
index 9c2caa7..db6001c 100644
--- a/kernel/x86/zaxpy_sse2.S
+++ b/kernel/x86/zaxpy_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esp)
#define STACK_ALPHA_I 24 + STACK + ARGS(%esp)
@@ -113,10 +113,10 @@
#endif
#ifndef CONJ
- shufps $0x0c, %xmm5, %xmm5
+ shufps $0x0c, %xmm5, %xmm5
xorpd %xmm5, ALPHA_I
#else
- shufps $0xc0, %xmm5, %xmm5
+ shufps $0xc0, %xmm5, %xmm5
xorpd %xmm5, ALPHA_R
#endif
@@ -1518,7 +1518,7 @@
movlpd %xmm4, 0 * SIZE(YY)
movhpd %xmm4, 1 * SIZE(YY)
-
+
decl %eax
jg .L58
ALIGN_3
diff --git a/kernel/x86/zcopy.S b/kernel/x86/zcopy.S
index 153853e..248baf4 100644
--- a/kernel/x86/zcopy.S
+++ b/kernel/x86/zcopy.S
@@ -41,13 +41,13 @@
#define STACK 12
#define ARGS 0
-
+
#define M 4 + STACK + ARGS(%esp)
#define X 8 + STACK + ARGS(%esp)
#define INCX 12 + STACK + ARGS(%esp)
#define Y 16 + STACK + ARGS(%esp)
#define INCY 20 + STACK + ARGS(%esp)
-
+
PROLOGUE
pushl %edi
@@ -71,7 +71,7 @@
sall $ZBASE_SHIFT, %esi
sall $ZBASE_SHIFT, %edi
-
+
cmpl $2 * SIZE, %esi # if incx != 1
jne .L100
cmpl $2 * SIZE, %edi # if incy != 1
@@ -84,14 +84,14 @@
.L11:
#if defined(DOUBLE) || defined(XDOUBLE)
- FLD 7 * SIZE(%ecx)
- FLD 6 * SIZE(%ecx)
- FLD 5 * SIZE(%ecx)
- FLD 4 * SIZE(%ecx)
- FLD 3 * SIZE(%ecx)
- FLD 2 * SIZE(%ecx)
- FLD 1 * SIZE(%ecx)
- FLD 0 * SIZE(%ecx)
+ FLD 7 * SIZE(%ecx)
+ FLD 6 * SIZE(%ecx)
+ FLD 5 * SIZE(%ecx)
+ FLD 4 * SIZE(%ecx)
+ FLD 3 * SIZE(%ecx)
+ FLD 2 * SIZE(%ecx)
+ FLD 1 * SIZE(%ecx)
+ FLD 0 * SIZE(%ecx)
FST 0 * SIZE(%edx)
FST 1 * SIZE(%edx)
@@ -102,10 +102,10 @@
FST 6 * SIZE(%edx)
FST 7 * SIZE(%edx)
#else
- fldl 6 * SIZE(%ecx)
- fldl 4 * SIZE(%ecx)
- fldl 2 * SIZE(%ecx)
- fldl 0 * SIZE(%ecx)
+ fldl 6 * SIZE(%ecx)
+ fldl 4 * SIZE(%ecx)
+ fldl 2 * SIZE(%ecx)
+ fldl 0 * SIZE(%ecx)
fstpl 0 * SIZE(%edx)
fstpl 2 * SIZE(%edx)
diff --git a/kernel/x86/zcopy_sse.S b/kernel/x86/zcopy_sse.S
index 8393005..23e740e 100644
--- a/kernel/x86/zcopy_sse.S
+++ b/kernel/x86/zcopy_sse.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -89,7 +89,7 @@
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
addl M, M
-
+
testl $SIZE, Y
je .L05
diff --git a/kernel/x86/zcopy_sse2.S b/kernel/x86/zcopy_sse2.S
index f936a34..c31726f 100644
--- a/kernel/x86/zcopy_sse2.S
+++ b/kernel/x86/zcopy_sse2.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86/zdot.S b/kernel/x86/zdot.S
index 9d8866a..5a2a758 100644
--- a/kernel/x86/zdot.S
+++ b/kernel/x86/zdot.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#if defined(DOUBLE) || defined(XDOUBLE)
#define RESULT 4 + STACK + ARGS(%esp)
#define STACK_N 8 + STACK + ARGS(%esp)
@@ -100,9 +100,9 @@
addl INCY, INCY
fldz
- leal (, INCX, SIZE), INCX
+ leal (, INCX, SIZE), INCX
fldz
- leal (, INCY, SIZE), INCY
+ leal (, INCY, SIZE), INCY
fldz
cmpl $2 * SIZE, INCX
diff --git a/kernel/x86/zdot_amd.S b/kernel/x86/zdot_amd.S
index 97a1e72..0a74c47 100644
--- a/kernel/x86/zdot_amd.S
+++ b/kernel/x86/zdot_amd.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#if !defined(DOUBLE) && !defined(XDOUBLE)
#define RESULT 4 + STACK + ARGS(%esp)
#define STACK_N 8 + STACK + ARGS(%esp)
@@ -98,8 +98,8 @@
addl INCX, INCX
addl INCY, INCY
- leal (, INCX, SIZE), INCX
- leal (, INCY, SIZE), INCY
+ leal (, INCX, SIZE), INCX
+ leal (, INCY, SIZE), INCY
cmpl $2 * SIZE, INCX
jne .L14
diff --git a/kernel/x86/zdot_sse.S b/kernel/x86/zdot_sse.S
index cc22964..117574e 100644
--- a/kernel/x86/zdot_sse.S
+++ b/kernel/x86/zdot_sse.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S
index 61e1bfc..d799e5d 100644
--- a/kernel/x86/zdot_sse2.S
+++ b/kernel/x86/zdot_sse2.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define RESULT 4 + STACK + ARGS(%esp)
#define STACK_N 8 + STACK + ARGS(%esp)
#define STACK_X 12 + STACK + ARGS(%esp)
@@ -119,7 +119,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(Y), %xmm6
@@ -127,7 +127,7 @@
movaps -12 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(Y), %xmm7
@@ -139,7 +139,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -8 * SIZE(Y), %xmm6
@@ -147,7 +147,7 @@
movaps -8 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -6 * SIZE(Y), %xmm7
@@ -159,7 +159,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -4 * SIZE(Y), %xmm6
@@ -167,7 +167,7 @@
movaps -4 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -2 * SIZE(Y), %xmm7
@@ -179,7 +179,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps 0 * SIZE(Y), %xmm6
@@ -187,7 +187,7 @@
movaps 0 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps 2 * SIZE(Y), %xmm7
@@ -203,7 +203,7 @@
ALIGN_3
.L12:
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(Y), %xmm6
@@ -211,7 +211,7 @@
movaps -12 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(Y), %xmm7
@@ -219,7 +219,7 @@
movaps -10 * SIZE(X), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -8 * SIZE(Y), %xmm6
@@ -227,7 +227,7 @@
movaps -8 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -6 * SIZE(Y), %xmm7
@@ -235,7 +235,7 @@
movaps -6 * SIZE(X), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -4 * SIZE(Y), %xmm6
@@ -243,7 +243,7 @@
movaps -4 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -2 * SIZE(Y), %xmm7
@@ -251,13 +251,13 @@
movaps -2 * SIZE(X), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -276,7 +276,7 @@
movaps -14 * SIZE(X), %xmm5
movaps -14 * SIZE(Y), %xmm7
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(Y), %xmm6
@@ -284,7 +284,7 @@
movaps -12 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(Y), %xmm7
@@ -292,13 +292,13 @@
movaps -10 * SIZE(X), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -317,13 +317,13 @@
movaps -14 * SIZE(X), %xmm5
movaps -14 * SIZE(Y), %xmm7
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -340,7 +340,7 @@
movaps -16 * SIZE(X), %xmm4
movaps -16 * SIZE(Y), %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
@@ -370,7 +370,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(Y), %xmm6
@@ -379,7 +379,7 @@
movhps -11 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(Y), %xmm7
@@ -392,7 +392,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -8 * SIZE(Y), %xmm6
@@ -401,7 +401,7 @@
movhps -7 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -6 * SIZE(Y), %xmm7
@@ -414,7 +414,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -4 * SIZE(Y), %xmm6
@@ -423,7 +423,7 @@
movhps -3 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -2 * SIZE(Y), %xmm7
@@ -436,7 +436,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps 0 * SIZE(Y), %xmm6
@@ -445,7 +445,7 @@
movhps 1 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps 2 * SIZE(Y), %xmm7
@@ -462,7 +462,7 @@
ALIGN_3
.L22:
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(Y), %xmm6
@@ -471,7 +471,7 @@
movhps -11 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(Y), %xmm7
@@ -480,7 +480,7 @@
movhps -9 * SIZE(X), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -8 * SIZE(Y), %xmm6
@@ -489,7 +489,7 @@
movhps -7 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -6 * SIZE(Y), %xmm7
@@ -498,7 +498,7 @@
movhps -5 * SIZE(X), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -4 * SIZE(Y), %xmm6
@@ -507,7 +507,7 @@
movhps -3 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -2 * SIZE(Y), %xmm7
@@ -516,13 +516,13 @@
movhps -1 * SIZE(X), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -544,7 +544,7 @@
movhps -13 * SIZE(X), %xmm5
movaps -14 * SIZE(Y), %xmm7
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(Y), %xmm6
@@ -553,7 +553,7 @@
movhps -11 * SIZE(X), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(Y), %xmm7
@@ -562,13 +562,13 @@
movhps -9 * SIZE(X), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -586,7 +586,7 @@
movhps -15 * SIZE(X), %xmm4
movaps -16 * SIZE(Y), %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
@@ -596,7 +596,7 @@
movhps -13 * SIZE(X), %xmm5
movaps -14 * SIZE(Y), %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -614,7 +614,7 @@
movhps -15 * SIZE(X), %xmm4
movaps -16 * SIZE(Y), %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
@@ -647,7 +647,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(X), %xmm6
@@ -656,7 +656,7 @@
movhps -11 * SIZE(Y), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(X), %xmm7
@@ -668,7 +668,7 @@
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -8 * SIZE(X), %xmm6
@@ -677,7 +677,7 @@
movhps -7 * SIZE(Y), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -6 * SIZE(X), %xmm7
@@ -690,7 +690,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -4 * SIZE(X), %xmm6
@@ -699,7 +699,7 @@
movhps -3 * SIZE(Y), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -2 * SIZE(X), %xmm7
@@ -712,7 +712,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps 0 * SIZE(X), %xmm6
@@ -721,7 +721,7 @@
movhps 1 * SIZE(Y), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps 2 * SIZE(X), %xmm7
@@ -738,7 +738,7 @@
ALIGN_3
.L32:
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(X), %xmm6
@@ -747,7 +747,7 @@
movhps -11 * SIZE(Y), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(X), %xmm7
@@ -756,7 +756,7 @@
movhps -9 * SIZE(Y), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -8 * SIZE(X), %xmm6
@@ -765,7 +765,7 @@
movhps -7 * SIZE(Y), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -6 * SIZE(X), %xmm7
@@ -774,7 +774,7 @@
movhps -5 * SIZE(Y), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -4 * SIZE(X), %xmm6
@@ -783,7 +783,7 @@
movhps -3 * SIZE(Y), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -2 * SIZE(X), %xmm7
@@ -792,13 +792,13 @@
movhps -1 * SIZE(Y), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -820,7 +820,7 @@
movhps -13 * SIZE(Y), %xmm5
movaps -14 * SIZE(X), %xmm7
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
movaps -12 * SIZE(X), %xmm6
@@ -829,7 +829,7 @@
movhps -11 * SIZE(Y), %xmm4
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
movaps -10 * SIZE(X), %xmm7
@@ -838,13 +838,13 @@
movhps -9 * SIZE(Y), %xmm5
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -862,7 +862,7 @@
movhps -15 * SIZE(Y), %xmm4
movaps -16 * SIZE(X), %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
@@ -872,7 +872,7 @@
movhps -13 * SIZE(Y), %xmm5
movaps -14 * SIZE(X), %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -893,7 +893,7 @@
movhps -15 * SIZE(Y), %xmm4
movaps -16 * SIZE(X), %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
@@ -939,7 +939,7 @@
#endif
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -949,7 +949,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -959,7 +959,7 @@
addpd %xmm3, %xmm1
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -969,7 +969,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -979,7 +979,7 @@
addpd %xmm3, %xmm1
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -989,7 +989,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -999,7 +999,7 @@
addpd %xmm3, %xmm1
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -1009,7 +1009,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -1027,7 +1027,7 @@
.L42:
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -1037,7 +1037,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -1047,7 +1047,7 @@
addpd %xmm3, %xmm1
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -1057,7 +1057,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -1067,7 +1067,7 @@
addpd %xmm3, %xmm1
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -1077,7 +1077,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -1087,7 +1087,7 @@
addpd %xmm3, %xmm1
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -1097,7 +1097,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -1116,7 +1116,7 @@
movaps -16 * SIZE(Y), %xmm7
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -1126,7 +1126,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -1136,7 +1136,7 @@
addpd %xmm3, %xmm1
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -1146,7 +1146,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -1165,7 +1165,7 @@
movaps -16 * SIZE(Y), %xmm7
movsd %xmm7, %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
@@ -1175,7 +1175,7 @@
addpd %xmm3, %xmm1
movsd %xmm6, %xmm7
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
movsd %xmm4, %xmm5
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
@@ -1193,7 +1193,7 @@
movlpd -16 * SIZE(X), %xmm4
movlpd -16 * SIZE(Y), %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
@@ -1205,7 +1205,7 @@
SHUFPD_1 %xmm1, %xmm1
SHUFPD_1 %xmm2, %xmm2
SHUFPD_1 %xmm3, %xmm3
- jmp .L98
+ jmp .L98
ALIGN_3
.L50:
@@ -1232,7 +1232,7 @@
ALIGN_3
.L53:
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
MOVLPS 0 * SIZE(Y), %xmm6
@@ -1244,7 +1244,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
MOVLPS 0 * SIZE(Y), %xmm7
@@ -1256,7 +1256,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
MOVLPS 0 * SIZE(Y), %xmm6
@@ -1268,7 +1268,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
MOVLPS 0 * SIZE(Y), %xmm7
@@ -1280,7 +1280,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
MOVLPS 0 * SIZE(Y), %xmm6
@@ -1292,7 +1292,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
MOVLPS 0 * SIZE(Y), %xmm7
@@ -1304,7 +1304,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
MOVLPS 0 * SIZE(Y), %xmm6
@@ -1316,7 +1316,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
MOVLPS 0 * SIZE(Y), %xmm7
@@ -1333,7 +1333,7 @@
ALIGN_3
.L54:
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
MOVLPS 0 * SIZE(Y), %xmm6
@@ -1345,7 +1345,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
MOVLPS 0 * SIZE(Y), %xmm7
@@ -1357,7 +1357,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
MOVLPS 0 * SIZE(Y), %xmm6
@@ -1369,7 +1369,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
MOVLPS 0 * SIZE(Y), %xmm7
@@ -1381,7 +1381,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
MOVLPS 0 * SIZE(Y), %xmm6
@@ -1393,7 +1393,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
MOVLPS 0 * SIZE(Y), %xmm7
@@ -1405,13 +1405,13 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -1436,7 +1436,7 @@
movhps 1 * SIZE(Y), %xmm7
addl INCY, Y
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
MOVLPS 0 * SIZE(Y), %xmm6
@@ -1448,7 +1448,7 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
MOVLPS 0 * SIZE(Y), %xmm7
@@ -1460,13 +1460,13 @@
addl INCX, X
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
addpd %xmm3, %xmm1
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -1484,7 +1484,7 @@
movhps 1 * SIZE(Y), %xmm6
addl INCY, Y
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
@@ -1497,7 +1497,7 @@
movhps 1 * SIZE(Y), %xmm7
addl INCY, Y
- pshufd $0x4e, %xmm7, %xmm3
+ pshufd $0x4e, %xmm7, %xmm3
mulpd %xmm5, %xmm7
addpd %xmm7, %xmm0
mulpd %xmm5, %xmm3
@@ -1513,7 +1513,7 @@
MOVLPS 0 * SIZE(Y), %xmm6
movhps 1 * SIZE(Y), %xmm6
- pshufd $0x4e, %xmm6, %xmm3
+ pshufd $0x4e, %xmm6, %xmm3
mulpd %xmm4, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm4, %xmm3
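
[The hunks above only strip trailing whitespace, but the instruction group they touch — pshufd $0x4e to swap the two packed doubles of y, then mulpd/addpd into two running sums — appears to be the usual SSE2 complex dot-product accumulation. A minimal C sketch of one such step follows; zdot_step, acc_same and acc_cross are illustrative names, not identifiers from the kernel.]

    /* One unrolled step of the packed complex dot-product accumulation.
     * x = (xr, xi), y = (yr, yi); "swapping y" models pshufd $0x4e.      */
    static void zdot_step(const double x[2], const double y[2],
                          double acc_same[2], double acc_cross[2])
    {
        double y_swapped[2] = { y[1], y[0] };   /* pshufd $0x4e, y, tmp   */

        acc_same[0]  += x[0] * y[0];            /* mulpd + addpd -> xmm0  */
        acc_same[1]  += x[1] * y[1];

        acc_cross[0] += x[0] * y_swapped[0];    /* mulpd + addpd -> xmm1  */
        acc_cross[1] += x[1] * y_swapped[1];

        /* After the loop the kernel reduces the two accumulators into
         * Re = acc_same[0] -/+ acc_same[1] and Im = acc_cross[0] +/- acc_cross[1],
         * with the signs chosen by the conjugated / non-conjugated variant. */
    }
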
diff --git a/kernel/x86/zgemm3m_kernel_1x4_athlon.S b/kernel/x86/zgemm3m_kernel_1x4_athlon.S
index c57a8cb..4d84e50 100644
--- a/kernel/x86/zgemm3m_kernel_1x4_athlon.S
+++ b/kernel/x86/zgemm3m_kernel_1x4_athlon.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define I 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
@@ -132,7 +132,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl STACK_A, A
movl C, %edi
@@ -152,7 +152,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -166,7 +166,7 @@
jle .L13
ALIGN_4
-.L12:
+.L12:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -227,7 +227,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -533,7 +533,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl STACK_A, A
movl C, %edi
@@ -553,7 +553,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -567,7 +567,7 @@
jle .L23
ALIGN_4
-.L22:
+.L22:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -610,7 +610,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -774,7 +774,7 @@
#endif
leal (, LDC, 2), %eax
- addl %eax, C
+ addl %eax, C
movl B, B_ORIG
ALIGN_4
@@ -788,7 +788,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl STACK_A, A
movl C, %edi
@@ -808,7 +808,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -822,7 +822,7 @@
jle .L33
ALIGN_4
-.L32:
+.L32:
movl -16 * SIZE(B), %esi
movl -8 * SIZE(B), %esi
movl 0 * SIZE(B), %esi
@@ -861,7 +861,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -964,7 +964,7 @@
addl $1, KK
#endif
- addl LDC, C
+ addl LDC, C
movl B, B_ORIG
ALIGN_4
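
[Most of the whitespace-only "movl %eax, KKK" hunks in this and the following kernels sit inside the TRMM trip-count selection. A hedged C rendering of that #if ladder is sketched below; the kernels make the choice at preprocessing time, and a run-time function plus the names trmm_trip_count, mr and nr are used here only to keep the sketch self-contained — they are not symbols from the assembly sources.]

    /* KKK selection: how many k-iterations the micro-kernel runs for the
     * current tile when a triangular operand (TRMM) limits the update.   */
    static long trmm_trip_count(long k, long kk, long mr, long nr,
                                int trmm, int left, int transa)
    {
        if (!trmm)
            return k;                     /* plain GEMM: full K dimension        */
        if ((left && !transa) || (!left && transa))
            return k - kk;                /* columns past the diagonal offset KK */
        return kk + (left ? mr : nr);     /* leading KK plus the tile unroll     */
    }
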
diff --git a/kernel/x86/zgemm3m_kernel_2x2_atom.S b/kernel/x86/zgemm3m_kernel_2x2_atom.S
index ee918bf..51e948e 100644
--- a/kernel/x86/zgemm3m_kernel_2x2_atom.S
+++ b/kernel/x86/zgemm3m_kernel_2x2_atom.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -84,7 +84,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -100,7 +100,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sall $BASE_SHIFT + 1, %eax
@@ -129,7 +129,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movl BX, %eax
prefetcht0 0 * SIZE(%eax)
@@ -151,7 +151,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -325,7 +325,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -338,7 +338,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -466,7 +466,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, CO1
addl LDC, C
@@ -489,7 +489,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 1), BB
-#endif
+#endif
movsd 0 * SIZE(BB), %xmm1
xorps %xmm0, %xmm0
@@ -503,7 +503,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -629,7 +629,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 1), BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -641,7 +641,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm3m_kernel_2x2_coppermine.S b/kernel/x86/zgemm3m_kernel_2x2_coppermine.S
index 674829f..291dfa6 100644
--- a/kernel/x86/zgemm3m_kernel_2x2_coppermine.S
+++ b/kernel/x86/zgemm3m_kernel_2x2_coppermine.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
@@ -73,7 +73,7 @@
#else
#define REP rep
#endif
-
+
PROLOGUE
subl $ARGS, %esp # Generate Stack Frame
@@ -90,14 +90,14 @@
negl %eax
movl %eax, KK
#endif
-
+
movl N, %eax # j = (n >> 1) # MEMORY
movl LDC, %ebp # ldc # MEMORY
movl B, %ebx
sall $ZBASE_SHIFT, %ebp
- sarl $1, %eax
+ sarl $1, %eax
leal 0(%ecx) , %ecx # NOP
movl %eax, J # j = (n >> 1) # MEMORY
@@ -109,7 +109,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl %ebx, BX
@@ -130,7 +130,7 @@
leal (, %eax, SIZE), %eax
leal (%edx, %eax, 2), %edx
leal (%ebx, %eax, 2), %ecx
-#endif
+#endif
#ifdef HAVE_SSE
movl BX, %eax
@@ -167,7 +167,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -402,7 +402,7 @@
leal (, %eax, SIZE), %eax
leal (%edx, %eax, 1), %edx
leal (%ebx, %eax, 2), %ecx
-#endif
+#endif
fldz
fldz
@@ -413,7 +413,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -522,11 +522,11 @@
movl N, %eax # n # MEMORY
andl $1, %eax
je .End
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, %edi # c # MEMORY
movl A, %edx # a # MEMORY
@@ -535,7 +535,7 @@
sarl $1, %esi # m >> 1
je .L36
ALIGN_4
-
+
.L46:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
@@ -546,14 +546,14 @@
leal (, %eax, SIZE), %eax
leal (%edx, %eax, 2), %edx
leal (%ebx, %eax, 1), %ecx
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -666,14 +666,14 @@
leal (, %eax, SIZE), %eax
leal (%edx, %eax, 1), %edx
leal (%ebx, %eax, 1), %ecx
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm3m_kernel_2x4_barcelona.S b/kernel/x86/zgemm3m_kernel_2x4_barcelona.S
index 7822094..98d82ed 100644
--- a/kernel/x86/zgemm3m_kernel_2x4_barcelona.S
+++ b/kernel/x86/zgemm3m_kernel_2x4_barcelona.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -202,7 +202,7 @@
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -222,7 +222,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax
movl %eax, BX
@@ -246,7 +246,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
@@ -274,7 +274,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -517,7 +517,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 1), AO
leal (B, %eax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
@@ -533,7 +533,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -695,7 +695,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, CO # coffset = c
movl A, AO # aoffset = a
@@ -716,7 +716,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -731,7 +731,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -898,7 +898,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 1), AO
leal (B, %eax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -911,7 +911,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1030,7 +1030,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, CO # coffset = c
movl A, AO # aoffset = a
@@ -1051,7 +1051,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 1), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm0
pxor %xmm4, %xmm4
@@ -1066,7 +1066,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1186,7 +1186,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 1), AO
leal (B, %eax, 1), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -1199,7 +1199,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm3m_kernel_2x4_opteron.S b/kernel/x86/zgemm3m_kernel_2x4_opteron.S
index 8e93a28..30d8090 100644
--- a/kernel/x86/zgemm3m_kernel_2x4_opteron.S
+++ b/kernel/x86/zgemm3m_kernel_2x4_opteron.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
@@ -237,7 +237,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -251,7 +251,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
movl K, %eax
@@ -259,7 +259,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
#define COPYPREFETCH 40
@@ -320,7 +320,7 @@
addl $4 * SIZE, %edi
ALIGN_4
-
+
.L10:
movl %edi, BX
@@ -343,7 +343,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movl BX, %eax
@@ -374,7 +374,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -389,7 +389,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -630,7 +630,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -648,7 +648,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -862,7 +862,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
movl K, %eax
@@ -870,7 +870,7 @@
sarl $2, %eax
jle .L35
ALIGN_4
-
+
.L32:
#ifdef PENTIUM4
#ifdef HAVE_SSE3
@@ -1002,7 +1002,7 @@
decl %eax
jne .L36
ALIGN_4
-
+
.L40:
movl C, %esi # coffset = c
movl A, AA # aoffset = a
@@ -1023,7 +1023,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1050,7 +1050,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1220,7 +1220,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1239,7 +1239,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1384,14 +1384,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L65
ALIGN_4
-
+
.L62:
#ifdef PENTIUM4
#ifdef HAVE_SSE3
@@ -1512,7 +1512,7 @@
decl %eax
jne .L66
ALIGN_4
-
+
.L70:
movl C, %esi # coffset = c
movl A, AA # aoffset = a
@@ -1533,7 +1533,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1558,7 +1558,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1679,7 +1679,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1698,7 +1698,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1793,7 +1793,7 @@
.L999:
movl OLD_STACK, %esp
EMMS
-
+
popl %ebx
popl %esi
popl %edi
diff --git a/kernel/x86/zgemm3m_kernel_2x4_penryn.S b/kernel/x86/zgemm3m_kernel_2x4_penryn.S
index 3920649..f3e94a6 100644
--- a/kernel/x86/zgemm3m_kernel_2x4_penryn.S
+++ b/kernel/x86/zgemm3m_kernel_2x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -103,7 +103,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -122,7 +122,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sall $BASE_SHIFT + 2, %eax
@@ -176,7 +176,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -481,7 +481,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -653,7 +653,7 @@
movlps %xmm1, 0 * SIZE(%eax, LDC)
movhps %xmm1, 1 * SIZE(%eax, LDC)
ALIGN_4
-
+
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -675,7 +675,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -713,7 +713,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -917,7 +917,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1041,7 +1041,7 @@
movlps %xmm1, 0 * SIZE(C1, LDC)
movhps %xmm1, 1 * SIZE(C1, LDC)
ALIGN_4
-
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -1061,7 +1061,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -1096,7 +1096,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1246,7 +1246,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1331,7 +1331,7 @@
movlps %xmm0, 0 * SIZE(C1)
movhps %xmm0, 1 * SIZE(C1)
ALIGN_4
-
+
.L999:
popl %ebx
popl %esi
diff --git a/kernel/x86/zgemm3m_kernel_2x4_prescott.S b/kernel/x86/zgemm3m_kernel_2x4_prescott.S
index a32e0ae..f7cd2ae 100644
--- a/kernel/x86/zgemm3m_kernel_2x4_prescott.S
+++ b/kernel/x86/zgemm3m_kernel_2x4_prescott.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -207,7 +207,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -223,7 +223,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sall $BASE_SHIFT + 2, %eax
@@ -249,7 +249,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 4), BB
-#endif
+#endif
movl BX, %eax
prefetcht2 0 * SIZE(%eax)
@@ -278,7 +278,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -293,7 +293,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-
+
.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
@@ -715,7 +715,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 4), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -731,7 +731,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -944,7 +944,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl M, %ebx
sarl $1, %ebx # i = (m >> 2)
@@ -962,7 +962,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -988,7 +988,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1155,7 +1155,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1171,7 +1171,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1314,7 +1314,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl M, %ebx
sarl $1, %ebx # i = (m >> 2)
@@ -1332,7 +1332,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 1), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1352,7 +1352,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1469,7 +1469,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 1), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1485,7 +1485,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm3m_kernel_4x2_core2.S b/kernel/x86/zgemm3m_kernel_4x2_core2.S
index 0c01de8..00f4409 100644
--- a/kernel/x86/zgemm3m_kernel_4x2_core2.S
+++ b/kernel/x86/zgemm3m_kernel_4x2_core2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
@@ -130,13 +130,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
jle .L05
ALIGN_4
-
+
.L02:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -183,7 +183,7 @@
decl %eax
jne .L06
ALIGN_4
-
+
.L10:
movl B, BX
@@ -229,7 +229,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -517,7 +517,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $2, %eax
@@ -685,7 +685,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -828,13 +828,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $3, %eax
jle .L45
ALIGN_4
-
+
.L42:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -876,7 +876,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
movl C, C1
movl A, AA
@@ -914,7 +914,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1086,7 +1086,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1219,7 +1219,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
@@ -1316,7 +1316,7 @@
.L999:
movl OLD_STACK, %esp
-
+
EMMS
popl %ebx
diff --git a/kernel/x86/zgemm3m_kernel_4x2_northwood.S b/kernel/x86/zgemm3m_kernel_4x2_northwood.S
index fb7d639..883a874 100644
--- a/kernel/x86/zgemm3m_kernel_4x2_northwood.S
+++ b/kernel/x86/zgemm3m_kernel_4x2_northwood.S
@@ -47,7 +47,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -58,7 +58,7 @@
#define STACK_C 40 + STACK + ARGS(%esi)
#define STACK_LDC 44 + STACK + ARGS(%esi)
#define STACK_OFFT 48 + STACK + ARGS(%esi)
-
+
#define ALPHA 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
@@ -243,7 +243,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -252,12 +252,12 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -265,7 +265,7 @@
sarl $2, %eax
jle .L03
ALIGN_2
-
+
.L02:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -326,7 +326,7 @@
BRANCH
jne .L04
ALIGN_4
-
+
.L05:
movl B, BX
@@ -370,7 +370,7 @@
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
prefetchnta 3 * SIZE(%esi)
prefetchnta 3 * SIZE(%esi, LDC)
@@ -385,7 +385,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -403,7 +403,7 @@
je .L12
sall $3, %eax
.align 8
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -496,7 +496,7 @@
subl $64 * 8, %eax
BRANCH
jg .L1X
-
+
.L11:
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB
@@ -505,7 +505,7 @@
sarl $3, %eax
je .L12
-.L11:
+.L11:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -550,7 +550,7 @@
addl $4 * SIZE, BB # boffset1 += 8
subl $1, %eax
jg .L13
- ALIGN_4
+ ALIGN_4
.L14:
movsd 0 * SIZE(%esi), %xmm0
@@ -668,14 +668,14 @@
pxor %xmm6, %xmm6
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $2, %eax
@@ -684,7 +684,7 @@
sarl $3, %eax
je .L32
-.L31:
+.L31:
mulpd %xmm0, %xmm2
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm2, %xmm4
@@ -846,14 +846,14 @@
pxor %xmm6, %xmm6
movsd 4 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -866,7 +866,7 @@
sarl $3, %eax
je .L52
-.L51:
+.L51:
mulsd %xmm0, %xmm2
mulsd 2 * SIZE(BB), %xmm0
addsd %xmm2, %xmm4
@@ -988,12 +988,12 @@
testl $1, %eax
jle .L999
ALIGN_2
-
+
.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -1002,7 +1002,7 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
movsd 0 * SIZE(B), %xmm0
movsd 1 * SIZE(B), %xmm1
@@ -1059,7 +1059,7 @@
decl %eax
jne .L104
ALIGN_4
-
+
.L105:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1099,7 +1099,7 @@
pxor %xmm6, %xmm6
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
@@ -1107,7 +1107,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1120,7 +1120,7 @@
sarl $3, %eax
je .L112
-.L111:
+.L111:
mulpd %xmm2, %xmm0
mulpd 2 * SIZE(AA), %xmm2
addpd %xmm0, %xmm4
@@ -1244,7 +1244,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
movl M, %ebx
@@ -1281,7 +1281,7 @@
pxor %xmm6, %xmm6
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
@@ -1289,7 +1289,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1302,7 +1302,7 @@
sarl $3, %eax
je .L132
-.L131:
+.L131:
mulpd %xmm0, %xmm2
movapd 2 * SIZE(AA), %xmm0
addpd %xmm2, %xmm4
@@ -1420,14 +1420,14 @@
pxor %xmm6, %xmm6
movapd 4 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
@@ -1436,7 +1436,7 @@
sarl $3, %eax
je .L152
-.L151:
+.L151:
mulsd %xmm0, %xmm2
movsd 1 * SIZE(AA), %xmm0
addsd %xmm2, %xmm4
diff --git a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S
index 623f0be..fcdc334 100644
--- a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S
+++ b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -207,7 +207,7 @@
andl $-1024, %esp # align stack
STACK_TOUCHING
-
+
movl OLD_N, %eax
movl OLD_K, %ecx
movl OLD_A, %edx
@@ -235,7 +235,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -247,7 +247,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -256,7 +256,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
@@ -312,7 +312,7 @@
addl $4 * SIZE, %edi
ALIGN_4
-
+
.L10:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -333,7 +333,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -356,7 +356,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -370,7 +370,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -592,7 +592,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -608,7 +608,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -837,7 +837,7 @@
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
leal (BB, %eax, 8), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -853,7 +853,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1052,14 +1052,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $2, %eax
jle .L45
ALIGN_4
-
+
.L42:
prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
@@ -1116,7 +1116,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1137,7 +1137,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1157,7 +1157,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1325,7 +1325,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1344,7 +1344,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1501,7 +1501,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1520,7 +1520,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1667,7 +1667,7 @@
addl $2, KK
#endif
leal (, LDC, 2), %eax
- addl %eax, C
+ addl %eax, C
ALIGN_4
.L80:
@@ -1677,7 +1677,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1685,7 +1685,7 @@
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
prefetch (RPREFETCHSIZE + 0) * SIZE(%edi)
@@ -1738,7 +1738,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1759,7 +1759,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1778,7 +1778,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1901,7 +1901,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1920,7 +1920,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2034,7 +2034,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -2053,7 +2053,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm3m_kernel_4x4_opteron.S b/kernel/x86/zgemm3m_kernel_4x4_opteron.S
index 511fc8b..70c3dd8 100644
--- a/kernel/x86/zgemm3m_kernel_4x4_opteron.S
+++ b/kernel/x86/zgemm3m_kernel_4x4_opteron.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -204,7 +204,7 @@
addps %xmm1, %xmm7; \
movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
#endif
-
+
#ifdef PENTIUM4
#define KERNEL1(address) \
mulps %xmm0, %xmm2; \
@@ -330,7 +330,7 @@
PROFCODE
EMMS
-
+
movl %esp, %esi # save old stack
subl $128 + LOCAL_BUFFER_SIZE, %esp
movl OLD_M, %ebx
@@ -371,7 +371,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -380,7 +380,7 @@
sarl $1, %eax
jle .L05
ALIGN_4
-
+
.L02:
#ifdef HAVE_SSE2
movss 0 * SIZE(%edi), %xmm0
@@ -516,7 +516,7 @@
#endif
addl $4 * SIZE, %edi
ALIGN_4
-
+
.L10:
movl %edi, BX
@@ -539,7 +539,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movl BX, %eax
@@ -599,7 +599,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -614,7 +614,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -858,7 +858,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -874,7 +874,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1103,7 +1103,7 @@
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
leal (BB, %eax, 8), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1119,7 +1119,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1321,14 +1321,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $2, %eax
jle .L45
ALIGN_4
-
+
.L42:
prefetchnta 80 * SIZE(%edi)
@@ -1453,7 +1453,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1474,7 +1474,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1499,7 +1499,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1667,7 +1667,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1686,7 +1686,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1843,7 +1843,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1862,7 +1862,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1981,7 +1981,7 @@
addl $2, KK
#endif
leal (, LDC, 2), %eax
- addl %eax, C
+ addl %eax, C
ALIGN_4
.L80:
@@ -1991,14 +1991,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
prefetchnta 80 * SIZE(%edi)
@@ -2112,7 +2112,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -2133,7 +2133,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -2156,7 +2156,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2279,7 +2279,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -2298,7 +2298,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2411,7 +2411,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -2430,7 +2430,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm3m_kernel_4x4_penryn.S b/kernel/x86/zgemm3m_kernel_4x4_penryn.S
index 802298c..df38500 100644
--- a/kernel/x86/zgemm3m_kernel_4x4_penryn.S
+++ b/kernel/x86/zgemm3m_kernel_4x4_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -86,7 +86,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -105,7 +105,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sall $BASE_SHIFT + 2, %eax
@@ -160,7 +160,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -445,7 +445,7 @@
movhps %xmm0, 2 * SIZE(%eax, LDC)
movlps %xmm1, 4 * SIZE(%eax, LDC)
movhps %xmm1, 6 * SIZE(%eax, LDC)
-
+
addl $8 * SIZE, C1
decl I
jg .L11
@@ -481,7 +481,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -693,7 +693,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -836,7 +836,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -877,7 +877,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1079,7 +1079,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1232,7 +1232,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1359,7 +1359,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -1396,7 +1396,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1549,7 +1549,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1687,7 +1687,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm3m_kernel_4x4_prescott.S b/kernel/x86/zgemm3m_kernel_4x4_prescott.S
index 3d602e3..bdb19e1 100644
--- a/kernel/x86/zgemm3m_kernel_4x4_prescott.S
+++ b/kernel/x86/zgemm3m_kernel_4x4_prescott.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 16
-
+
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
@@ -248,7 +248,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -261,7 +261,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -270,7 +270,7 @@
sarl $2, %eax
jle .L05
ALIGN_4
-
+
.L02:
movddup 0 * SIZE(%edi), %xmm0
movddup 2 * SIZE(%edi), %xmm1
@@ -318,7 +318,7 @@
decl %eax
jne .L06
ALIGN_4
-
+
.L10:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -339,7 +339,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -362,7 +362,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -377,7 +377,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -703,7 +703,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -717,7 +717,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -900,7 +900,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -914,7 +914,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1054,14 +1054,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L45
ALIGN_4
-
+
.L42:
movddup 0 * SIZE(%edi), %xmm0
movddup 2 * SIZE(%edi), %xmm1
@@ -1106,7 +1106,7 @@
decl %eax
jne .L46
ALIGN_4
-
+
.L50:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1127,7 +1127,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1146,7 +1146,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1311,7 +1311,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1327,7 +1327,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1455,7 +1455,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1471,7 +1471,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1578,7 +1578,7 @@
addl $2, KK
#endif
leal (, LDC, 2), %eax
- addl %eax, C
+ addl %eax, C
ALIGN_4
.L80:
@@ -1588,14 +1588,14 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
leal BUFFER, %ecx
sarl $3, %eax
jle .L85
ALIGN_4
-
+
.L82:
movss 0 * SIZE(%edi), %xmm0
movss 1 * SIZE(%edi), %xmm1
@@ -1649,7 +1649,7 @@
decl %eax
jne .L86
ALIGN_4
-
+
.L90:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
@@ -1670,7 +1670,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 1), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1690,7 +1690,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1811,7 +1811,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 1), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1830,7 +1830,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1942,7 +1942,7 @@
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movss 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1958,7 +1958,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm3m_kernel_8x2_core2.S b/kernel/x86/zgemm3m_kernel_8x2_core2.S
index 9a28c8e..d387dd1 100644
--- a/kernel/x86/zgemm3m_kernel_8x2_core2.S
+++ b/kernel/x86/zgemm3m_kernel_8x2_core2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -83,7 +83,7 @@
#else
#define MOVSD movsd
#endif
-
+
PROLOGUE
pushl %ebp
@@ -140,13 +140,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
jle .L05
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
movss -32 * SIZE(B), %xmm0
@@ -205,7 +205,7 @@
decl %eax
jne .L06
ALIGN_4
-
+
.L10:
movl C, C1
movl A, AA
@@ -242,7 +242,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -523,7 +523,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -697,7 +697,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -853,7 +853,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -989,13 +989,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $3, %eax
jle .L55
ALIGN_4
-
+
.L52:
movss -32 * SIZE(B), %xmm0
movss -31 * SIZE(B), %xmm1
@@ -1047,7 +1047,7 @@
decl %eax
jne .L56
ALIGN_4
-
+
.L60:
movl C, C1
movl A, AA
@@ -1085,7 +1085,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1259,7 +1259,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1393,7 +1393,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1518,7 +1518,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1616,7 +1616,7 @@
.L999:
movl OLD_STACK, %esp
-
+
EMMS
popl %ebx
diff --git a/kernel/x86/zgemm3m_kernel_8x2_sse.S b/kernel/x86/zgemm3m_kernel_8x2_sse.S
index ea66dc1..24ec027 100644
--- a/kernel/x86/zgemm3m_kernel_8x2_sse.S
+++ b/kernel/x86/zgemm3m_kernel_8x2_sse.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -249,7 +249,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -258,12 +258,12 @@
movl %eax, J
jle .L100
ALIGN_2
-
+
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
movl K, %eax
@@ -271,7 +271,7 @@
sarl $2, %eax
jle .L03
ALIGN_4
-
+
.L02:
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
@@ -301,7 +301,7 @@
movaps %xmm7, 28 * SIZE(%ecx)
prefetcht0 104 * SIZE(B)
-
+
addl $ 8 * SIZE, B
addl $32 * SIZE, %ecx
decl %eax
@@ -369,7 +369,7 @@
XORPS %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
prefetchnta 7 * SIZE(%esi)
prefetchnta 7 * SIZE(%esi, %ebp)
@@ -379,7 +379,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -393,7 +393,7 @@
NOBRANCH
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -520,7 +520,7 @@
XORPS %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
prefetchnta 8 * SIZE(%esi)
prefetchnta 8 * SIZE(%esi, %ebp)
@@ -530,7 +530,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -838,7 +838,7 @@
BRANCH
decl %ebx # i --
jg .L10
- ALIGN_2
+ ALIGN_2
.L30:
movl M, %ebx
@@ -877,14 +877,14 @@
XORPS %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -983,14 +983,14 @@
XORPS %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1139,7 +1139,7 @@
movhps %xmm1, 6 * SIZE(%esi, LDC)
addl $8 * SIZE, %esi
- ALIGN_2
+ ALIGN_2
.L50:
testl $2, %ebx
@@ -1175,14 +1175,14 @@
XORPS %xmm6, %xmm6
MOVSD 8 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1289,14 +1289,14 @@
XORPS %xmm6, %xmm6
MOVSD 8 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1436,7 +1436,7 @@
movhps %xmm0, 2 * SIZE(%esi, LDC)
addl $4 * SIZE, %esi
- ALIGN_2
+ ALIGN_2
.L70:
testl $1, %ebx
@@ -1471,14 +1471,14 @@
XORPS %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1577,14 +1577,14 @@
XORPS %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1709,7 +1709,7 @@
addps %xmm2, %xmm0
movlps %xmm0, 0 * SIZE(%esi, LDC)
- ALIGN_2
+ ALIGN_2
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1728,12 +1728,12 @@
testl $1, %eax
jle .L999
ALIGN_2
-
+
.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
movl K, %eax
@@ -1741,10 +1741,10 @@
sarl $3, %eax
jle .L103
ALIGN_4
-
+
.L102:
prefetchnta 96 * SIZE(B)
-
+
movss 0 * SIZE(B), %xmm0
movss 1 * SIZE(B), %xmm1
movss 2 * SIZE(B), %xmm2
@@ -1836,14 +1836,14 @@
XORPS %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1943,14 +1943,14 @@
XORPS %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2102,7 +2102,7 @@
BRANCH
decl %ebx # i --
jg .L110
- ALIGN_2
+ ALIGN_2
.L130:
movl M, %ebx
@@ -2141,14 +2141,14 @@
XORPS %xmm6, %xmm6
movaps 16 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2225,14 +2225,14 @@
XORPS %xmm6, %xmm6
movaps 8 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2336,7 +2336,7 @@
movhps %xmm1, 6 * SIZE(%esi)
addl $8 * SIZE, %esi
- ALIGN_2
+ ALIGN_2
.L150:
testl $2, %ebx
@@ -2371,14 +2371,14 @@
XORPS %xmm6, %xmm6
MOVSD 8 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2461,14 +2461,14 @@
XORPS %xmm6, %xmm6
MOVSD 8 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2567,7 +2567,7 @@
movhps %xmm0, 2 * SIZE(%esi)
addl $4 * SIZE, %esi
- ALIGN_2
+ ALIGN_2
.L170:
testl $1, %ebx
@@ -2602,14 +2602,14 @@
XORPS %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2686,14 +2686,14 @@
XORPS %xmm6, %xmm6
movss 4 * SIZE(AA), %xmm1
XORPS %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -2785,7 +2785,7 @@
addps %xmm2, %xmm0
movlps %xmm0, 0 * SIZE(%esi)
- ALIGN_2
+ ALIGN_2
.L999:
movl OLD_STACK, %esp
diff --git a/kernel/x86/zgemm_beta.S b/kernel/x86/zgemm_beta.S
index c36e7c5..a66b45c 100644
--- a/kernel/x86/zgemm_beta.S
+++ b/kernel/x86/zgemm_beta.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#ifdef DOUBLE
diff --git a/kernel/x86/zgemm_kernel_1x1.S b/kernel/x86/zgemm_kernel_1x1.S
index 117b245..4df46dd 100644
--- a/kernel/x86/zgemm_kernel_1x1.S
+++ b/kernel/x86/zgemm_kernel_1x1.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define BX 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
@@ -105,7 +105,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl %ebx, BX
@@ -125,7 +125,7 @@
leal (, %eax, SIZE), %eax
leal (A, %eax, 2), A
leal (B, %eax, 2), B
-#endif
+#endif
#ifdef HAVE_SSE
movl BX, %eax
@@ -169,7 +169,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
diff --git a/kernel/x86/zgemm_kernel_1x1_atom.S b/kernel/x86/zgemm_kernel_1x1_atom.S
index 5d276b9..1441c65 100644
--- a/kernel/x86/zgemm_kernel_1x1_atom.S
+++ b/kernel/x86/zgemm_kernel_1x1_atom.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -57,7 +57,7 @@
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
#define KKK 12 + STACK(%esp)
-
+
#define PREFETCH prefetcht0
#define PREFETCHSIZE 84
@@ -107,7 +107,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -127,7 +127,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl B, BX
@@ -150,7 +150,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movl BX, %eax
prefetcht0 0 * SIZE(%eax)
@@ -171,7 +171,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -327,7 +327,7 @@
addl $2 * SIZE, CO1
decl %ebx
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
diff --git a/kernel/x86/zgemm_kernel_1x2.S b/kernel/x86/zgemm_kernel_1x2.S
index 0f98069..0d7e993 100644
--- a/kernel/x86/zgemm_kernel_1x2.S
+++ b/kernel/x86/zgemm_kernel_1x2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define J 0 + STACK(%esp)
#define I 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
@@ -331,7 +331,7 @@
movl STACK_LDC, LDC
sall $ZBASE_SHIFT, LDC
-
+
subl $(AOFFSET - 16 * SIZE), STACK_A
subl $(BOFFSET - 16 * SIZE), STACK_B
@@ -346,7 +346,7 @@
movl K, %eax
testl %eax, %eax
jle .L999
-
+
movl N, %eax
sarl $1, %eax
movl %eax, J
@@ -357,7 +357,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl STACK_A, A
movl STACK_B, B
@@ -411,7 +411,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -510,7 +510,7 @@
FMUL ALPHA_R
fxch %st(1)
FMUL ALPHA_I
- faddp %st, %st(1)
+ faddp %st, %st(1)
#ifndef TRMMKERNEL
FADD 1 * SIZE(%edi)
@@ -531,7 +531,7 @@
FMUL ALPHA_R
fxch %st(1)
FMUL ALPHA_I
- faddp %st, %st(1)
+ faddp %st, %st(1)
#ifndef TRMMKERNEL
FADD 1 * SIZE(%edi,LDC)
@@ -580,7 +580,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl STACK_A, A
movl STACK_B, B
@@ -617,7 +617,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -765,7 +765,7 @@
FMUL ALPHA_R
fxch %st(1)
FMUL ALPHA_I
- faddp %st, %st(1)
+ faddp %st, %st(1)
#ifndef TRMMKERNEL
FADD 1 * SIZE(%edi)
diff --git a/kernel/x86/zgemm_kernel_1x2_3dnow.S b/kernel/x86/zgemm_kernel_1x2_3dnow.S
index 3699bb2..f312a9b 100644
--- a/kernel/x86/zgemm_kernel_1x2_3dnow.S
+++ b/kernel/x86/zgemm_kernel_1x2_3dnow.S
@@ -99,7 +99,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
andl $-1024, %esp # align stack
STACK_TOUCHING
-
+
movl OLD_N, %eax
movl OLD_K, %ecx
movl OLD_A, %edx
@@ -172,7 +172,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
@@ -307,7 +307,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -619,7 +619,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
addl $2 * SIZE, %esi
decl %ebx
jg .L11
- ALIGN_4
+ ALIGN_4
.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -708,7 +708,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, %esi # coffset = c
movl A, AA # aoffset = a
@@ -744,7 +744,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -943,11 +943,11 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla
addl $2 * SIZE, %esi # coffset += 4
decl %ebx # i --
jg .L31
- ALIGN_4
+ ALIGN_4
.L999:
EMMS
-
+
movl OLD_STACK, %esp
popl %ebx
popl %esi
diff --git a/kernel/x86/zgemm_kernel_1x2_barcelona.S b/kernel/x86/zgemm_kernel_1x2_barcelona.S
index f71b095..41b6594 100644
--- a/kernel/x86/zgemm_kernel_1x2_barcelona.S
+++ b/kernel/x86/zgemm_kernel_1x2_barcelona.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -219,7 +219,7 @@
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -239,7 +239,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax
movl %eax, BX
@@ -262,7 +262,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 4), BO
-#endif
+#endif
movl BX, %eax
@@ -287,7 +287,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -496,7 +496,7 @@
addl $2 * SIZE, CO # coffset += 4
decl I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -521,7 +521,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, CO
movl A, AO
@@ -542,7 +542,7 @@
leal (, %eax, SIZE), %eax
leal (AO, %eax, 2), AO
leal (B, %eax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -558,7 +558,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -713,7 +713,7 @@
addl $2 * SIZE, CO # coffset += 4
decl I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L500:
popl %ebx
diff --git a/kernel/x86/zgemm_kernel_1x2_penryn.S b/kernel/x86/zgemm_kernel_1x2_penryn.S
index 70b38dc..adbadef 100644
--- a/kernel/x86/zgemm_kernel_1x2_penryn.S
+++ b/kernel/x86/zgemm_kernel_1x2_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -125,13 +125,13 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
movl M, %ebx
testl %ebx, %ebx
- jle .L999
+ jle .L999
subl $-16 * SIZE, A
subl $-16 * SIZE, B
@@ -148,7 +148,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl B, BX
@@ -169,7 +169,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movl BX, %eax
PREFETCHB -16 * SIZE(%eax)
@@ -193,7 +193,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -420,7 +420,7 @@
addsubpd %xmm5, %xmm4
addsubpd %xmm7, %xmm6
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm0, %xmm4
addpd %xmm1, %xmm6
@@ -468,7 +468,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1 # coffset = c
movl A, AA # aoffset = a
@@ -487,7 +487,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -505,7 +505,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -663,7 +663,7 @@
mulpd %xmm2, %xmm4
mulpd %xmm3, %xmm5
addsubpd %xmm5, %xmm4
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm0, %xmm4
#endif
diff --git a/kernel/x86/zgemm_kernel_1x2_sse2.S b/kernel/x86/zgemm_kernel_1x2_sse2.S
index 63fc30a..e621e4a 100644
--- a/kernel/x86/zgemm_kernel_1x2_sse2.S
+++ b/kernel/x86/zgemm_kernel_1x2_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -242,7 +242,7 @@
movlpd %xmm0, 0 + ALPHA_R
movlpd %xmm0, 8 + ALPHA_R
-
+
movlpd %xmm1, 8 + ALPHA_I
xorpd %xmm7, %xmm1
movlpd %xmm1, 0 + ALPHA_I
@@ -258,7 +258,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -272,7 +272,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -362,7 +362,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -381,7 +381,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -396,7 +396,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -576,7 +576,7 @@
pshufd $0x4e, %xmm4, %xmm5
pshufd $0x4e, %xmm6, %xmm7
-
+
mulpd %xmm2, %xmm4
mulpd %xmm3, %xmm5
mulpd %xmm2, %xmm6
@@ -611,7 +611,7 @@
addl $2 * SIZE, %esi # coffset += 4
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -634,7 +634,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
movapd POSINV, %xmm7
@@ -705,7 +705,7 @@
movl C, %esi # coffset = c
movl A, AA # aoffset = a
movl M, %ebx
- testl %ebx, %ebx
+ testl %ebx, %ebx
jle .L500
ALIGN_4
@@ -721,7 +721,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -738,7 +738,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -864,7 +864,7 @@
#endif
pshufd $0x4e, %xmm4, %xmm5
-
+
mulpd %xmm2, %xmm4
mulpd %xmm3, %xmm5
@@ -893,13 +893,13 @@
addl $2 * SIZE, %esi # coffset += 4
decl %ebx # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L500:
movl OLD_STACK, %esp
EMMS
-
+
popl %ebx
popl %esi
popl %edi
diff --git a/kernel/x86/zgemm_kernel_1x2_sse3.S b/kernel/x86/zgemm_kernel_1x2_sse3.S
index 70e6400..774cb0f 100644
--- a/kernel/x86/zgemm_kernel_1x2_sse3.S
+++ b/kernel/x86/zgemm_kernel_1x2_sse3.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -57,7 +57,7 @@
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
#define KKK 12 + STACK(%esp)
-
+
#ifdef PENTIUM4
#define PREFETCH_R (8 * 4)
#define PREFETCH prefetcht1
@@ -222,7 +222,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -238,7 +238,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl B, BX
@@ -261,7 +261,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 4), BB
-#endif
+#endif
movl BX, %eax
@@ -289,7 +289,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -304,7 +304,7 @@
andl $-8, %eax
sall $4, %eax
je .L12
-
+
.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
@@ -578,7 +578,7 @@
addsubpd %xmm5, %xmm4
addsubpd %xmm7, %xmm6
-
+
#ifndef TRMMKERNEL
movsd 0 * SIZE(%esi), %xmm0
movhpd 1 * SIZE(%esi), %xmm0
@@ -610,7 +610,7 @@
addl $2 * SIZE, %esi # coffset += 4
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -635,10 +635,10 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl M, %ebx
- testl %ebx, %ebx
+ testl %ebx, %ebx
jle .L500
ALIGN_4
@@ -653,7 +653,7 @@ L110:
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (B, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -673,7 +673,7 @@ L110:
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -843,7 +843,7 @@ L114:
addl $2 * SIZE, %esi # coffset += 4
decl %ebx # i --
jg L110
- ALIGN_4
+ ALIGN_4
.L500:
popl %ebx
diff --git a/kernel/x86/zgemm_kernel_2x1_core2.S b/kernel/x86/zgemm_kernel_2x1_core2.S
index 3ed5342..8b3e9f3 100644
--- a/kernel/x86/zgemm_kernel_2x1_core2.S
+++ b/kernel/x86/zgemm_kernel_2x1_core2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -145,7 +145,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
subl $-16 * SIZE, A
@@ -163,7 +163,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
@@ -191,7 +191,7 @@
movapd %xmm5, -6 * SIZE(BB)
movapd %xmm6, -4 * SIZE(BB)
movapd %xmm7, -2 * SIZE(BB)
-
+
addl $ 8 * SIZE, B
subl $-16 * SIZE, BB
decl %eax
@@ -242,7 +242,7 @@
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB /* because it's doubled */
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -264,7 +264,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -481,7 +481,7 @@
addsubpd %xmm5, %xmm4
addsubpd %xmm7, %xmm6
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm0, %xmm4
addpd %xmm1, %xmm6
@@ -527,7 +527,7 @@
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB /* because it's doubled */
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -543,7 +543,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
@@ -662,7 +662,7 @@
mulpd %xmm3, %xmm5
addsubpd %xmm5, %xmm4
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm0, %xmm4
#endif
diff --git a/kernel/x86/zgemm_kernel_2x1_sse2.S b/kernel/x86/zgemm_kernel_2x1_sse2.S
index 3ef96d1..54c205b 100644
--- a/kernel/x86/zgemm_kernel_2x1_sse2.S
+++ b/kernel/x86/zgemm_kernel_2x1_sse2.S
@@ -47,7 +47,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -239,7 +239,7 @@
movsd %xmm0, 0 + ALPHA_R
movsd %xmm0, 8 + ALPHA_R
-
+
movsd %xmm1, 8 + ALPHA_I
xorpd %xmm7, %xmm1
movsd %xmm1, 0 + ALPHA_I
@@ -264,7 +264,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -277,7 +277,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
movapd POSINV, %xmm7
@@ -299,7 +299,7 @@
unpcklpd %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorpd %xmm7, %xmm1
xorpd %xmm7, %xmm3
#else
@@ -323,7 +323,7 @@
unpcklpd %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorpd %xmm7, %xmm1
xorpd %xmm7, %xmm3
#else
@@ -337,7 +337,7 @@
movapd %xmm3, 14 * SIZE(BB)
prefetcht0 104 * SIZE(B)
-
+
addl $ 8 * SIZE, B
addl $16 * SIZE, BB
decl %eax
@@ -359,7 +359,7 @@
unpcklpd %xmm1, %xmm1
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorpd %xmm7, %xmm1
#else
xorpd %xmm7, %xmm0
@@ -422,7 +422,7 @@
movapd 8 * SIZE(AA), %xmm1
pxor %xmm7, %xmm7
-#endif
+#endif
prefetchnta 3 * SIZE(%esi)
@@ -431,7 +431,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -445,7 +445,7 @@
NOBRANCH
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -542,7 +542,7 @@
.L11:
leal (BB, %eax, 4), BB
leal (AA, %eax, 4), AA
-
+
.L12:
#ifndef TRMMKERNEL
movl K, %eax
@@ -670,7 +670,7 @@
movapd 8 * SIZE(BB), %xmm2
pxor %xmm6, %xmm6
pxor %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
@@ -678,7 +678,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
diff --git a/kernel/x86/zgemm_kernel_2x2_barcelona.S b/kernel/x86/zgemm_kernel_2x2_barcelona.S
index 2ad6893..21f7469 100644
--- a/kernel/x86/zgemm_kernel_2x2_barcelona.S
+++ b/kernel/x86/zgemm_kernel_2x2_barcelona.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -270,7 +270,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -283,7 +283,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -306,7 +306,7 @@
pshufd $0xff, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -329,7 +329,7 @@
pshufd $0xff, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -367,7 +367,7 @@
pshufd $0xff, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -403,7 +403,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -422,7 +422,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -436,7 +436,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -526,7 +526,7 @@
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
ALIGN_4
-
+
.L15:
#ifndef TRMMKERNEL
movl K, %eax
@@ -641,7 +641,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -658,7 +658,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -776,7 +776,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#ifndef TRMMKERNEL
movl K, %eax
@@ -887,7 +887,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -909,7 +909,7 @@
pshufd $0xff, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -932,7 +932,7 @@
pshufd $0xff, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -967,7 +967,7 @@
pshufd $0x55, %xmm3, %xmm1
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
#else
xorps %xmm7, %xmm0
@@ -1002,7 +1002,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1021,7 +1021,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1098,7 +1098,7 @@
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#ifndef TRMMKERNEL
movl K, %eax
@@ -1196,7 +1196,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1212,7 +1212,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1289,7 +1289,7 @@
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#ifndef TRMMKERNEL
movl K, %eax
diff --git a/kernel/x86/zgemm_kernel_2x2_penryn.S b/kernel/x86/zgemm_kernel_2x2_penryn.S
index 715eb4d..f50117b 100644
--- a/kernel/x86/zgemm_kernel_2x2_penryn.S
+++ b/kernel/x86/zgemm_kernel_2x2_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -125,7 +125,7 @@
movl OFFSET, %eax
#ifndef LEFT
negl %eax
-#endif
+#endif
movl %eax, KK
#endif
@@ -144,7 +144,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl B, BX
@@ -168,7 +168,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movl BX, %eax
PREFETCHB -32 * SIZE(%eax)
@@ -192,7 +192,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -331,11 +331,11 @@
movaps -32 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -32 * SIZE(AA), %xmm0
-
+
decl %eax
jne .L12
ALIGN_4
-
+
.L15:
#ifndef TRMMKERNEL
movl K, %eax
@@ -449,7 +449,7 @@
decl %ebx
jg .L10
ALIGN_4
-
+
.L20:
movl M, %ebx
testl $1, %ebx
@@ -466,7 +466,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -483,7 +483,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -630,11 +630,11 @@
subl $-16 * SIZE, AA
subl $-32 * SIZE, BB
-
+
decl %eax
jne .L22
ALIGN_4
-
+
.L25:
#ifndef TRMMKERNEL
movl K, %eax
@@ -757,7 +757,7 @@
addl $2, KK
#endif
movl BB, B
-
+
leal (, LDC, 2), %eax
addl %eax, C
@@ -773,7 +773,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl C, C1
movl A, AA
@@ -795,7 +795,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 4), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -813,7 +813,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -901,11 +901,11 @@
subl $-32 * SIZE, AA
subl $-16 * SIZE, BB
-
+
decl %eax
jne .L32
ALIGN_4
-
+
.L35:
#ifndef TRMMKERNEL
movl K, %eax
@@ -997,7 +997,7 @@
decl %ebx
jg .L31
ALIGN_4
-
+
.L40:
movl M, %ebx
testl $1, %ebx
@@ -1014,7 +1014,7 @@
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1031,7 +1031,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1122,11 +1122,11 @@
subl $-16 * SIZE, AA
subl $-16 * SIZE, BB
-
+
decl %eax
jne .L42
ALIGN_4
-
+
.L45:
#ifndef TRMMKERNEL
movl K, %eax
diff --git a/kernel/x86/zgemm_kernel_2x2_sse.S b/kernel/x86/zgemm_kernel_2x2_sse.S
index fad42cc..c0fba78 100644
--- a/kernel/x86/zgemm_kernel_2x2_sse.S
+++ b/kernel/x86/zgemm_kernel_2x2_sse.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -219,7 +219,7 @@
addps %xmm1, %xmm7; \
movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
#endif
-
+
#ifdef PENTIUM4
#define KERNEL1(address) \
mulps %xmm0, %xmm2; \
@@ -409,7 +409,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -422,7 +422,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -445,7 +445,7 @@
shufps $0, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -469,7 +469,7 @@
shufps $0, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -512,7 +512,7 @@
shufps $0, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -547,7 +547,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -573,7 +573,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -589,7 +589,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -700,7 +700,7 @@
jne .L11
ALIGN_4
#endif
-
+
.L15:
#ifndef TRMMKERNEL
movl K, %eax
@@ -815,7 +815,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
-#endif
+#endif
#ifdef movsd
xorps %xmm0, %xmm0
@@ -837,7 +837,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -957,7 +957,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#ifndef TRMMKERNEL
movl K, %eax
@@ -1074,7 +1074,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -1097,7 +1097,7 @@
shufps $0, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -1121,7 +1121,7 @@
shufps $0, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -1157,7 +1157,7 @@
shufps $0, %xmm1, %xmm1
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
#else
xorps %xmm7, %xmm0
@@ -1192,7 +1192,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1217,7 +1217,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1294,7 +1294,7 @@
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#ifndef TRMMKERNEL
movl K, %eax
@@ -1392,7 +1392,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -1408,7 +1408,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1485,7 +1485,7 @@
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#ifndef TRMMKERNEL
movl K, %eax
diff --git a/kernel/x86/zgemm_kernel_2x2_sse3.S b/kernel/x86/zgemm_kernel_2x2_sse3.S
index 23afa8f..4bca5ff 100644
--- a/kernel/x86/zgemm_kernel_2x2_sse3.S
+++ b/kernel/x86/zgemm_kernel_2x2_sse3.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -268,7 +268,7 @@
movss %xmm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
sall $ZBASE_SHIFT, LDC
@@ -281,7 +281,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -360,7 +360,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -379,7 +379,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -395,7 +395,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -588,7 +588,7 @@
jne .L11
ALIGN_4
#endif
-
+
.L15:
#ifndef TRMMKERNEL
movl K, %eax
@@ -714,7 +714,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -728,7 +728,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -822,7 +822,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#ifndef TRMMKERNEL
movl K, %eax
@@ -859,12 +859,12 @@
movhlps %xmm6, %xmm5
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
- defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
cmpeqps %xmm7, %xmm7
pslld $31, %xmm7
xorps %xmm7, %xmm5
-#endif
-
+#endif
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
shufps $0xb1, %xmm5, %xmm5
@@ -934,7 +934,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
@@ -1009,7 +1009,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1029,7 +1029,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1107,7 +1107,7 @@
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#ifndef TRMMKERNEL
movl K, %eax
@@ -1208,7 +1208,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 2), BB
-#endif
+#endif
movddup 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -1222,7 +1222,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1284,7 +1284,7 @@
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#ifndef TRMMKERNEL
movl K, %eax
@@ -1317,12 +1317,12 @@
movhlps %xmm4, %xmm5
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
- defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
cmpeqps %xmm7, %xmm7
pslld $31, %xmm7
xorps %xmm7, %xmm5
-#endif
-
+#endif
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
shufps $0xb1, %xmm5, %xmm5
diff --git a/kernel/x86/zgemm_kernel_4x1_core2.S b/kernel/x86/zgemm_kernel_4x1_core2.S
index ca232e4..05c2f02 100644
--- a/kernel/x86/zgemm_kernel_4x1_core2.S
+++ b/kernel/x86/zgemm_kernel_4x1_core2.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -111,7 +111,7 @@
addl $STACK_OFFSET, %esp
STACK_TOUCHING
-
+
movd STACK_M, %mm0
movl STACK_N, %eax
movd STACK_K, %mm1
@@ -134,7 +134,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
subl $-32 * SIZE, A
@@ -166,7 +166,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
movl K, %eax
sarl $2, %eax
@@ -250,7 +250,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 4), AA
leal (BB, %eax, 4), BB /* because it's doubled */
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -266,7 +266,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -395,11 +395,11 @@
ADDSUB %xmm1, %xmm7
movaps -32 * SIZE(BB), %xmm1
-
+
decl %eax
jne .L12
ALIGN_4
-
+
.L15:
#ifndef TRMMKERNEL
movl K, %eax
@@ -502,7 +502,7 @@
decl %ebx
jg .L10
ALIGN_2
-
+
.L20:
movl M, %ebx
testl $2, %ebx
@@ -520,7 +520,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB /* because it's doubled */
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -536,7 +536,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -695,7 +695,7 @@
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB /* because it's doubled */
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -712,7 +712,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
@@ -860,7 +860,7 @@
.L999:
movl OLD_STACK, %esp
-
+
EMMS
popl %ebx
diff --git a/kernel/x86/zgemm_kernel_4x1_sse.S b/kernel/x86/zgemm_kernel_4x1_sse.S
index 6c51463..685e5d3 100644
--- a/kernel/x86/zgemm_kernel_4x1_sse.S
+++ b/kernel/x86/zgemm_kernel_4x1_sse.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -243,7 +243,7 @@
movd %mm4, KK
#ifndef LEFT
negl KK
-#endif
+#endif
#endif
leal (, LDC, SIZE * 2), LDC
@@ -292,7 +292,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
movaps POSINV, %xmm7
@@ -313,7 +313,7 @@
shufps $0, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -337,7 +337,7 @@
shufps $0, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
xorps %xmm7, %xmm3
#else
@@ -371,7 +371,7 @@
shufps $0, %xmm1, %xmm1
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm1
#else
xorps %xmm7, %xmm0
@@ -429,7 +429,7 @@
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
prefetchnta 8 * SIZE(%esi)
@@ -438,7 +438,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -451,7 +451,7 @@
andl $-8, %eax
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -582,14 +582,14 @@
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -764,7 +764,7 @@
decl %eax
jne .L11
#endif
-
+
.L12:
#ifndef TRMMKERNEL
movl K, %eax
@@ -859,7 +859,7 @@
decl %ebx # i --
jg .L10
ALIGN_2
-
+
.L50:
movl M, %ebx
testl $2, %ebx
@@ -899,14 +899,14 @@
movaps 16 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1009,14 +1009,14 @@
movaps 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
@@ -1085,7 +1085,7 @@
decl %eax
jne .L51
#endif
-
+
.L52:
#ifndef TRMMKERNEL
movl K, %eax
@@ -1208,14 +1208,14 @@
movsd 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
@@ -1327,7 +1327,7 @@
movsd 8 * SIZE(AA), %xmm1
xorps %xmm7, %xmm7
-#endif
+#endif
#ifndef TRMMKERNEL
@@ -1335,7 +1335,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
- movl %eax, KKK
+ movl %eax, KKK
#else
movl KK, %eax
addl $1, %eax
@@ -1409,7 +1409,7 @@
jne .L71
ALIGN_2
#endif
-
+
.L72:
#ifndef TRMMKERNEL
movl K, %eax
@@ -1496,7 +1496,7 @@
.L999:
movl OLD_STACK, %esp
-
+
EMMS
popl %ebx
diff --git a/kernel/x86/zgemm_ncopy_2.S b/kernel/x86/zgemm_ncopy_2.S
index bc80b47..ad5ffbe 100644
--- a/kernel/x86/zgemm_ncopy_2.S
+++ b/kernel/x86/zgemm_ncopy_2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 8
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_N 8 + STACK + ARGS(%esp)
#define STACK_A 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86/zgemm_tcopy_2.S b/kernel/x86/zgemm_tcopy_2.S
index f9a601d..1598e9f 100644
--- a/kernel/x86/zgemm_tcopy_2.S
+++ b/kernel/x86/zgemm_tcopy_2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 8
-
+
#define J 0 + STACK(%esp)
#define BOFFSET2 4 + STACK(%esp)
@@ -60,7 +60,7 @@
pushl %ebx
PROFCODE
-
+
#define A %ebp
#define A1 %edx
#define LDA %ecx
diff --git a/kernel/x86/zgemv_n.S b/kernel/x86/zgemv_n.S
index a3c9174..4a411cc 100644
--- a/kernel/x86/zgemv_n.S
+++ b/kernel/x86/zgemv_n.S
@@ -46,14 +46,14 @@
#if defined(PENTIUM4) || defined(ATHLON)
#define P ((DTB_DEFAULT_ENTRIES) >> 1)
#endif
-
+
#ifndef P
#define P DTB_DEFAULT_ENTRIES
#endif
#define STACK 16
#define ARGS 16
-
+
#define PLDA_M 0 + STACK(%esp)
#define XP 4 + STACK(%esp)
#define MIN_N 8 + STACK(%esp)
diff --git a/kernel/x86/zgemv_n_atom.S b/kernel/x86/zgemv_n_atom.S
index 3dba030..36e82f7 100644
--- a/kernel/x86/zgemv_n_atom.S
+++ b/kernel/x86/zgemv_n_atom.S
@@ -58,7 +58,7 @@
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
-
+
#define I %eax
#define J %ebx
@@ -122,7 +122,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl N, J
pxor %xmm7, %xmm7
@@ -538,7 +538,7 @@
.L999:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
ret
diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S
index b0f686a..7bf41bb 100644
--- a/kernel/x86/zgemv_n_sse.S
+++ b/kernel/x86/zgemv_n_sse.S
@@ -106,7 +106,7 @@
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
-
+
#define I %eax
#define J %ebx
@@ -159,7 +159,7 @@
.L00t:
movl AA,%eax
movl %eax,A
-
+
movl YY,J
movl J,Y
@@ -178,7 +178,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl N, J
xorps %xmm7, %xmm7
@@ -640,7 +640,7 @@
.L999x:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
addl $ARGS,%esp
ret
diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S
index bb33d26..fd01e2a 100644
--- a/kernel/x86/zgemv_n_sse2.S
+++ b/kernel/x86/zgemv_n_sse2.S
@@ -93,7 +93,7 @@
#define YY 4 + ARGS(%esp)
#define AA 8 + ARGS(%esp)
-
+
#define I %eax
#define J %ebx
@@ -165,7 +165,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl N, J
pxor %xmm7, %xmm7
@@ -202,7 +202,7 @@
pcmpeqb %xmm5, %xmm5
psllq $63, %xmm5
- shufps $0xc0, %xmm5, %xmm5
+ shufps $0xc0, %xmm5, %xmm5
pshufd $0x4e, %xmm6, %xmm7
@@ -503,7 +503,7 @@
.L999x:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
addl $ARGS,%esp
ret
diff --git a/kernel/x86/zgemv_t.S b/kernel/x86/zgemv_t.S
index 452794c..83b602d 100644
--- a/kernel/x86/zgemv_t.S
+++ b/kernel/x86/zgemv_t.S
@@ -49,7 +49,7 @@
#define STACK 16
#define ARGS 24
-
+
#define NLDA 0 + STACK(%esp)
#define XP 4 + STACK(%esp)
#define MIN_M 8 + STACK(%esp)
diff --git a/kernel/x86/zgemv_t_atom.S b/kernel/x86/zgemv_t_atom.S
index 6f0dee0..444f9ac 100644
--- a/kernel/x86/zgemv_t_atom.S
+++ b/kernel/x86/zgemv_t_atom.S
@@ -58,7 +58,7 @@
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
-
+
#define I %eax
#define J %ebx
@@ -124,7 +124,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl M, I
sarl $2, I
jle .L05
@@ -180,7 +180,7 @@
movl N, J
ALIGN_3
-.L11:
+.L11:
movl BUFFER, X
addl $16 * SIZE, X
@@ -434,11 +434,11 @@
decl J
jg .L11
ALIGN_4
-
+
.L999:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
ret
diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S
index a7a7abd..fc955e2 100644
--- a/kernel/x86/zgemv_t_sse.S
+++ b/kernel/x86/zgemv_t_sse.S
@@ -106,7 +106,7 @@
#define MMM 0+ARGS(%esp)
#define XX 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
-
+
#define I %eax
#define J %ebx
@@ -180,7 +180,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl M, I
sarl $2, I
jle .L05
@@ -239,7 +239,7 @@
movl N, J
ALIGN_3
-.L11:
+.L11:
movl BUFFER, X
addl $32 * SIZE, X
@@ -473,7 +473,7 @@
mulps %xmm2, %xmm5
SUBPS %xmm5, %xmm1
ALIGN_4
-
+
.L19:
#ifdef HAVE_SSE2
pcmpeqb %xmm5, %xmm5
@@ -486,7 +486,7 @@
addl $8, %esp
movlhps %xmm5, %xmm5
#endif
-
+
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
xorps %xmm5, %xmm0
#else
@@ -529,7 +529,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
@@ -544,7 +544,7 @@
decl J
jg .L11
ALIGN_4
-
+
.L999:
movl M,%eax
sall $ZBASE_SHIFT, %eax
@@ -558,7 +558,7 @@
.L999x:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
addl $ARGS,%esp
diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S
index 86f5976..b58f698 100644
--- a/kernel/x86/zgemv_t_sse2.S
+++ b/kernel/x86/zgemv_t_sse2.S
@@ -167,7 +167,7 @@
jle .L999
movl BUFFER, Y1
-
+
movl M, I
sarl $2, I
jle .L05
@@ -223,7 +223,7 @@
movl N, J
ALIGN_4
-.L11:
+.L11:
movl BUFFER, X
addl $16 * SIZE, X
@@ -377,7 +377,7 @@
pcmpeqb %xmm5, %xmm5
psllq $63, %xmm5
shufps $0xc0, %xmm5, %xmm5
-
+
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
xorpd %xmm5, %xmm0
#else
@@ -426,7 +426,7 @@
decl J
jg .L11
ALIGN_4
-
+
.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
@@ -440,7 +440,7 @@
.L999x:
popl %ebx
popl %esi
- popl %edi
+ popl %edi
popl %ebp
addl $ARGS,%esp
ret
diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S
index c645b57..263612e 100644
--- a/kernel/x86/znrm2.S
+++ b/kernel/x86/znrm2.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -49,7 +49,7 @@
#define M %edx
#define X %ecx
#define INCX %esi
-
+
#define I %eax
#include "l1param.h"
@@ -91,7 +91,7 @@
sarl $2, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -156,7 +156,7 @@
sarl $2, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
fmul %st(0), %st
diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S
index 95ca9fd..bbc3677 100644
--- a/kernel/x86/znrm2_sse.S
+++ b/kernel/x86/znrm2_sse.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 8
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -49,7 +49,7 @@
#define M %edx
#define X %ecx
#define INCX %esi
-
+
#define I %eax
#include "l1param.h"
@@ -82,7 +82,7 @@
testl $SIZE, X
je .L05
-
+
movss -32 * SIZE(X), %xmm0
cvtss2sd %xmm0, %xmm0
mulsd %xmm0, %xmm0
@@ -96,7 +96,7 @@
movl M, I
sarl $4, I
jle .L13
-
+
movsd -32 * SIZE(X), %xmm4
movsd -30 * SIZE(X), %xmm5
movsd -28 * SIZE(X), %xmm6
@@ -269,7 +269,7 @@
movl M, I
sarl $3, I
jle .L43
-
+
movsd (X), %xmm4
addl INCX, X
movsd (X), %xmm5
diff --git a/kernel/x86/zrot.S b/kernel/x86/zrot.S
index 7ac984e..93f86c8 100644
--- a/kernel/x86/zrot.S
+++ b/kernel/x86/zrot.S
@@ -38,10 +38,10 @@
#define ASSEMBLER
#include "common.h"
-
+
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
@@ -110,7 +110,7 @@
sarl $1, I
jle .L15
ALIGN_4
-
+
.L10:
#ifdef PENTIUM4
PREFETCH (PREFETCH_SIZE + 0) * SIZE(X)
@@ -261,7 +261,7 @@
sarl $1, I
jle .L55
ALIGN_4
-
+
.L51:
FLD 0 * SIZE(X)
FLD 0 * SIZE(Y)
diff --git a/kernel/x86/zrot_sse.S b/kernel/x86/zrot_sse.S
index d10183f..9c2fa4f 100644
--- a/kernel/x86/zrot_sse.S
+++ b/kernel/x86/zrot_sse.S
@@ -1285,12 +1285,12 @@
.L50:
movl N, I
-//if incx ==0 || incy==0 jump to the tail
+//if incx ==0 || incy==0 jump to the tail
cmpl $0, INCX
je .L56
cmpl $0, INCY
je .L56
-
+
sarl $2, I
jle .L55
ALIGN_3
diff --git a/kernel/x86/zrot_sse2.S b/kernel/x86/zrot_sse2.S
index 7787f45..0bab351 100644
--- a/kernel/x86/zrot_sse2.S
+++ b/kernel/x86/zrot_sse2.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S
index 7505cea..1eb5185 100644
--- a/kernel/x86/zscal.S
+++ b/kernel/x86/zscal.S
@@ -40,7 +40,7 @@
#include "common.h"
#define STACK 8
-
+
#define STACK_N 4 + STACK(%esp)
#ifdef XDOUBLE
#define ALPHA_R 16 + STACK(%esp)
diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S
index 53abb69..e011c98 100644
--- a/kernel/x86/zscal_sse.S
+++ b/kernel/x86/zscal_sse.S
@@ -1073,7 +1073,7 @@
#else
-
+
PSHUFD2($0, %xmm0, %xmm6)
PSHUFD2($0, %xmm1, %xmm1)
subps %xmm1, %xmm7
diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S
index 26ef693..cc7ab66 100644
--- a/kernel/x86/zscal_sse2.S
+++ b/kernel/x86/zscal_sse2.S
@@ -73,7 +73,7 @@
#define xmm14 xmm6
#define xmm15 xmm7
-
+
PROLOGUE
PROFCODE
@@ -94,7 +94,7 @@
testl M, M
jle .L999
-
+
xorps %xmm7, %xmm7
comisd %xmm0, %xmm7
jne .L100
@@ -193,7 +193,7 @@
jle .L22
ALIGN_4
-.L21:
+.L21:
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
@@ -294,10 +294,10 @@
#else
pshufd $0x44, %xmm0, %xmm6
#endif
-
+
xorps %xmm7, %xmm7
subsd %xmm1, %xmm7
- movlhps %xmm1, %xmm7
+ movlhps %xmm1, %xmm7
cmpl $2 * SIZE, INCX
jne .L120
@@ -869,7 +869,7 @@
#endif
pxor %xmm7, %xmm7
subsd %xmm1, %xmm7
- movlhps %xmm1, %xmm7
+ movlhps %xmm1, %xmm7
shufpd $1, %xmm7, %xmm7
movhps 0 * SIZE(X), %xmm0
@@ -1150,7 +1150,7 @@
#endif
pxor %xmm7, %xmm7
subsd %xmm1, %xmm7
- movlhps %xmm1, %xmm7
+ movlhps %xmm1, %xmm7
subl $-16 * SIZE, X
@@ -1427,7 +1427,7 @@
#endif
pxor %xmm7, %xmm7
subsd %xmm1, %xmm7
- movlhps %xmm1, %xmm7
+ movlhps %xmm1, %xmm7
movl X, XX
diff --git a/kernel/x86/zswap.S b/kernel/x86/zswap.S
index ca4660f..620a009 100644
--- a/kernel/x86/zswap.S
+++ b/kernel/x86/zswap.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define N 4 + STACK + ARGS(%esp)
#ifdef XDOUBLE
#define X 48 + STACK + ARGS(%esp)
diff --git a/kernel/x86/zswap_sse.S b/kernel/x86/zswap_sse.S
index 24d0001..479d946 100644
--- a/kernel/x86/zswap_sse.S
+++ b/kernel/x86/zswap_sse.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 24 + STACK + ARGS(%esp)
#define STACK_INCX 28 + STACK + ARGS(%esp)
@@ -85,7 +85,7 @@
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
-
+
cmpl $3, M
jle .L16
@@ -307,7 +307,7 @@
.L20:
movaps -33 * SIZE(X), %xmm0
movaps -32 * SIZE(Y), %xmm1
-
+
movss %xmm1, -32 * SIZE(X)
PSHUFD2($0x39, %xmm1, %xmm3)
movlps %xmm3, -31 * SIZE(X)
@@ -783,7 +783,7 @@
.L40:
movaps -35 * SIZE(X), %xmm0
movaps -32 * SIZE(Y), %xmm1
-
+
movss %xmm1, -32 * SIZE(X)
subl $3, M
diff --git a/kernel/x86/zswap_sse2.S b/kernel/x86/zswap_sse2.S
index d900ea5..cc012b3 100644
--- a/kernel/x86/zswap_sse2.S
+++ b/kernel/x86/zswap_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 32 + STACK + ARGS(%esp)
#define STACK_INCX 36 + STACK + ARGS(%esp)
diff --git a/kernel/x86/ztrsm_kernel_LN_2x1_core2.S b/kernel/x86/ztrsm_kernel_LN_2x1_core2.S
index 1d3107a..1a6f8c0 100644
--- a/kernel/x86/ztrsm_kernel_LN_2x1_core2.S
+++ b/kernel/x86/ztrsm_kernel_LN_2x1_core2.S
@@ -47,7 +47,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -166,7 +166,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -185,7 +185,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal 16 * SIZE + BUFFER, BB
@@ -201,7 +201,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -312,7 +312,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -600,7 +600,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -744,7 +744,7 @@
decl %eax
jne .L12
ALIGN_4
-
+
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
diff --git a/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S b/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S
index 7aef336..029a2f5 100644
--- a/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S
+++ b/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S
@@ -47,7 +47,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -276,7 +276,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -295,7 +295,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -311,7 +311,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -357,7 +357,7 @@
movapd %xmm7, 14 * SIZE(BB)
prefetcht0 104 * SIZE(B)
-
+
addl $ 8 * SIZE, B
addl $16 * SIZE, BB
decl %eax
@@ -436,7 +436,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm1
pxor %xmm4, %xmm4
@@ -713,7 +713,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -740,7 +740,7 @@
NOBRANCH
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -837,7 +837,7 @@
.L11:
leal (BB, %eax, 4), BB
leal (AA, %eax, 4), AA
-
+
.L12:
#if defined(LT) || defined(RN)
movl KK, %eax
diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
index 6d98805..da561b5 100644
--- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -55,7 +55,7 @@
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
-
+
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
@@ -101,12 +101,12 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
movl M, %ebx
testl %ebx, %ebx
- jle .L999
+ jle .L999
subl $-32 * SIZE, A
subl $-32 * SIZE, B
@@ -134,7 +134,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -177,7 +177,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -207,7 +207,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -365,7 +365,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -438,7 +438,7 @@
pxor %xmm0, %xmm7
#endif
#endif
-
+
addps %xmm5, %xmm4
addps %xmm7, %xmm6
@@ -662,7 +662,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -821,11 +821,11 @@
movaps -32 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -32 * SIZE(AA), %xmm0
-
+
decl %eax
jne .L11
ALIGN_4
-
+
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1258,7 +1258,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1288,7 +1288,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1387,11 +1387,11 @@
subl $-16 * SIZE, AA
subl $-16 * SIZE, BB
-
+
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1449,7 +1449,7 @@
pxor %xmm0, %xmm5
#endif
#endif
-
+
addps %xmm5, %xmm4
#if defined(LN) || defined(LT)
@@ -1570,7 +1570,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1671,11 +1671,11 @@
subl $-32 * SIZE, AA
subl $-16 * SIZE, BB
-
+
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1737,7 +1737,7 @@
pxor %xmm0, %xmm5
#endif
#endif
-
+
addps %xmm5, %xmm4
#if defined(LN) || defined(LT)
diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S
index b397813..61ce10d 100644
--- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S
+++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -231,7 +231,7 @@
addl $STACK_OFFSET, %esp
STACK_TOUCHING
-
+
movl STACK_M, %ebx
movl STACK_N, %eax
movl STACK_K, %ecx
@@ -295,7 +295,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -314,7 +314,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -330,7 +330,7 @@
sall $1 + ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -454,7 +454,7 @@
movl KK, %eax
sall $3 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -588,7 +588,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -903,7 +903,7 @@
movl KK, %eax
sall $3 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -942,7 +942,7 @@
decl %eax
jne .L11
ALIGN_4
-
+
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1373,7 +1373,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1389,7 +1389,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -1510,7 +1510,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
#ifdef movsd
xorps %xmm0, %xmm0
@@ -1600,7 +1600,7 @@
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1797,7 +1797,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1884,7 +1884,7 @@
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#if defined(LT) || defined(RN)
movl KK, %eax
diff --git a/kernel/x86/ztrsm_kernel_LN_4x1_sse.S b/kernel/x86/ztrsm_kernel_LN_4x1_sse.S
index 877a3ba..15a53f5 100644
--- a/kernel/x86/ztrsm_kernel_LN_4x1_sse.S
+++ b/kernel/x86/ztrsm_kernel_LN_4x1_sse.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -160,7 +160,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -178,7 +178,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -194,7 +194,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -317,7 +317,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -405,7 +405,7 @@
decl %eax
jne .L71
ALIGN_2
-
+
.L72:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -576,7 +576,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -652,7 +652,7 @@
decl %eax
jne .L51
ALIGN_4
-
+
.L52:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -990,7 +990,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1171,7 +1171,7 @@
addl $64 * SIZE, AA
decl %eax
jne .L11
-
+
.L12:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1852,7 +1852,7 @@
decl %ebx # i --
jg .L10
ALIGN_2
-
+
.L99:
#ifdef LN
movl K, %eax
@@ -1881,7 +1881,7 @@
.L999:
movl OLD_STACK, %esp
-
+
EMMS
popl %ebx
diff --git a/kernel/x86/ztrsm_kernel_LT_1x1.S b/kernel/x86/ztrsm_kernel_LT_1x1.S
index 5b13a54..c093806 100644
--- a/kernel/x86/ztrsm_kernel_LT_1x1.S
+++ b/kernel/x86/ztrsm_kernel_LT_1x1.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define KK 0 + STACK(%esp)
#define KKK 4 + STACK(%esp)
#define AORIG 8 + STACK(%esp)
@@ -112,7 +112,7 @@
movl OFFSET, %eax
negl %eax
movl %eax, KK
-#endif
+#endif
#ifdef RT
movl STACK_N, %eax
@@ -154,7 +154,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -178,7 +178,7 @@
addl %eax, B
#else
movl STACK_B, B
-#endif
+#endif
fldz
fldz
diff --git a/kernel/x86/ztrsm_kernel_LT_1x1_atom.S b/kernel/x86/ztrsm_kernel_LT_1x1_atom.S
index bc0d03e..e2a5278 100644
--- a/kernel/x86/ztrsm_kernel_LT_1x1_atom.S
+++ b/kernel/x86/ztrsm_kernel_LT_1x1_atom.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -57,7 +57,7 @@
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
-
+
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 8 + 3)
@@ -101,7 +101,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
sall $ZBASE_SHIFT, LDC
@@ -163,7 +163,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -195,7 +195,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm2, %xmm2
@@ -416,7 +416,7 @@
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
index 452e3bf..a11b028 100644
--- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -57,7 +57,7 @@
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
-
+
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
@@ -98,12 +98,12 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
movl M, %ebx
testl %ebx, %ebx
- jle .L999
+ jle .L999
subl $-16 * SIZE, A
subl $-16 * SIZE, B
@@ -169,7 +169,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -199,7 +199,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -598,7 +598,7 @@
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
@@ -653,7 +653,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -683,7 +683,7 @@ L110:
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -955,7 +955,7 @@ L118:
#ifdef RT
subl $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L999:
popl %ebx
diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S b/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S
index fdeecc7..dfa5a55 100644
--- a/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S
+++ b/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -227,7 +227,7 @@
addl $STACK_OFFSET, %esp
STACK_TOUCHING
-
+
movl STACK_M, %ebx
movl STACK_N, %eax
movl STACK_K, %ecx
@@ -279,7 +279,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -298,7 +298,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -314,7 +314,7 @@
sall $1 + ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -443,7 +443,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -473,7 +473,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -851,7 +851,7 @@
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
@@ -890,7 +890,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -906,7 +906,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -1005,7 +1005,7 @@
#endif
movl M, %ebx
- testl %ebx, %ebx
+ testl %ebx, %ebx
jle .L199
ALIGN_4
@@ -1031,7 +1031,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -1289,7 +1289,7 @@
decl %ebx # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L199:
#ifdef LN
@@ -1318,7 +1318,7 @@
movl OLD_STACK, %esp
EMMS
-
+
popl %ebx
popl %esi
popl %edi
diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S b/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S
index 29103ba..9ab1b9d 100644
--- a/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S
+++ b/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -57,7 +57,7 @@
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
-
+
#ifdef PENTIUM4
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
@@ -218,7 +218,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
sall $ZBASE_SHIFT, LDC
@@ -282,7 +282,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -314,7 +314,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -606,7 +606,7 @@
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
@@ -661,7 +661,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -669,7 +669,7 @@
#endif
movl M, %ebx
- testl %ebx, %ebx
+ testl %ebx, %ebx
jle .L500
ALIGN_4
@@ -693,7 +693,7 @@ L110:
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -951,7 +951,7 @@ L114:
#ifdef RT
subl $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L500:
popl %ebx
diff --git a/kernel/x86/ztrsm_kernel_LT_2x1_core2.S b/kernel/x86/ztrsm_kernel_LT_2x1_core2.S
index 4674654..d971aeb 100644
--- a/kernel/x86/ztrsm_kernel_LT_2x1_core2.S
+++ b/kernel/x86/ztrsm_kernel_LT_2x1_core2.S
@@ -47,7 +47,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -114,7 +114,7 @@
addl $STACK_OFFSET, %esp
STACK_TOUCHING
-
+
movd STACK_M, %mm0
movl STACK_N, %eax
movd STACK_K, %mm1
@@ -166,7 +166,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -185,7 +185,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal 16 * SIZE + BUFFER, BB
@@ -201,7 +201,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -314,7 +314,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -458,7 +458,7 @@
decl %eax
jne .L12
ALIGN_4
-
+
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -757,7 +757,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd -16 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
diff --git a/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S b/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S
index 77f3026..9c25dc0 100644
--- a/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S
+++ b/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S
@@ -47,7 +47,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -276,7 +276,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -295,7 +295,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -311,7 +311,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -357,7 +357,7 @@
movapd %xmm7, 14 * SIZE(BB)
prefetcht0 104 * SIZE(B)
-
+
addl $ 8 * SIZE, B
addl $16 * SIZE, BB
decl %eax
@@ -438,7 +438,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm2
pxor %xmm4, %xmm4
@@ -465,7 +465,7 @@
NOBRANCH
je .L12
sall $3, %eax
-
+
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
@@ -562,7 +562,7 @@
.L11:
leal (BB, %eax, 4), BB
leal (AA, %eax, 4), AA
-
+
.L12:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -876,7 +876,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(BB), %xmm1
pxor %xmm4, %xmm4
diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
index 64232fd..787ab59 100644
--- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -55,7 +55,7 @@
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
-
+
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
@@ -101,12 +101,12 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
movl M, %ebx
testl %ebx, %ebx
- jle .L999
+ jle .L999
subl $-32 * SIZE, A
subl $-32 * SIZE, B
@@ -134,7 +134,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -177,7 +177,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -209,7 +209,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -368,11 +368,11 @@
movaps -32 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -32 * SIZE(AA), %xmm0
-
+
decl %eax
jne .L11
ALIGN_4
-
+
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -775,7 +775,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -933,7 +933,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1006,7 +1006,7 @@
pxor %xmm0, %xmm7
#endif
#endif
-
+
addps %xmm5, %xmm4
addps %xmm7, %xmm6
@@ -1258,7 +1258,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -1290,7 +1290,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1391,11 +1391,11 @@
subl $-32 * SIZE, AA
subl $-16 * SIZE, BB
-
+
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1457,7 +1457,7 @@
pxor %xmm0, %xmm5
#endif
#endif
-
+
addps %xmm5, %xmm4
#if defined(LN) || defined(LT)
@@ -1678,7 +1678,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1777,11 +1777,11 @@
subl $-16 * SIZE, AA
subl $-16 * SIZE, BB
-
+
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1839,7 +1839,7 @@
pxor %xmm0, %xmm5
#endif
#endif
-
+
addps %xmm5, %xmm4
#if defined(LN) || defined(LT)
diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S
index 5ff9393..a4c2ab7 100644
--- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S
+++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -295,7 +295,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -314,7 +314,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -330,7 +330,7 @@
sall $1 + ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -454,7 +454,7 @@
movl KK, %eax
sall $3 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -493,7 +493,7 @@
decl %eax
jne .L11
ALIGN_4
-
+
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -915,7 +915,7 @@
movl KK, %eax
sall $3 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1049,7 +1049,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1373,7 +1373,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1389,7 +1389,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -1512,7 +1512,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1599,7 +1599,7 @@
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1906,7 +1906,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
#ifdef movsd
xorps %xmm0, %xmm0
@@ -1996,7 +1996,7 @@
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#if defined(LT) || defined(RN)
movl KK, %eax
diff --git a/kernel/x86/ztrsm_kernel_LT_4x1_sse.S b/kernel/x86/ztrsm_kernel_LT_4x1_sse.S
index 4f324bc..57b2133 100644
--- a/kernel/x86/ztrsm_kernel_LT_4x1_sse.S
+++ b/kernel/x86/ztrsm_kernel_LT_4x1_sse.S
@@ -45,7 +45,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -160,7 +160,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -178,7 +178,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -194,7 +194,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -319,7 +319,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -500,7 +500,7 @@
addl $64 * SIZE, AA
decl %eax
jne .L11
-
+
.L12:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1184,7 +1184,7 @@
decl %ebx # i --
jg .L10
ALIGN_2
-
+
.L50:
movl M, %ebx
testl $2, %ebx
@@ -1211,7 +1211,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
xorps %xmm4, %xmm4
@@ -1287,7 +1287,7 @@
decl %eax
jne .L51
ALIGN_4
-
+
.L52:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1623,7 +1623,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(BB), %xmm2
@@ -1713,7 +1713,7 @@
decl %eax
jne .L71
ALIGN_2
-
+
.L72:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1886,7 +1886,7 @@
.L999:
movl OLD_STACK, %esp
-
+
EMMS
popl %ebx
diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
index 71246d7..9a3b0cb 100644
--- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -57,7 +57,7 @@
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
-
+
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
@@ -98,12 +98,12 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
movl M, %ebx
testl %ebx, %ebx
- jle .L999
+ jle .L999
subl $-16 * SIZE, A
subl $-16 * SIZE, B
@@ -164,7 +164,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -194,7 +194,7 @@ L110:
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -466,7 +466,7 @@ L118:
#ifdef RT
subl $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L100:
movl N, %eax
@@ -503,7 +503,7 @@ L118:
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -533,7 +533,7 @@ L118:
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -16 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -932,7 +932,7 @@ L118:
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S b/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S
index 8824868..108d4be 100644
--- a/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S
+++ b/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -277,7 +277,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -295,7 +295,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -311,7 +311,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -410,7 +410,7 @@
#endif
movl M, %ebx
- testl %ebx, %ebx
+ testl %ebx, %ebx
jle .L199
ALIGN_4
@@ -436,7 +436,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
@@ -694,7 +694,7 @@
decl %ebx # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L199:
#ifdef LN
@@ -731,7 +731,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, BB
@@ -747,7 +747,7 @@
sall $1 + ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 2), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -876,7 +876,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
leal (BB, %eax, 2), BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -906,7 +906,7 @@
andl $-8, %eax
sall $4, %eax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -1284,7 +1284,7 @@
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S b/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S
index 8b7bf6b..7f7e4d3 100644
--- a/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S
+++ b/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -57,7 +57,7 @@
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
-
+
#ifdef PENTIUM4
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
@@ -218,7 +218,7 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
sall $ZBASE_SHIFT, LDC
@@ -277,7 +277,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -285,7 +285,7 @@
#endif
movl M, %ebx
- testl %ebx, %ebx
+ testl %ebx, %ebx
jle .L500
ALIGN_4
@@ -309,7 +309,7 @@ L110:
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -567,7 +567,7 @@ L114:
#ifdef RT
subl $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L100:
movl N, %eax
@@ -604,7 +604,7 @@ L114:
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -636,7 +636,7 @@ L114:
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movapd 0 * SIZE(AA), %xmm0
pxor %xmm4, %xmm4
@@ -928,7 +928,7 @@ L114:
decl %ebx # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
index dfd555c..bd7a78b 100644
--- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
+++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 16
-
+
#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
@@ -55,7 +55,7 @@
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
-
+
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
@@ -101,12 +101,12 @@
movl OFFSET, %eax
#ifdef RN
negl %eax
-#endif
+#endif
movl %eax, KK
movl M, %ebx
testl %ebx, %ebx
- jle .L999
+ jle .L999
subl $-32 * SIZE, A
subl $-32 * SIZE, B
@@ -134,7 +134,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -172,7 +172,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -204,7 +204,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -305,11 +305,11 @@
subl $-32 * SIZE, AA
subl $-16 * SIZE, BB
-
+
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -371,7 +371,7 @@
pxor %xmm0, %xmm5
#endif
#endif
-
+
addps %xmm5, %xmm4
#if defined(LN) || defined(LT)
@@ -592,7 +592,7 @@
movl KK, %eax
sall $ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -691,11 +691,11 @@
subl $-16 * SIZE, AA
subl $-16 * SIZE, BB
-
+
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -753,7 +753,7 @@
pxor %xmm0, %xmm5
#endif
#endif
-
+
addps %xmm5, %xmm4
#if defined(LN) || defined(LT)
@@ -904,7 +904,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
#ifdef LT
movl OFFSET, %eax
@@ -936,7 +936,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1095,11 +1095,11 @@
movaps -32 * SIZE(BB), %xmm1
mulps %xmm0, %xmm2
movaps -32 * SIZE(AA), %xmm0
-
+
decl %eax
jne .L11
ALIGN_4
-
+
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1502,7 +1502,7 @@
movl KK, %eax
sall $1 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movsd -32 * SIZE(AA), %xmm0
pxor %xmm2, %xmm2
@@ -1660,7 +1660,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1733,7 +1733,7 @@
pxor %xmm0, %xmm7
#endif
#endif
-
+
addps %xmm5, %xmm4
addps %xmm7, %xmm6
diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S
index 9249252..5cd0dd5 100644
--- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S
+++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S
@@ -41,7 +41,7 @@
#define STACK 16
#define ARGS 0
-
+
#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
@@ -295,7 +295,7 @@
#ifdef RN
negl KK
-#endif
+#endif
#ifdef RT
movl N, %eax
@@ -313,7 +313,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -329,7 +329,7 @@
sall $ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -452,7 +452,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -539,7 +539,7 @@
decl %eax
jne .L111
ALIGN_4
-
+
.L112:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -846,7 +846,7 @@
movl KK, %eax
sall $2 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
#ifdef movsd
xorps %xmm0, %xmm0
@@ -936,7 +936,7 @@
decl %eax
jne .L141
ALIGN_4
-
+
.L142:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1140,7 +1140,7 @@
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
-#endif
+#endif
leal BUFFER, %ecx
@@ -1156,7 +1156,7 @@
sall $1 + ZBASE_SHIFT, %eax
addl %eax, B
leal (BB, %eax, 4), BB
-#endif
+#endif
#if defined(LT)
movl OFFSET, %eax
@@ -1280,7 +1280,7 @@
movl KK, %eax
sall $3 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
@@ -1319,7 +1319,7 @@
decl %eax
jne .L11
ALIGN_4
-
+
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
@@ -1741,7 +1741,7 @@
movl KK, %eax
sall $3 + ZBASE_SHIFT, %eax
addl %eax, BB
-#endif
+#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
@@ -1875,7 +1875,7 @@
decl %eax
jne .L41
ALIGN_4
-
+
.L42:
#if defined(LT) || defined(RN)
movl KK, %eax
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index e6a9715..ec21826 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -119,7 +119,11 @@ XCOPYKERNEL = zcopy.S
endif
ifndef SDOTKERNEL
-SDOTKERNEL = ../arm/dot.c
+SDOTKERNEL = ../generic/dot.c
+endif
+
+ifndef DSDOTKERNEL
+DSDOTKERNEL = ../generic/dot.c
endif
ifndef DDOTKERNEL
diff --git a/kernel/x86_64/KERNEL.ATOM b/kernel/x86_64/KERNEL.ATOM
index cfbd05a..c248483 100644
--- a/kernel/x86_64/KERNEL.ATOM
+++ b/kernel/x86_64/KERNEL.ATOM
@@ -29,8 +29,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x2_atom.S
diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA
index 051a522..313c62d 100644
--- a/kernel/x86_64/KERNEL.BARCELONA
+++ b/kernel/x86_64/KERNEL.BARCELONA
@@ -1,13 +1,13 @@
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
diff --git a/kernel/x86_64/KERNEL.BOBCAT b/kernel/x86_64/KERNEL.BOBCAT
index 051a522..313c62d 100644
--- a/kernel/x86_64/KERNEL.BOBCAT
+++ b/kernel/x86_64/KERNEL.BOBCAT
@@ -1,13 +1,13 @@
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index 3803cd1..70370a7 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -1,9 +1,9 @@
+
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
-DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S
@@ -13,8 +13,8 @@ SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
diff --git a/kernel/x86_64/KERNEL.CORE2 b/kernel/x86_64/KERNEL.CORE2
index 8a07e80..867c941 100644
--- a/kernel/x86_64/KERNEL.CORE2
+++ b/kernel/x86_64/KERNEL.CORE2
@@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_core2.S
diff --git a/kernel/x86_64/KERNEL.DUNNINGTON b/kernel/x86_64/KERNEL.DUNNINGTON
index b96daa0..8c2a23c 100644
--- a/kernel/x86_64/KERNEL.DUNNINGTON
+++ b/kernel/x86_64/KERNEL.DUNNINGTON
@@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_penryn.S
diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index 4474810..ae316cf 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -1,3 +1,4 @@
+
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
@@ -9,12 +10,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x4_haswell.S
-DGEMMINCOPY =
-DGEMMITCOPY =
+DGEMMINCOPY =
+DGEMMITCOPY =
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ =
-DGEMMITCOPYOBJ =
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/x86_64/KERNEL.NANO b/kernel/x86_64/KERNEL.NANO
index 0b771a4..e30bd2b 100644
--- a/kernel/x86_64/KERNEL.NANO
+++ b/kernel/x86_64/KERNEL.NANO
@@ -3,8 +3,8 @@ SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_penryn.S
diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM
index 878e3cd..2f9c205 100644
--- a/kernel/x86_64/KERNEL.NEHALEM
+++ b/kernel/x86_64/KERNEL.NEHALEM
@@ -1,21 +1,22 @@
+
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-DGEMMKERNEL = gemm_kernel_4x4_core2.S
-DGEMMINCOPY =
-DGEMMITCOPY =
-DGEMMONCOPY = gemm_ncopy_4.S
-DGEMMOTCOPY = gemm_tcopy_4.S
-DGEMMINCOPYOBJ =
-DGEMMITCOPYOBJ =
+DGEMMKERNEL = gemm_kernel_2x8_nehalem.S
+DGEMMINCOPY = ../generic/gemm_ncopy_2.c
+DGEMMITCOPY = ../generic/gemm_tcopy_2.c
+DGEMMONCOPY = ../generic/gemm_ncopy_8.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
@@ -44,11 +45,10 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
-DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S
-DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S
-DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S
-DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S
-
+DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
+DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
+DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
+DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
diff --git a/kernel/x86_64/KERNEL.OPTERON b/kernel/x86_64/KERNEL.OPTERON
index 27fb785..d917c27 100644
--- a/kernel/x86_64/KERNEL.OPTERON
+++ b/kernel/x86_64/KERNEL.OPTERON
@@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_sse2.S
diff --git a/kernel/x86_64/KERNEL.OPTERON_SSE3 b/kernel/x86_64/KERNEL.OPTERON_SSE3
index 565daf3..7218ad6 100644
--- a/kernel/x86_64/KERNEL.OPTERON_SSE3
+++ b/kernel/x86_64/KERNEL.OPTERON_SSE3
@@ -1,13 +1,13 @@
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = gemm_kernel_8x4_sse.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_sse2.S
diff --git a/kernel/x86_64/KERNEL.PENRYN b/kernel/x86_64/KERNEL.PENRYN
index b96daa0..8c2a23c 100644
--- a/kernel/x86_64/KERNEL.PENRYN
+++ b/kernel/x86_64/KERNEL.PENRYN
@@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_penryn.S
diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER
index abed953..92b5dc7 100644
--- a/kernel/x86_64/KERNEL.PILEDRIVER
+++ b/kernel/x86_64/KERNEL.PILEDRIVER
@@ -1,9 +1,9 @@
+
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
-DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S
@@ -12,8 +12,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/x86_64/KERNEL.PRESCOTT b/kernel/x86_64/KERNEL.PRESCOTT
index e155531..0ea43ad 100644
--- a/kernel/x86_64/KERNEL.PRESCOTT
+++ b/kernel/x86_64/KERNEL.PRESCOTT
@@ -1,13 +1,13 @@
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = gemm_kernel_8x4_sse3.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_sse3.S
@@ -19,7 +19,7 @@ DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMKERNEL = zgemm_kernel_4x2_sse3.S
+CGEMMKERNEL = zgemm_kernel_4x2_sse.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE
index 4d095d2..7228357 100644
--- a/kernel/x86_64/KERNEL.SANDYBRIDGE
+++ b/kernel/x86_64/KERNEL.SANDYBRIDGE
@@ -1,14 +1,14 @@
-SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
-SGEMMINCOPY = gemm_ncopy_4.S
-SGEMMITCOPY = gemm_tcopy_4.S
-SGEMMONCOPY = ../generic/gemm_ncopy_8.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+
+SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
+SGEMMINCOPY = ../generic/gemm_ncopy_16.c
+SGEMMITCOPY = ../generic/gemm_tcopy_16.c
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
@@ -19,11 +19,11 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
-CGEMMINCOPY = zgemm_ncopy_2.S
-CGEMMITCOPY = zgemm_tcopy_2.S
-CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMKERNEL = cgemm_kernel_8x2_sandy.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
@@ -31,12 +31,12 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
-ZGEMMINCOPY =
-ZGEMMITCOPY =
+ZGEMMINCOPY =
+ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
-ZGEMMINCOPYOBJ =
-ZGEMMITCOPYOBJ =
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/x86_64/amax.S b/kernel/x86_64/amax.S
index d096d88..0e9bf4d 100644
--- a/kernel/x86_64/amax.S
+++ b/kernel/x86_64/amax.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1
#define X ARG2
#define INCX ARG3
@@ -68,7 +68,7 @@
FLD (X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
addq INCX, X
decq M
@@ -81,7 +81,7 @@
sarq $3, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -89,7 +89,7 @@
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -98,7 +98,7 @@
FLD 1 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -107,7 +107,7 @@
FLD 2 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -116,7 +116,7 @@
FLD 3 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -125,7 +125,7 @@
FLD 4 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -134,7 +134,7 @@
FLD 5 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -143,7 +143,7 @@
FLD 6 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -152,7 +152,7 @@
FLD 7 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -175,7 +175,7 @@
.L21:
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -193,12 +193,12 @@
sarq $3, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -208,7 +208,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -218,7 +218,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -228,7 +228,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -238,7 +238,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -248,7 +248,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -258,7 +258,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -268,7 +268,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -289,7 +289,7 @@
.L61:
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
diff --git a/kernel/x86_64/amax_atom.S b/kernel/x86_64/amax_atom.S
index fa7b9a3..6164cb3 100644
--- a/kernel/x86_64/amax_atom.S
+++ b/kernel/x86_64/amax_atom.S
@@ -38,13 +38,13 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#ifdef USE_MIN
#define maxsd minsd
#endif
@@ -103,7 +103,7 @@
decq I
jle .L13
ALIGN_4
-
+
.L12:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -227,7 +227,7 @@
maxsd %xmm7, %xmm3
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L17:
testq $2, M
@@ -246,7 +246,7 @@
maxsd %xmm5, %xmm2
addq $2 * SIZE, X
ALIGN_3
-
+
.L18:
testq $1, M
jle .L998
@@ -284,7 +284,7 @@
decq I
jle .L23
ALIGN_4
-
+
.L22:
#ifdef USE_ABS
andps %xmm15, %xmm4
@@ -412,7 +412,7 @@
andps %xmm15, %xmm7
#endif
maxsd %xmm7, %xmm3
- ALIGN_3
+ ALIGN_3
.L27:
testq $2, M
@@ -432,7 +432,7 @@
#endif
maxsd %xmm5, %xmm2
ALIGN_3
-
+
.L28:
testq $1, M
jle .L998
diff --git a/kernel/x86_64/amax_sse.S b/kernel/x86_64/amax_sse.S
index 22b8b16..2349905 100644
--- a/kernel/x86_64/amax_sse.S
+++ b/kernel/x86_64/amax_sse.S
@@ -38,18 +38,18 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#ifdef USE_MIN
#define maxps minps
#define maxss minss
#endif
-
+
#include "l1param.h"
PROLOGUE
@@ -126,7 +126,7 @@
decq I
jle .L12
ALIGN_4
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -297,7 +297,7 @@
#endif
maxps %xmm4, %xmm2
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L18:
testq $2, M
@@ -311,7 +311,7 @@
maxps %xmm4, %xmm3
addq $2 * SIZE, X
ALIGN_3
-
+
.L19:
testq $1, M
je .L998
@@ -329,7 +329,7 @@
sarq $3, I
jle .L45
ALIGN_4
-
+
.L41:
movss (X), %xmm4
addq INCX, X
@@ -422,7 +422,7 @@
andps %xmm15, %xmm7
#endif
maxss %xmm7, %xmm3
- ALIGN_3
+ ALIGN_3
.L46:
testq $2, M
@@ -442,7 +442,7 @@
#endif
maxss %xmm5, %xmm1
ALIGN_3
-
+
.L47:
testq $1, M
je .L998
diff --git a/kernel/x86_64/amax_sse2.S b/kernel/x86_64/amax_sse2.S
index 033e8e1..44ddaba 100644
--- a/kernel/x86_64/amax_sse2.S
+++ b/kernel/x86_64/amax_sse2.S
@@ -38,13 +38,13 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#ifdef USE_MIN
#define maxpd minpd
#define maxsd minsd
@@ -112,7 +112,7 @@
decq I
jle .L12
ALIGN_4
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -270,7 +270,7 @@
maxpd %xmm5, %xmm1
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L17:
testq $2, M
@@ -282,8 +282,8 @@
#endif
maxpd %xmm4, %xmm2
addq $2 * SIZE, X
- ALIGN_3
-
+ ALIGN_3
+
.L18:
testq $1, M
jle .L998
@@ -302,7 +302,7 @@
sarq $4, I
jle .L45
ALIGN_4
-
+
.L41:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -453,7 +453,7 @@
andps %xmm15, %xmm5
#endif
maxpd %xmm5, %xmm1
- ALIGN_3
+ ALIGN_3
.L47:
testq $2, M
@@ -468,7 +468,7 @@
#endif
maxpd %xmm6, %xmm2
ALIGN_3
-
+
.L48:
testq $1, M
je .L998
diff --git a/kernel/x86_64/asum.S b/kernel/x86_64/asum.S
index 13c6f4f..31f9738 100644
--- a/kernel/x86_64/asum.S
+++ b/kernel/x86_64/asum.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1
#define X ARG2
#define INCX ARG3
@@ -68,7 +68,7 @@
sarq $3, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -128,7 +128,7 @@
sarq $3, I
jle .L60
ALIGN_4
-
+
.L50:
FLD (X)
addq INCX, X
diff --git a/kernel/x86_64/asum_atom.S b/kernel/x86_64/asum_atom.S
index b6ea65f..910a48f 100644
--- a/kernel/x86_64/asum_atom.S
+++ b/kernel/x86_64/asum_atom.S
@@ -38,20 +38,20 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#include "l1param.h"
PROLOGUE
PROFCODE
SAVEREGISTERS
-
+
xorps %xmm0, %xmm0
testq M, M
jle .L999
@@ -101,7 +101,7 @@
decq I
jle .L11
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -256,7 +256,7 @@
pshufd $0x4e, %xmm5, %xmm13
addsd %xmm5, %xmm2
addsd %xmm13, %xmm3
- ALIGN_3
+ ALIGN_3
.L14:
testq $2, M
@@ -269,8 +269,8 @@
pshufd $0x4e, %xmm4, %xmm5
addsd %xmm4, %xmm2
addsd %xmm5, %xmm3
- ALIGN_3
-
+ ALIGN_3
+
.L15:
testq $1, M
je .L998
@@ -306,7 +306,7 @@
decq I
jle .L23
ALIGN_4
-
+
.L22:
andps %xmm15, %xmm4
addq INCX, X
@@ -391,7 +391,7 @@
addsd %xmm6, %xmm2
andps %xmm15, %xmm7
addsd %xmm7, %xmm3
- ALIGN_3
+ ALIGN_3
.L26:
testq $2, M
@@ -408,7 +408,7 @@
addsd %xmm4, %xmm0
addsd %xmm5, %xmm1
ALIGN_3
-
+
.L27:
testq $1, M
je .L998
@@ -426,7 +426,7 @@
.L999:
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/asum_sse.S b/kernel/x86_64/asum_sse.S
index 840e193..7d7004d 100644
--- a/kernel/x86_64/asum_sse.S
+++ b/kernel/x86_64/asum_sse.S
@@ -38,20 +38,20 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#include "l1param.h"
PROLOGUE
PROFCODE
SAVEREGISTERS
-
+
xorps %xmm0, %xmm0
testq M, M
jle .L999
@@ -112,7 +112,7 @@
decq I
jle .L12
ALIGN_3
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -259,7 +259,7 @@
sarq $3, I
jle .L105
ALIGN_4
-
+
.L101:
movss 0 * SIZE(X), %xmm4
addq INCX, X
@@ -327,7 +327,7 @@
#ifndef HAVE_SSE3
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
-
+
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
addss %xmm1, %xmm0
@@ -339,7 +339,7 @@
.L999:
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/asum_sse2.S b/kernel/x86_64/asum_sse2.S
index 7286fc0..e75ebde 100644
--- a/kernel/x86_64/asum_sse2.S
+++ b/kernel/x86_64/asum_sse2.S
@@ -38,20 +38,20 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#include "l1param.h"
PROLOGUE
PROFCODE
SAVEREGISTERS
-
+
xorps %xmm0, %xmm0
testq M, M
jle .L999
@@ -101,7 +101,7 @@
decq I
jle .L11
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -209,7 +209,7 @@
addpd %xmm5, %xmm1
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L22:
testq $2, M
@@ -219,7 +219,7 @@
andps %xmm15, %xmm6
addpd %xmm6, %xmm3
addq $2 * SIZE, X
-
+
.L23:
testq $1, M
je .L998
@@ -238,7 +238,7 @@
sarq $3, I
jle .L60
ALIGN_4
-
+
.L50:
movsd -16 * SIZE(X), %xmm4
addq INCX, X
@@ -304,7 +304,7 @@
#endif
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/axpy.S b/kernel/x86_64/axpy.S
index 478cc88..0ad6591 100644
--- a/kernel/x86_64/axpy.S
+++ b/kernel/x86_64/axpy.S
@@ -44,14 +44,14 @@
#define INCX ARG5 /* rdx */
#define Y ARG6 /* rcx */
#define INCY ARG2 /* r8 */
-
+
#define ALPHA 8(%rsp)
#include "l1param.h"
PROLOGUE
PROFCODE
-
+
movq 24(%rsp), INCY
FLD ALPHA
@@ -61,7 +61,7 @@
testq M, M
jle .L40
-
+
cmpq $SIZE, INCX
jne .L14
cmpq $SIZE, INCY
diff --git a/kernel/x86_64/axpy_atom.S b/kernel/x86_64/axpy_atom.S
index a786329..adfd691 100644
--- a/kernel/x86_64/axpy_atom.S
+++ b/kernel/x86_64/axpy_atom.S
@@ -84,7 +84,7 @@
testq M, M
jle .L29
-
+
cmpq $SIZE, INCX
jne .L20
cmpq $SIZE, INCY
diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S
index 2a9e928..dd52a7c 100644
--- a/kernel/x86_64/axpy_sse.S
+++ b/kernel/x86_64/axpy_sse.S
@@ -69,7 +69,7 @@
#endif
movaps %xmm0, ALPHA
#else
-
+
movq 40(%rsp), X
movq 48(%rsp), INCX
@@ -82,7 +82,7 @@
#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
-
+
shufps $0, ALPHA, ALPHA
leaq (, INCX, SIZE), INCX
@@ -90,7 +90,7 @@
testq M, M
jle .L19
-
+
cmpq $SIZE, INCX
jne .L50
cmpq $SIZE, INCY
@@ -368,7 +368,7 @@
.L20:
#ifdef ALIGNED_ACCESS
-
+
testq $SIZE, X
jne .L30
diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S
index 45c7b03..9b07b90 100644
--- a/kernel/x86_64/axpy_sse2.S
+++ b/kernel/x86_64/axpy_sse2.S
@@ -57,7 +57,7 @@
#define ALPHA %xmm15
#include "l1param.h"
-
+
PROLOGUE
PROFCODE
@@ -89,7 +89,7 @@
testq M, M
jle .L47
-
+
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
@@ -813,7 +813,7 @@
je .L46
cmpq $0, INCY
je .L46
-
+
sarq $3, %rax
jle .L45
ALIGN_3
diff --git a/kernel/x86_64/builtin_stinit.S b/kernel/x86_64/builtin_stinit.S
index c05a1c5..cb3a288 100644
--- a/kernel/x86_64/builtin_stinit.S
+++ b/kernel/x86_64/builtin_stinit.S
@@ -53,7 +53,7 @@
cmpq $4096, %rax
jg .L01
ALIGN_3
-
+
.L999:
subq %rax, %rsp
ret
diff --git a/kernel/x86_64/cabs.S b/kernel/x86_64/cabs.S
index 0b1a911..7de9ca4 100644
--- a/kernel/x86_64/cabs.S
+++ b/kernel/x86_64/cabs.S
@@ -46,7 +46,7 @@
movsd 0 * SIZE(ARG1), %xmm0
movsd 1 * SIZE(ARG1), %xmm1
pcmpeqb %xmm4, %xmm4
-
+
psrlq $1, %xmm4
andpd %xmm4, %xmm0
andpd %xmm4, %xmm1
@@ -55,13 +55,13 @@
movss 0 * SIZE(ARG1), %xmm0
movss 1 * SIZE(ARG1), %xmm1
pcmpeqb %xmm4, %xmm4
-
+
psrld $1, %xmm4
andps %xmm4, %xmm0
andps %xmm4, %xmm1
addps %xmm1, %xmm0
#endif
-
+
#if !defined(DOUBLE) && defined(NEED_F2CCONV)
cvtss2sd %xmm0, %xmm0
#endif
diff --git a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
index 431f25a..97958a8 100644
--- a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
+++ b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
@@ -79,8 +79,7 @@
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -523,16 +522,16 @@
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -542,14 +541,15 @@
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
- movsd OLD_OFFSET, %xmm12
+ vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
- movsd STACKSIZE + 16(%rsp), %xmm12
+ vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
@@ -1866,6 +1866,8 @@
.L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -1877,16 +1879,16 @@
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
diff --git a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S
index 9313162..72deee1 100644
--- a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S
+++ b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*********************************************************************
*
-* 2013/10/31 Saar
+* 2014/06/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
@@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
-#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
@@ -548,16 +546,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -570,6 +568,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
@@ -1891,6 +1890,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -1902,16 +1903,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S
index 5a55880..487f959 100644
--- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S
+++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S
@@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
-3. Neither the name of the ISCAS nor the names of its contributors may
-be used to endorse or promote products derived from this software
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -59,7 +59,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef WINDOWS_ABI
-#define STACKSIZE 128
+#define STACKSIZE 128
#define old_ldc 8+STACKSIZE(%rsp)
#define old_offset 16+STACKSIZE(%rsp)
@@ -144,10 +144,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OR orq
#define JNE jne
#define JMP jmp
-#define NOP
+#define NOP
#define XOR xorpd
#undef MOVQ
-#define MOVQ movq
+#define MOVQ movq
#define XOR_SY vxorps
#define XOR_SX vxorps
@@ -171,7 +171,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define EDUP_SX vmovsldup
#define ODUP_SX vmovshdup
-#define ADD_SY vaddps
+#define ADD_SY vaddps
#define ADD_SX vaddps
#define SUB_SY vsubps
#define SUB_SX vsubps
@@ -189,7 +189,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VPERMILP_SX vpermilps
#define BROAD_SY vbroadcastss
-#define BROAD_SX vbroadcastss
+#define BROAD_SX vbroadcastss
#define MOV_SY vmovaps
#define MOV_SX vmovaps
@@ -214,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ADD2_SY ADDSUB_SY
#define ADD1_SX SUB_SX
#define ADD2_SX ADDSUB_SX
-#else
+#else
#define ADD1_SY ADD_SY
#define ADD2_SY ADDSUB_SY
#define ADD1_SX ADD_SX
@@ -309,7 +309,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 8), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-# Initial results register
+# Initial results register
PREFETCH0 0*SIZE(prebb);
XOR_SY yvec15, yvec15, yvec15;
PREFETCH0 16*SIZE(prebb);
@@ -338,7 +338,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -366,7 +366,7 @@ VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
ADD1_SY yvec6, yvec14, yvec14;
ADD1_SY yvec7, yvec12, yvec12;
-MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec4, yvec6;
MUL_SY yvec0, yvec5, yvec7;
VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
ADD1_SY yvec6, yvec11, yvec11;
@@ -420,7 +420,7 @@ VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
ADD1_SY yvec6, yvec14, yvec14;
ADD1_SY yvec7, yvec12, yvec12;
-MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec4, yvec6;
MUL_SY yvec0, yvec5, yvec7;
VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
ADD1_SY yvec6, yvec11, yvec11;
@@ -474,7 +474,7 @@ VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
ADD1_SY yvec6, yvec14, yvec14;
ADD1_SY yvec7, yvec12, yvec12;
-MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec4, yvec6;
MUL_SY yvec0, yvec5, yvec7;
VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
ADD1_SY yvec6, yvec11, yvec11;
@@ -530,7 +530,7 @@ ADDQ $32*SIZE, ptrbb;
ADD1_SY yvec6, yvec14, yvec14;
ADD1_SY yvec7, yvec12, yvec12;
-MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec4, yvec6;
MUL_SY yvec0, yvec5, yvec7;
VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
ADD1_SY yvec6, yvec11, yvec11;
@@ -573,7 +573,7 @@ ALIGN_5
.L2_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
TEST $2, kkk;
#endif
JLE .L3_loopE;
@@ -595,7 +595,7 @@ ADD1_SY yvec6, yvec14, yvec14;
ADD1_SY yvec7, yvec12, yvec12;
ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
-MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec4, yvec6;
MUL_SY yvec0, yvec5, yvec7;
VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
ADD1_SY yvec6, yvec11, yvec11;
@@ -650,7 +650,7 @@ ADD1_SY yvec6, yvec14, yvec14;
ADD1_SY yvec7, yvec12, yvec12;
ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
-MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec4, yvec6;
MUL_SY yvec0, yvec5, yvec7;
ADDQ $16*SIZE, ptrbb;
VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
@@ -714,7 +714,7 @@ ADD1_SY yvec6, yvec14, yvec14;
ADD1_SY yvec7, yvec12, yvec12;
ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
-MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec4, yvec6;
MUL_SY yvec0, yvec5, yvec7;
ADDQ $8*SIZE, ptrbb;
VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
@@ -903,7 +903,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 8), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $8, kk;
#endif
ADDQ $16*SIZE,C0;
@@ -1048,7 +1048,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 8), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $8, kk;
#endif
ADDQ $16*SIZE, C0;
@@ -1084,7 +1084,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -1224,7 +1224,7 @@ ALIGN_5
.L8_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
TEST $2, kkk;
#endif
JLE .L9_loopE;
@@ -1462,7 +1462,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
@@ -1498,7 +1498,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -1843,7 +1843,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 2), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $2, kk;
#endif
@@ -1876,7 +1876,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -2090,7 +2090,7 @@ SALQ $ZBASE_SHIFT, %rax;
ADDQ %rax, ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $1, kk;
#endif
@@ -2152,7 +2152,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -2795,7 +2795,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 8), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $8, kk;
#endif
@@ -2832,7 +2832,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -2984,7 +2984,7 @@ ALIGN_5
.L221_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
TEST $2, kkk;
#endif
JLE .L222_loopE;
@@ -3205,7 +3205,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
@@ -3238,7 +3238,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -3337,7 +3337,7 @@ TEST $2, bk;
TEST $2, kkk;
#endif
JLE .L232_loopE;
-ALIGN_5
+ALIGN_5
.L232_bodyB:
EDUP_SX 0*SIZE(ptrbb), xvec4;
ODUP_SX 0*SIZE(ptrbb), xvec5;
@@ -3471,7 +3471,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 2), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $2, kk;
#endif
@@ -3503,7 +3503,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -3646,7 +3646,7 @@ SALQ $ZBASE_SHIFT, %rax;
ADDQ %rax, ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $1, kk;
#endif
ADDQ $2*SIZE, C0;
@@ -3698,7 +3698,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -3913,7 +3913,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $8, kk;
#endif
ADDQ $16*SIZE, C0;
@@ -3945,7 +3945,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -4098,7 +4098,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
ADDQ $8*SIZE, C0;
@@ -4128,7 +4128,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -4270,7 +4270,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 2), ptrba;
ADDQ %rax, ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $2, kk;
#endif
ADDQ $4*SIZE, C0;
@@ -4300,7 +4300,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -4413,7 +4413,7 @@ SALQ $ZBASE_SHIFT, %rax;
ADDQ %rax, ptrba;
ADDQ %rax, ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $1, kk;
#endif
ADDQ $2*SIZE, C0;
diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S
index 38c864c..baee3cd 100644
--- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S
+++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************
-* 2013/11/13 Saar
+* 2014/06/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
@@ -93,8 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -105,7 +104,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
-#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
@@ -818,16 +816,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -840,6 +838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
@@ -2255,6 +2254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -2266,16 +2267,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_sandy.S
similarity index 89%
copy from kernel/x86_64/cgemm_kernel_8x2_haswell.S
copy to kernel/x86_64/cgemm_kernel_8x2_sandy.S
index 38c864c..564b733 100644
--- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S
+++ b/kernel/x86_64/cgemm_kernel_8x2_sandy.S
@@ -25,31 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
-/*********************************************************************
-* 2013/11/13 Saar
-* BLASTEST : OK
-* CTEST : OK
-* TEST : OK
-*
-* 2013/10/28 Saar
-* Parameter:
-* CGEMM_DEFAULT_UNROLL_N 2
-* CGEMM_DEFAULT_UNROLL_M 8
-* CGEMM_DEFAULT_P 384
-* CGEMM_DEFAULT_Q 192
-* A_PR1 512
-* B_PR1 512
-*
-* Performance at 6912x6912x6912:
-* 1 thread: 84 GFLOPS (SANDYBRIDGE: 60) (MKL: 86)
-* 2 threads: 153 GFLOPS (SANDYBRIDGE: 114) (MKL: 155)
-* 3 threads: 224 GFLOPS (SANDYBRIDGE: 162) (MKL: 222)
-* 4 threads: 278 GFLOPS (SANDYBRIDGE: 223) (MKL: 279)
-*
-*
-*********************************************************************/
-
-
#define ASSEMBLER
#include "common.h"
@@ -93,8 +68,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -105,7 +79,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
-#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
@@ -134,61 +107,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#if defined(BULLDOZER)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
-#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
-
-#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
-
-#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
-
-#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
+#define VFMADDPS_YR( y0,y1,y2 ) \
+ vmulps y1,y2,%ymm2;\
+ vaddps y0,%ymm2,y0
-#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
+#define VFMADDPS_YI( y0,y1,y2 ) \
+ vmulps y1,y2,%ymm3;\
+ vaddps y0,%ymm3,y0
-#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define VFMADDPS_R( y0,y1,y2 ) \
+ vmulps y1,y2,%xmm2;\
+ vaddps y0,%xmm2,y0
-#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
+#define VFMADDPS_I( y0,y1,y2 ) \
+ vmulps y1,y2,%xmm3;\
+ vaddps y0,%xmm3,y0
-#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
-#else
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
-#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
+#define VFMADDPS_YR( y0,y1,y2 ) \
+ vmulps y1,y2,%ymm2;\
+ vsubps %ymm2,y0,y0
-#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
+#define VFMADDPS_YI( y0,y1,y2 ) \
+ vmulps y1,y2,%ymm3;\
+ vaddps y0,%ymm3,y0
-#endif
+#define VFMADDPS_R( y0,y1,y2 ) \
+ vmulps y1,y2,%xmm2;\
+ vsubps %xmm2,y0,y0
-#else
+#define VFMADDPS_I( y0,y1,y2 ) \
+ vmulps y1,y2,%xmm3;\
+ vaddps y0,%xmm3,y0
-#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
-#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
-#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+#define VFMADDPS_YR( y0,y1,y2 ) \
+ vmulps y1,y2,%ymm2;\
+ vaddps y0,%ymm2,y0
-#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define VFMADDPS_YI( y0,y1,y2 ) \
+ vmulps y1,y2,%ymm3;\
+ vsubps %ymm3,y0,y0
-#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
+#define VFMADDPS_R( y0,y1,y2 ) \
+ vmulps y1,y2,%xmm2;\
+ vaddps y0,%xmm2,y0
-#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+#define VFMADDPS_I( y0,y1,y2 ) \
+ vmulps y1,y2,%xmm3;\
+ vsubps %xmm3,y0,y0
-#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
-#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+#else
-#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
+#define VFMADDPS_YR( y0,y1,y2 ) \
+ vmulps y1,y2,%ymm2;\
+ vsubps %ymm2,y0,y0
-#else
+#define VFMADDPS_YI( y0,y1,y2 ) \
+ vmulps y1,y2,%ymm3;\
+ vsubps %ymm3,y0,y0
-#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
+#define VFMADDPS_R( y0,y1,y2 ) \
+ vmulps y1,y2,%xmm2;\
+ vsubps %xmm2,y0,y0
-#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
+#define VFMADDPS_I( y0,y1,y2 ) \
+ vmulps y1,y2,%xmm3;\
+ vsubps %xmm3,y0,y0
-#endif
#endif
@@ -202,18 +196,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
- VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
+ VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
- VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
+ VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
- VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
- VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
+ VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
+ VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
- VFMADDPS_R( %ymm10,%ymm6,%ymm0 )
- VFMADDPS_R( %ymm14,%ymm6,%ymm1 )
+ VFMADDPS_YR( %ymm10,%ymm6,%ymm0 )
+ VFMADDPS_YR( %ymm14,%ymm6,%ymm1 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
- VFMADDPS_I( %ymm11,%ymm7,%ymm0 )
- VFMADDPS_I( %ymm15,%ymm7,%ymm1 )
+ VFMADDPS_YI( %ymm11,%ymm7,%ymm0 )
+ VFMADDPS_YI( %ymm15,%ymm7,%ymm1 )
addq $ 4 , BI
addq $ 16, %rax
.endm
@@ -551,11 +545,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
- VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
- VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
+ VFMADDPS_YR( %ymm8,%ymm4,%ymm0 )
+ VFMADDPS_YR( %ymm12,%ymm4,%ymm1 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
- VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
- VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
+ VFMADDPS_YI( %ymm9,%ymm5,%ymm0 )
+ VFMADDPS_YI( %ymm13,%ymm5,%ymm1 )
addq $ 2 , BI
addq $ 16, %rax
.endm
@@ -818,16 +812,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -840,6 +834,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
@@ -2255,6 +2250,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -2266,16 +2263,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
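
[Illustrative aside, not part of the patch] The new cgemm_kernel_8x2_sandy.S above is a copy of the Haswell kernel with the FMA macros replaced: Sandy Bridge has AVX but no FMA, so VFMADDPS_YR/VFMADDPS_YI (and their xmm counterparts) are rewritten as a separate vmulps into a scratch register followed by vaddps or vsubps, with the sign picked by the conjugation case. A rough C-intrinsics sketch of that fallback, assuming the standard __FMA__ predefine and AVX/FMA intrinsics:

#include <immintrin.h>

/* acc + a*b: fused on FMA-capable parts, mul-then-add elsewhere,
 * mirroring the VFMADDPS_YR rewrite above. */
static inline __m256 fmadd_ps8(__m256 acc, __m256 a, __m256 b)
{
#ifdef __FMA__
    return _mm256_fmadd_ps(a, b, acc);               /* Haswell: one instruction    */
#else
    return _mm256_add_ps(acc, _mm256_mul_ps(a, b));  /* Sandy Bridge: mul, then add */
#endif
}

/* acc - a*b: the subtracting variants used for the conjugated cases. */
static inline __m256 fnmadd_ps8(__m256 acc, __m256 a, __m256 b)
{
#ifdef __FMA__
    return _mm256_fnmadd_ps(a, b, acc);
#else
    return _mm256_sub_ps(acc, _mm256_mul_ps(a, b));  /* mul, then subtract          */
#endif
}
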
diff --git a/kernel/x86_64/cgemv_n.S b/kernel/x86_64/cgemv_n.S
index 64967d4..206beb6 100644
--- a/kernel/x86_64/cgemv_n.S
+++ b/kernel/x86_64/cgemv_n.S
@@ -48,7 +48,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
@@ -75,7 +75,7 @@
#else
#define STACKSIZE 288
-
+
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
@@ -207,7 +207,7 @@
ALIGN_3
subq $-32 * SIZE, A
-
+
movq BUFFER, Y1
pxor %xmm4, %xmm4
@@ -281,7 +281,7 @@
pcmpeqb %xmm7, %xmm7
psllq $63, %xmm7
-
+
pshufd $0x00, %xmm9, %xmm8
pshufd $0x55, %xmm9, %xmm9
pshufd $0x00, %xmm11, %xmm10
@@ -875,7 +875,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
-
+
pshufd $0x00, %xmm13, %xmm12
pshufd $0x55, %xmm13, %xmm13
pshufd $0x00, %xmm15, %xmm14
@@ -926,7 +926,7 @@
pshufd $0xb1, %xmm4, %xmm5
pshufd $0xb1, %xmm6, %xmm7
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -1134,7 +1134,7 @@
pshufd $0xb1, %xmm6, %xmm7
pshufd $0xb1, %xmm8, %xmm9
pshufd $0xb1, %xmm10, %xmm11
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm12, %xmm6
@@ -1206,7 +1206,7 @@
pshufd $0xb1, %xmm4, %xmm5
pshufd $0xb1, %xmm6, %xmm7
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -1244,12 +1244,12 @@
movsd ALPHA, %xmm8
unpcklpd %xmm8, %xmm8
#endif
-
+
pshufd $0xb1, %xmm8, %xmm9
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
-
+
pshufd $0x00, %xmm13, %xmm12
pshufd $0x55, %xmm13, %xmm13
@@ -1285,7 +1285,7 @@
movsd -32 * SIZE(Y1), %xmm0
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -1449,7 +1449,7 @@
MOVUPS_A1(-32 * SIZE, A1, %xmm4)
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -1469,7 +1469,7 @@
movsd -32 * SIZE(A1), %xmm4
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -1515,7 +1515,7 @@
pcmpeqb %xmm7, %xmm7
psllq $63, %xmm7
-
+
pshufd $0x00, %xmm9, %xmm8
pshufd $0x55, %xmm9, %xmm9
pshufd $0x00, %xmm11, %xmm10
@@ -2130,7 +2130,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
-
+
pshufd $0x00, %xmm13, %xmm12
pshufd $0x55, %xmm13, %xmm13
pshufd $0x00, %xmm15, %xmm14
@@ -2181,7 +2181,7 @@
pshufd $0xb1, %xmm4, %xmm5
pshufd $0xb1, %xmm6, %xmm7
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -2399,7 +2399,7 @@
pshufd $0xb1, %xmm6, %xmm7
pshufd $0xb1, %xmm8, %xmm9
pshufd $0xb1, %xmm10, %xmm11
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm12, %xmm6
@@ -2472,7 +2472,7 @@
pshufd $0xb1, %xmm4, %xmm5
pshufd $0xb1, %xmm6, %xmm7
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -2512,12 +2512,12 @@
movsd ALPHA, %xmm8
unpcklpd %xmm8, %xmm8
#endif
-
+
pshufd $0xb1, %xmm8, %xmm9
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
-
+
pshufd $0x00, %xmm13, %xmm12
pshufd $0x55, %xmm13, %xmm13
@@ -2553,7 +2553,7 @@
movsd -32 * SIZE(Y1), %xmm0
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -2717,7 +2717,7 @@
MOVUPS_A1(-32 * SIZE, A1, %xmm4)
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -2737,7 +2737,7 @@
movsd -32 * SIZE(A1), %xmm4
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -2780,7 +2780,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
-
+
pshufd $0x00, %xmm13, %xmm12
pshufd $0x55, %xmm13, %xmm13
pshufd $0x00, %xmm15, %xmm14
@@ -2831,7 +2831,7 @@
pshufd $0xb1, %xmm4, %xmm5
pshufd $0xb1, %xmm6, %xmm7
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -3165,7 +3165,7 @@
pshufd $0xb1, %xmm4, %xmm5
pshufd $0xb1, %xmm6, %xmm7
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -3200,12 +3200,12 @@
movsd ALPHA, %xmm8
unpcklpd %xmm8, %xmm8
#endif
-
+
pshufd $0xb1, %xmm8, %xmm9
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
-
+
pshufd $0x00, %xmm13, %xmm12
pshufd $0x55, %xmm13, %xmm13
@@ -3241,7 +3241,7 @@
movsd -32 * SIZE(Y1), %xmm0
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -3454,7 +3454,7 @@
movsd -32 * SIZE(A1), %xmm4
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -3493,7 +3493,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
-
+
pshufd $0x00, %xmm13, %xmm12
pshufd $0x55, %xmm13, %xmm13
pshufd $0x00, %xmm15, %xmm14
@@ -3544,7 +3544,7 @@
pshufd $0xb1, %xmm4, %xmm5
pshufd $0xb1, %xmm6, %xmm7
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -3878,7 +3878,7 @@
pshufd $0xb1, %xmm4, %xmm5
pshufd $0xb1, %xmm6, %xmm7
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -3913,12 +3913,12 @@
movsd ALPHA, %xmm8
unpcklpd %xmm8, %xmm8
#endif
-
+
pshufd $0xb1, %xmm8, %xmm9
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
-
+
pshufd $0x00, %xmm13, %xmm12
pshufd $0x55, %xmm13, %xmm13
@@ -3954,7 +3954,7 @@
movsd -32 * SIZE(Y1), %xmm0
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
@@ -4167,7 +4167,7 @@
movsd -32 * SIZE(A1), %xmm4
pshufd $0xb1, %xmm4, %xmm5
-
+
mulps %xmm12, %xmm4
addps %xmm4, %xmm0
mulps %xmm13, %xmm5
diff --git a/kernel/x86_64/cgemv_t.S b/kernel/x86_64/cgemv_t.S
index 49fc0eb..430586b 100644
--- a/kernel/x86_64/cgemv_t.S
+++ b/kernel/x86_64/cgemv_t.S
@@ -48,7 +48,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
@@ -60,7 +60,7 @@
#define LDAX 88(%rsp)
#define ALPHAR 96(%rsp)
#define ALPHAI 104(%rsp)
-
+
#define M %rdi
#define N %rsi
#define A %rcx
@@ -73,7 +73,7 @@
#else
#define STACKSIZE 288
-
+
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
@@ -202,7 +202,7 @@
jle .L999
subq $-32 * SIZE, A
-
+
movq BUFFER, X1
#ifdef ALIGNED_ACCESS
@@ -893,7 +893,7 @@
movaps %xmm4, %xmm6
shufps $0x88, %xmm5, %xmm4
shufps $0xdd, %xmm5, %xmm6
-
+
addps %xmm2, %xmm0
addps %xmm6, %xmm4
#endif
@@ -1306,7 +1306,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
@@ -1576,7 +1576,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
@@ -2227,7 +2227,7 @@
movaps %xmm4, %xmm6
shufps $0x88, %xmm5, %xmm4
shufps $0xdd, %xmm5, %xmm6
-
+
addps %xmm2, %xmm0
addps %xmm6, %xmm4
#endif
@@ -2653,7 +2653,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
@@ -2923,7 +2923,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
@@ -3365,7 +3365,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
@@ -3650,7 +3650,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
@@ -4087,7 +4087,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
@@ -4372,7 +4372,7 @@
movaps %xmm0, %xmm2
shufps $0x88, %xmm1, %xmm0
shufps $0xdd, %xmm1, %xmm2
-
+
addps %xmm2, %xmm0
#endif
diff --git a/kernel/x86_64/copy.S b/kernel/x86_64/copy.S
index bb66d10..5729b29 100644
--- a/kernel/x86_64/copy.S
+++ b/kernel/x86_64/copy.S
@@ -50,18 +50,18 @@
#define INCY %r10
#define FLAG %r11
#endif
-
+
#include "l1param.h"
-
+
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
movq 40(%rsp), INCY
#endif
EMMS
-
+
testq N, N # if m == 0 goto End
jle .L999
@@ -363,4 +363,4 @@
ret
EPILOGUE
-
+
diff --git a/kernel/x86_64/daxpy_bulldozer.S b/kernel/x86_64/daxpy_bulldozer.S
index dfc10e8..799dad0 100644
--- a/kernel/x86_64/daxpy_bulldozer.S
+++ b/kernel/x86_64/daxpy_bulldozer.S
@@ -59,7 +59,7 @@
#define A_PRE 640
#include "l1param.h"
-
+
PROLOGUE
PROFCODE
@@ -88,7 +88,7 @@
testq M, M
jle .L47
-
+
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
@@ -290,7 +290,7 @@
je .L46
cmpq $0, INCY
je .L46
-
+
sarq $3, %rax
jle .L45
diff --git a/kernel/x86_64/ddot_bulldozer.S b/kernel/x86_64/ddot_bulldozer.S
index 503ec60..61c7571 100644
--- a/kernel/x86_64/ddot_bulldozer.S
+++ b/kernel/x86_64/ddot_bulldozer.S
@@ -62,8 +62,8 @@
SAVEREGISTERS
- leaq (, INCX, SIZE), INCX
- leaq (, INCY, SIZE), INCY
+ leaq (, INCX, SIZE), INCX
+ leaq (, INCY, SIZE), INCY
vxorps %xmm0, %xmm0 , %xmm0
vxorps %xmm1, %xmm1 , %xmm1
diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S
index 4964d17..a49a51e 100644
--- a/kernel/x86_64/dgemm_kernel_4x4_haswell.S
+++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S
@@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#define STACKSIZE 256
-#define L_BUFFER_SIZE 128*8*12+4096
+#define L_BUFFER_SIZE 128*8*12+512
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S
index 3b1b256..e86d306 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S
+++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S
@@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
-3. Neither the name of the ISCAS nor the names of its contributors may
-be used to endorse or promote products derived from this software
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -138,10 +138,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define TEST testq
#define OR orq
#define JNE jne
-#define NOP
+#define NOP
#define XOR xorpd
#undef MOVQ
-#define MOVQ movq
+#define MOVQ movq
#define XOR_DY vxorpd
#define XOR_DX vxorpd
@@ -215,7 +215,7 @@ movq %r15, 40(%rsp);
movq ARG1, old_bm
movq ARG2, old_bn
movq ARG3, old_bk
- movq OLD_A, ba
+ movq OLD_A, ba
movq OLD_B, bb
movq OLD_C, C
movq old_ldc, ldc
@@ -269,7 +269,7 @@ ALIGN_5;
.L1_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -305,7 +305,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -316,7 +316,7 @@ ALIGN_5;
.L2_bodyB:;
# Computing kernel
-#### Unroll times 1 ####
+#### Unroll times 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
@@ -372,7 +372,7 @@ MUL_DY yvec1, yvec5, yvec7;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 3 ####
+#### Unroll times 3 ####
LD_DY 20*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
@@ -438,14 +438,14 @@ PREFETCH2 0*SIZE(prebb);
ADDQ $8*SIZE, prebb;
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L3_loopE;
ALIGN_5
.L3_bodyB:
-#### Unroll times 1 ####
+#### Unroll times 1 ####
PREFETCH0 64*SIZE(ptrba)
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
@@ -508,14 +508,14 @@ PREFETCH2 0*SIZE(prebb);
ADDQ $8*SIZE, prebb
#ifndef TRMMKERNEL
TEST $1, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L4_loopE;
ALIGN_5
.L4_bodyB:;
-#### Unroll times 1 ####
+#### Unroll times 1 ####
PREFETCH0 64*SIZE(ptrba)
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
@@ -767,8 +767,8 @@ JLE .L5_loopE;
ALIGN_5
.L5_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
-MOVQ bb, ptrbb;
-#else
+MOVQ bb, ptrbb;
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -793,7 +793,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -919,7 +919,7 @@ ADD_DY yvec9, yvec7, yvec9;
.L7_loopE:;
#ifndef TRMMKERNEL
TEST $1, bk
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
@@ -1067,8 +1067,8 @@ JLE .L9_loopE;
ALIGN_5
.L9_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
-MOVQ bb, ptrbb;
-#else
+MOVQ bb, ptrbb;
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -1090,7 +1090,7 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
@@ -1103,7 +1103,7 @@ SARQ $2, k;
JLE .L10_loopE;
ALIGN_5;
.L10_bodyB:;
-# Computing kernel
+# Computing kernel
##### Unroll time 1 ####
LD_DX 4*SIZE(ptrbb), xvec6;
@@ -1180,7 +1180,7 @@ ALIGN_5
.L10_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
@@ -1337,7 +1337,7 @@ ALIGN_5
.L13_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
@@ -1356,7 +1356,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -1413,7 +1413,7 @@ ADDQ $8*SIZE, ptrbb;
.L15_loopE:;
#ifndef TRMMKERNEL
TEST $1, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
@@ -1428,7 +1428,7 @@ ADDQ $1*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
.L16_loopE:
-#### Load Alpha ####
+#### Load Alpha ####
BROAD_DY MEMALPHA, yvec7;
#### Multiply Alpha ####
MUL_DY yvec15, yvec7, yvec15;
@@ -1489,7 +1489,7 @@ ALIGN_5;
.L21_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -1511,11 +1511,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -1524,7 +1524,7 @@ SARQ $2, k;
JLE .L211_loopE;
ALIGN_5;
.L211_bodyB:
-# Computing kernel
+# Computing kernel
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
@@ -1680,14 +1680,14 @@ ALIGN_5
.L211_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L212_loopE;
ALIGN_5;
.L212_bodyB:
-# Computing kernel
+# Computing kernel
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
@@ -1767,7 +1767,7 @@ ADD_DX xvec7, xvec8, xvec8;
.L212_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
@@ -1944,7 +1944,7 @@ ALIGN_5;
.L22_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
@@ -1962,11 +1962,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -1975,7 +1975,7 @@ SARQ $2, k;
JLE .L221_loopE;
ALIGN_5
.L221_bodyB:;
-# Computing kernel
+# Computing kernel
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
@@ -2059,7 +2059,7 @@ ALIGN_5
.L221_loopE:;
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
@@ -2108,7 +2108,7 @@ ADD_DX xvec5, xvec10, xvec10;
.L222_loopE:
#ifndef TRMMKERNEL
TEST $1, bk
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
@@ -2225,7 +2225,7 @@ ALIGN_5;
.L23_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
@@ -2240,11 +2240,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -2253,7 +2253,7 @@ SARQ $2, k;
JLE .L231_loopE;
ALIGN_5
.L231_bodyB:
-# Computing kernel
+# Computing kernel
#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
@@ -2297,7 +2297,7 @@ ALIGN_5
.L231_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
@@ -2326,7 +2326,7 @@ ADDQ $4*SIZE, ptrbb;
.L232_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
@@ -2413,7 +2413,7 @@ ALIGN_5;
.L24_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -2427,13 +2427,13 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $2, %rax;
-#endif
+#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;
@@ -2467,7 +2467,7 @@ ALIGN_5
.L241_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
@@ -2488,7 +2488,7 @@ ADDQ $4*SIZE, ptrbb;
.L242_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
@@ -2550,7 +2550,7 @@ ALIGN_5
.L31_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax
LEAQ (, %rax, SIZE), %rax;
@@ -2566,11 +2566,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -2622,7 +2622,7 @@ ALIGN_5
.L311_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
@@ -2769,11 +2769,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk
@@ -2809,7 +2809,7 @@ ALIGN_5
.L321_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
@@ -2831,7 +2831,7 @@ ADDQ $2*SIZE, ptrbb;
.L322_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
@@ -2909,13 +2909,13 @@ ALIGN_5
.L33_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax
LEAQ (, %rax, SIZE), %rax
LEAQ (ptrba, %rax, 2), ptrba
ADDQ %rax, ptrbb;
-#endif
+#endif
#### Initial Result ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
@@ -2924,7 +2924,7 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
@@ -2964,7 +2964,7 @@ ALIGN_5
.L331_loopE:
#ifndef TRMMKERNEL
TEST $2,bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax
#endif
@@ -2985,7 +2985,7 @@ ADDQ $2*SIZE, ptrbb;
.L332_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
@@ -3025,9 +3025,9 @@ TEST $1, bm
JLE .L34_loopE;
ALIGN_5
.L34_bodyB:
-#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -3041,7 +3041,7 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
@@ -3081,7 +3081,7 @@ ALIGN_5
.L341_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
-#else
+#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
diff --git a/kernel/x86_64/dgemm_kernel_6x4_piledriver.S b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S
index 7b5dd15..6677964 100644
--- a/kernel/x86_64/dgemm_kernel_6x4_piledriver.S
+++ b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S
@@ -12,27 +12,27 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
// register blocking= 6x4. unloop k = 4.
-// Use FMA3 on piledriver.
+// Use FMA3 on piledriver.
// Todo: 1) deal with the edge. 2) Add windows abi.
-
+
#define ASSEMBLER
#include "common.h"
@@ -89,7 +89,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SALQ1280(imm,n) salq imm,n
#define JG jg
-#define JLE jle
+#define JLE jle
#define VLD2560(addr,reg) vmovapd addr,reg
#define VST2560(reg,addr) vmovapd reg,addr
@@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define imm1 $0x05
#define imm3 $0x05
#define imm100 $0x05
-#define imm200 $0x0a
+#define imm200 $0x0a
#define XMM0 %xmm0
#define XMM1 %xmm1
diff --git a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
index 45f5c0c..40c5892 100644
--- a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
+++ b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
@@ -148,8 +148,8 @@
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
+#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S
index e09e3b3..adc00cc 100644
--- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S
+++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S
@@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
+#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
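
[Illustrative aside, not part of the patch] The buffer-size edits here and in the cgemm kernels earlier replace the 512*8*4 / 512*8*2 expressions with literal 8192 / 4096 (or 256*8*4 in the first kernel shown above), and the cgemm kernels drop BUFFER2 entirely. Assuming byte units, as in the OS_WINDOWS check "#if L_BUFFER_SIZE > 16384" seen earlier in the patch, this halves the per-call scratch footprint and keeps it at or below that probe threshold. A quick compile-time check of the arithmetic (C11, illustration only):

/* gcc -std=c11 -c sizes.c ; values are taken from the #define lines above. */
_Static_assert(512 * 8 * 4 == 16384, "old L_BUFFER_SIZE in bytes");
_Static_assert(512 * 8 * 2 ==  8192, "old LB2_OFFSET in bytes");
_Static_assert(256 * 8 * 4 ==  8192, "new L_BUFFER_SIZE in the first kernel above");
_Static_assert(8192 + 4096 <= 16384, "new buffers fit inside the old footprint");
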
diff --git a/kernel/x86_64/dgemm_ncopy_2.S b/kernel/x86_64/dgemm_ncopy_2.S
index e4bde49..532fddf 100644
--- a/kernel/x86_64/dgemm_ncopy_2.S
+++ b/kernel/x86_64/dgemm_ncopy_2.S
@@ -87,7 +87,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
diff --git a/kernel/x86_64/dgemm_ncopy_4.S b/kernel/x86_64/dgemm_ncopy_4.S
index 1e44316..41eac95 100644
--- a/kernel/x86_64/dgemm_ncopy_4.S
+++ b/kernel/x86_64/dgemm_ncopy_4.S
@@ -107,7 +107,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
diff --git a/kernel/x86_64/dgemm_ncopy_8.S b/kernel/x86_64/dgemm_ncopy_8.S
index f35c3c5..7600c9a 100644
--- a/kernel/x86_64/dgemm_ncopy_8.S
+++ b/kernel/x86_64/dgemm_ncopy_8.S
@@ -93,7 +93,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
diff --git a/kernel/x86_64/dgemm_ncopy_8_bulldozer.S b/kernel/x86_64/dgemm_ncopy_8_bulldozer.S
index 1b934f6..43f9cd2 100644
--- a/kernel/x86_64/dgemm_ncopy_8_bulldozer.S
+++ b/kernel/x86_64/dgemm_ncopy_8_bulldozer.S
@@ -81,7 +81,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
diff --git a/kernel/x86_64/dgemm_tcopy_2.S b/kernel/x86_64/dgemm_tcopy_2.S
index b0b3590..9881610 100644
--- a/kernel/x86_64/dgemm_tcopy_2.S
+++ b/kernel/x86_64/dgemm_tcopy_2.S
@@ -114,7 +114,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
@@ -326,7 +326,7 @@
movlpd %xmm0, -16 * SIZE(B3)
ALIGN_4
-
+
.L999:
popq %rbp
popq %r13
diff --git a/kernel/x86_64/dgemm_tcopy_4.S b/kernel/x86_64/dgemm_tcopy_4.S
index 85b0253..98ba647 100644
--- a/kernel/x86_64/dgemm_tcopy_4.S
+++ b/kernel/x86_64/dgemm_tcopy_4.S
@@ -116,7 +116,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
@@ -505,7 +505,7 @@
movlpd %xmm0, -16 * SIZE(B3)
jmp .L999
ALIGN_4
-
+
.L999:
popq %rbp
popq %r12
diff --git a/kernel/x86_64/dgemm_tcopy_8.S b/kernel/x86_64/dgemm_tcopy_8.S
index 3d411cd..db97db7 100644
--- a/kernel/x86_64/dgemm_tcopy_8.S
+++ b/kernel/x86_64/dgemm_tcopy_8.S
@@ -99,7 +99,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
@@ -770,7 +770,7 @@
movlpd %xmm0, -16 * SIZE(B3)
jmp .L999
ALIGN_4
-
+
.L999:
popq %rbp
popq %r12
diff --git a/kernel/x86_64/dgemm_tcopy_8_bulldozer.S b/kernel/x86_64/dgemm_tcopy_8_bulldozer.S
index d7fc416..a9dd253 100644
--- a/kernel/x86_64/dgemm_tcopy_8_bulldozer.S
+++ b/kernel/x86_64/dgemm_tcopy_8_bulldozer.S
@@ -82,7 +82,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
@@ -650,7 +650,7 @@
vmovsd %xmm0, -16 * SIZE(B3)
jmp .L999
ALIGN_4
-
+
.L999:
popq %rbp
popq %r12
diff --git a/kernel/x86_64/dgemv_n.S b/kernel/x86_64/dgemv_n.S
index 5f4c404..58dd43b 100644
--- a/kernel/x86_64/dgemv_n.S
+++ b/kernel/x86_64/dgemv_n.S
@@ -48,7 +48,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_A %rcx
@@ -67,7 +67,7 @@
#else
#define STACKSIZE 288
-
+
#define OLD_M %rcx
#define OLD_N %rdx
#define OLD_A 40 + STACKSIZE(%rsp)
@@ -113,7 +113,7 @@
#define TMP_M %r15
#define Y2 %rbx
-
+
PROLOGUE
PROFCODE
@@ -176,7 +176,7 @@
addq M, I
jle .L999x
movq I, M
-
+
.L00t:
movq XX,X
movq AA,A
@@ -203,7 +203,7 @@
testq $SIZE, A
cmoveq M, MM
#endif
-
+
testq N, N # if n <= 0 goto END
jle .L999
testq M, M # if n <= 0 goto END
@@ -221,7 +221,7 @@
#endif
movq BUFFER, Y1
-
+
pxor %xmm4, %xmm4
movq M, %rax
@@ -316,7 +316,7 @@
movsd ALPHA, %xmm0
unpcklpd %xmm0, %xmm0
#endif
-
+
mulpd %xmm0, %xmm8
mulpd %xmm0, %xmm9
mulpd %xmm0, %xmm10
@@ -875,7 +875,7 @@
movsd ALPHA, %xmm0
unpcklpd %xmm0, %xmm0
#endif
-
+
mulpd %xmm0, %xmm12
mulpd %xmm0, %xmm13
mulpd %xmm0, %xmm14
@@ -1409,7 +1409,7 @@
.L36:
testq $2, MM
je .L37
-
+
MOVUPS_A1(-16 * SIZE, A1, %xmm8)
MOVUPS_A1(-16 * SIZE, A2, %xmm9)
@@ -1675,7 +1675,7 @@
movsd ALPHA, %xmm0
unpcklpd %xmm0, %xmm0
#endif
-
+
mulpd %xmm0, %xmm12
mulpd %xmm0, %xmm13
mulpd %xmm0, %xmm14
@@ -2241,7 +2241,7 @@
.L66:
testq $2, MM
je .L67
-
+
MOVUPS_A1(-16 * SIZE, A1, %xmm4)
MOVUPS_A1(-15 * SIZE, A2, %xmm5)
@@ -2469,7 +2469,7 @@
#endif
movq M, TMP_M
movq Y, Y1
-
+
cmpq $SIZE, INCY
jne .L950
@@ -2702,7 +2702,7 @@
jmp .L999
ALIGN_4
-.L950:
+.L950:
testq $SIZE, BUFFER
je .L960
diff --git a/kernel/x86_64/dgemv_n_atom.S b/kernel/x86_64/dgemv_n_atom.S
index 27a763a..ed6a585 100644
--- a/kernel/x86_64/dgemv_n_atom.S
+++ b/kernel/x86_64/dgemv_n_atom.S
@@ -47,7 +47,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
@@ -66,7 +66,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_LDA 48 + STACKSIZE(%rsp)
#define OLD_X 56 + STACKSIZE(%rsp)
@@ -87,7 +87,7 @@
#define INCY %r10
#endif
-
+
#define I %rax
#define J %r11
#define A1 %r12
@@ -95,7 +95,7 @@
#define Y1 %r14
#define BUFFER %r15
#define MM %rbx
-
+
#define ALPHA %xmm15
PROLOGUE
diff --git a/kernel/x86_64/dgemv_n_bulldozer.S b/kernel/x86_64/dgemv_n_bulldozer.S
index ef2c4e2..bc00d67 100644
--- a/kernel/x86_64/dgemv_n_bulldozer.S
+++ b/kernel/x86_64/dgemv_n_bulldozer.S
@@ -57,7 +57,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_A %rcx
@@ -71,7 +71,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_M %rcx
#define OLD_N %rdx
#define OLD_A 40 + STACKSIZE(%rsp)
@@ -172,7 +172,7 @@
testq $SIZE, A
cmoveq M, MM
#endif
-
+
testq N, N # if n <= 0 goto END
jle .L999
testq M, M # if n <= 0 goto END
@@ -190,7 +190,7 @@
#endif
movq BUFFER, Y1
-
+
vxorpd %xmm4, %xmm4, %xmm4
movq M, %rax
@@ -255,7 +255,7 @@
addq INCX, X
vmovddup ALPHA, %xmm0
-
+
vmulpd %xmm0, %xmm8 , %xmm8
vmulpd %xmm0, %xmm9 , %xmm9
vmulpd %xmm0, %xmm10 , %xmm10
@@ -561,7 +561,7 @@
vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0
vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0
-
+
vmovsd %xmm0, -16 * SIZE(Y1)
ALIGN_3
@@ -1035,7 +1035,7 @@
.L36:
testq $2, MM
je .L37
-
+
VMOVUPS_A1(-16 * SIZE, A1, %xmm8)
VMOVUPS_A1(-16 * SIZE, A2, %xmm9)
@@ -1255,7 +1255,7 @@
addq INCX, X
vmovddup ALPHA, %xmm0
-
+
vmulpd %xmm0, %xmm12 , %xmm12
vmulpd %xmm0, %xmm13 , %xmm13
vmulpd %xmm0, %xmm14 , %xmm14
@@ -1306,7 +1306,7 @@
.L53:
-
+
vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
VMOVUPS_A1(-10 * SIZE, A1, %xmm7)
@@ -1724,7 +1724,7 @@
.L66:
testq $2, MM
je .L67
-
+
VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
VMOVUPS_A1(-15 * SIZE, A2, %xmm5)
@@ -2142,7 +2142,7 @@
jmp .L999
ALIGN_4
-.L950:
+.L950:
testq $SIZE, BUFFER
je .L960
diff --git a/kernel/x86_64/dgemv_t.S b/kernel/x86_64/dgemv_t.S
index 3d132c3..9277774 100644
--- a/kernel/x86_64/dgemv_t.S
+++ b/kernel/x86_64/dgemv_t.S
@@ -48,7 +48,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_A %rcx
@@ -64,7 +64,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_M %rcx
#define OLD_N %rdx
#define OLD_A 40 + STACKSIZE(%rsp)
@@ -180,7 +180,7 @@
jle .L999x
movq %rax,M
-.L00:
+.L00:
movq LDAX,LDA
movq NN,N
movq AA,A
@@ -205,7 +205,7 @@
jle .L999
movq BUFFER, X1
-
+
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L01
diff --git a/kernel/x86_64/dgemv_t_atom.S b/kernel/x86_64/dgemv_t_atom.S
index 246bdd3..1e63c42 100644
--- a/kernel/x86_64/dgemv_t_atom.S
+++ b/kernel/x86_64/dgemv_t_atom.S
@@ -47,12 +47,12 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
-
+
#define M %rdi
#define N %rsi
#define A %rcx
@@ -66,7 +66,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_LDA 48 + STACKSIZE(%rsp)
#define OLD_X 56 + STACKSIZE(%rsp)
@@ -95,7 +95,7 @@
#define Y1 %r15
#define ALPHA %xmm3
-
+
PROLOGUE
PROFCODE
@@ -130,7 +130,7 @@
movq OLD_Y, Y
movq OLD_INCY, INCY
movq OLD_BUFFER, BUFFER
-
+
leaq (,INCX, SIZE), INCX
leaq (,INCY, SIZE), INCY
leaq (, LDA, SIZE), LDA
diff --git a/kernel/x86_64/dgemv_t_bulldozer.S b/kernel/x86_64/dgemv_t_bulldozer.S
index 36ae2b9..9cd44ee 100644
--- a/kernel/x86_64/dgemv_t_bulldozer.S
+++ b/kernel/x86_64/dgemv_t_bulldozer.S
@@ -57,7 +57,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_A %rcx
@@ -73,7 +73,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_M %rcx
#define OLD_N %rdx
#define OLD_A 40 + STACKSIZE(%rsp)
@@ -185,7 +185,7 @@
jle .L999x
movq %rax,M
-.L00:
+.L00:
movq LDAX,LDA
movq NN,N
movq AA,A
@@ -210,7 +210,7 @@
jle .L999
movq BUFFER, X1
-
+
movq M, I
sarq $3, I
jle .L05
@@ -932,7 +932,7 @@
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1
-
+
addq $2 * SIZE, A1
addq $2 * SIZE, A2
addq $2 * SIZE, X1
diff --git a/kernel/x86_64/dot_atom.S b/kernel/x86_64/dot_atom.S
index bc67b28..794cf14 100644
--- a/kernel/x86_64/dot_atom.S
+++ b/kernel/x86_64/dot_atom.S
@@ -60,9 +60,9 @@
SAVEREGISTERS
- leaq (, INCX, SIZE), INCX
+ leaq (, INCX, SIZE), INCX
pxor %xmm0, %xmm0
- leaq (, INCY, SIZE), INCY
+ leaq (, INCY, SIZE), INCY
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S
index 985ce9f..6886222 100644
--- a/kernel/x86_64/dot_sse.S
+++ b/kernel/x86_64/dot_sse.S
@@ -60,8 +60,8 @@
SAVEREGISTERS
- leaq (, INCX, SIZE), INCX
- leaq (, INCY, SIZE), INCY
+ leaq (, INCX, SIZE), INCX
+ leaq (, INCY, SIZE), INCY
xorps %xmm0, %xmm0
xorps %xmm1, %xmm1
@@ -1278,7 +1278,7 @@
#ifndef HAVE_SSE3
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
-
+
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
addss %xmm1, %xmm0
diff --git a/kernel/x86_64/dot_sse2.S b/kernel/x86_64/dot_sse2.S
index 875bf4e..ceb2d0c 100644
--- a/kernel/x86_64/dot_sse2.S
+++ b/kernel/x86_64/dot_sse2.S
@@ -60,8 +60,8 @@
SAVEREGISTERS
- leaq (, INCX, SIZE), INCX
- leaq (, INCY, SIZE), INCY
+ leaq (, INCX, SIZE), INCX
+ leaq (, INCY, SIZE), INCY
xorps %xmm0, %xmm0
xorps %xmm1, %xmm1
diff --git a/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S b/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S
index 9e15fa2..bccf1c9 100644
--- a/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S
+++ b/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -147,79 +147,79 @@
vmovddup -16 * SIZE(AO), %xmm8
vmulpd %xmm0 , %xmm8 , %xmm0
vmovddup -15 * SIZE(AO), %xmm9
- vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1
+ vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1
vmovddup -14 * SIZE(AO), %xmm10
- vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2
+ vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2
vmovddup -13 * SIZE(AO), %xmm11
- vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3
+ vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3
vmovddup -12 * SIZE(AO), %xmm8
- vfnmaddpd %xmm4 , %xmm0 , %xmm8 , %xmm4
+ vfnmaddpd %xmm4 , %xmm0 , %xmm8 , %xmm4
vmovddup -11 * SIZE(AO), %xmm9
- vfnmaddpd %xmm5 , %xmm0 , %xmm9 , %xmm5
+ vfnmaddpd %xmm5 , %xmm0 , %xmm9 , %xmm5
vmovddup -10 * SIZE(AO), %xmm10
- vfnmaddpd %xmm6 , %xmm0 , %xmm10, %xmm6
+ vfnmaddpd %xmm6 , %xmm0 , %xmm10, %xmm6
vmovddup -9 * SIZE(AO), %xmm11
- vfnmaddpd %xmm7 , %xmm0 , %xmm11, %xmm7
+ vfnmaddpd %xmm7 , %xmm0 , %xmm11, %xmm7
vmovddup -7 * SIZE(AO), %xmm8
vmulpd %xmm1 , %xmm8 , %xmm1
vmovddup -6 * SIZE(AO), %xmm10
- vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2
+ vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2
vmovddup -5 * SIZE(AO), %xmm11
- vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3
+ vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3
vmovddup -4 * SIZE(AO), %xmm8
- vfnmaddpd %xmm4 , %xmm1 , %xmm8 , %xmm4
+ vfnmaddpd %xmm4 , %xmm1 , %xmm8 , %xmm4
vmovddup -3 * SIZE(AO), %xmm9
- vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5
+ vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5
vmovddup -2 * SIZE(AO), %xmm10
- vfnmaddpd %xmm6 , %xmm1 , %xmm10, %xmm6
+ vfnmaddpd %xmm6 , %xmm1 , %xmm10, %xmm6
vmovddup -1 * SIZE(AO), %xmm11
- vfnmaddpd %xmm7 , %xmm1 , %xmm11, %xmm7
+ vfnmaddpd %xmm7 , %xmm1 , %xmm11, %xmm7
vmovddup 2 * SIZE(AO), %xmm8
vmulpd %xmm2 , %xmm8 , %xmm2
vmovddup 3 * SIZE(AO), %xmm11
- vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3
+ vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3
vmovddup 4 * SIZE(AO), %xmm8
- vfnmaddpd %xmm4 , %xmm2 , %xmm8 , %xmm4
+ vfnmaddpd %xmm4 , %xmm2 , %xmm8 , %xmm4
vmovddup 5 * SIZE(AO), %xmm9
- vfnmaddpd %xmm5 , %xmm2 , %xmm9 , %xmm5
+ vfnmaddpd %xmm5 , %xmm2 , %xmm9 , %xmm5
vmovddup 6 * SIZE(AO), %xmm10
- vfnmaddpd %xmm6 , %xmm2 , %xmm10, %xmm6
+ vfnmaddpd %xmm6 , %xmm2 , %xmm10, %xmm6
vmovddup 7 * SIZE(AO), %xmm11
- vfnmaddpd %xmm7 , %xmm2 , %xmm11, %xmm7
+ vfnmaddpd %xmm7 , %xmm2 , %xmm11, %xmm7
vmovddup 11 * SIZE(AO), %xmm8
vmulpd %xmm3 , %xmm8 , %xmm3
vmovddup 12 * SIZE(AO), %xmm11
- vfnmaddpd %xmm4 , %xmm3 , %xmm11, %xmm4
+ vfnmaddpd %xmm4 , %xmm3 , %xmm11, %xmm4
vmovddup 13 * SIZE(AO), %xmm9
- vfnmaddpd %xmm5 , %xmm3 , %xmm9 , %xmm5
+ vfnmaddpd %xmm5 , %xmm3 , %xmm9 , %xmm5
vmovddup 14 * SIZE(AO), %xmm10
- vfnmaddpd %xmm6 , %xmm3 , %xmm10, %xmm6
+ vfnmaddpd %xmm6 , %xmm3 , %xmm10, %xmm6
vmovddup 15 * SIZE(AO), %xmm11
- vfnmaddpd %xmm7 , %xmm3 , %xmm11, %xmm7
+ vfnmaddpd %xmm7 , %xmm3 , %xmm11, %xmm7
vmovddup 20 * SIZE(AO), %xmm8
vmulpd %xmm4 , %xmm8 , %xmm4
vmovddup 21 * SIZE(AO), %xmm9
- vfnmaddpd %xmm5 , %xmm4 , %xmm9 , %xmm5
+ vfnmaddpd %xmm5 , %xmm4 , %xmm9 , %xmm5
vmovddup 22 * SIZE(AO), %xmm10
- vfnmaddpd %xmm6 , %xmm4 , %xmm10, %xmm6
+ vfnmaddpd %xmm6 , %xmm4 , %xmm10, %xmm6
vmovddup 23 * SIZE(AO), %xmm11
- vfnmaddpd %xmm7 , %xmm4 , %xmm11, %xmm7
+ vfnmaddpd %xmm7 , %xmm4 , %xmm11, %xmm7
vmovddup 29 * SIZE(AO), %xmm8
vmulpd %xmm5 , %xmm8 , %xmm5
vmovddup 30 * SIZE(AO), %xmm10
- vfnmaddpd %xmm6 , %xmm5 , %xmm10, %xmm6
+ vfnmaddpd %xmm6 , %xmm5 , %xmm10, %xmm6
vmovddup 31 * SIZE(AO), %xmm11
- vfnmaddpd %xmm7 , %xmm5 , %xmm11, %xmm7
+ vfnmaddpd %xmm7 , %xmm5 , %xmm11, %xmm7
vmovddup 38 * SIZE(AO), %xmm8
vmulpd %xmm6 , %xmm8 , %xmm6
vmovddup 39 * SIZE(AO), %xmm11
- vfnmaddpd %xmm7 , %xmm6 , %xmm11, %xmm7
+ vfnmaddpd %xmm7 , %xmm6 , %xmm11, %xmm7
vmovddup 47 * SIZE(AO), %xmm8
vmulpd %xmm7 , %xmm8 , %xmm7
@@ -292,23 +292,23 @@
vmovddup -16 * SIZE(AO), %xmm8
vmulpd %xmm0 , %xmm8 , %xmm0
vmovddup -15 * SIZE(AO), %xmm9
- vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1
+ vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1
vmovddup -14 * SIZE(AO), %xmm10
- vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2
+ vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2
vmovddup -13 * SIZE(AO), %xmm11
- vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3
+ vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3
vmovddup -11 * SIZE(AO), %xmm8
vmulpd %xmm1 , %xmm8 , %xmm1
vmovddup -10 * SIZE(AO), %xmm10
- vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2
+ vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2
vmovddup -9 * SIZE(AO), %xmm11
- vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3
+ vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3
vmovddup -6 * SIZE(AO), %xmm8
vmulpd %xmm2 , %xmm8 , %xmm2
vmovddup -5 * SIZE(AO), %xmm11
- vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3
+ vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3
vmovddup -1 * SIZE(AO), %xmm8
vmulpd %xmm3 , %xmm8 , %xmm3
@@ -356,7 +356,7 @@
vmovddup -16 * SIZE(AO), %xmm8
vmulpd %xmm0 , %xmm8 , %xmm0
vmovddup -15 * SIZE(AO), %xmm9
- vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1
+ vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1
vmovddup -13 * SIZE(AO), %xmm8
vmulpd %xmm1 , %xmm8 , %xmm1
@@ -617,7 +617,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
@@ -758,7 +758,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
/*********************************************************************************/
@@ -828,7 +828,7 @@
addq $4, KK
- ALIGN_4
+ ALIGN_4
/*********************************************************************************/
@@ -896,7 +896,7 @@
addq $2, KK
- ALIGN_4
+ ALIGN_4
/********************************************************************************/
.L70:
testq $1, M
@@ -961,8 +961,8 @@
addq $1, KK
- ALIGN_4
-
+ ALIGN_4
+
.L79:
movq BO, B
@@ -1048,7 +1048,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
/*****************************************************************************/
.L90_A:
@@ -1113,7 +1113,7 @@
addq $4, KK
- ALIGN_4
+ ALIGN_4
/*************************************************************************************/
.L100:
@@ -1178,7 +1178,7 @@
addq $2, KK
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -1242,7 +1242,7 @@
addq $1, KK
- ALIGN_4
+ ALIGN_4
.L119:
@@ -1251,7 +1251,7 @@
ALIGN_4
-
+
.L999:
movq (%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S b/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S
index 8d3964a..9f693f8 100644
--- a/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S
+++ b/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -133,10 +133,10 @@
vmulpd %xmm3 , %xmm8 , %xmm3
vmovddup -15 * SIZE(BO), %xmm9
- vfnmaddpd %xmm4 , %xmm0 , %xmm9 , %xmm4
- vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5
- vfnmaddpd %xmm6 , %xmm2 , %xmm9 , %xmm6
- vfnmaddpd %xmm7 , %xmm3 , %xmm9 , %xmm7
+ vfnmaddpd %xmm4 , %xmm0 , %xmm9 , %xmm4
+ vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5
+ vfnmaddpd %xmm6 , %xmm2 , %xmm9 , %xmm6
+ vfnmaddpd %xmm7 , %xmm3 , %xmm9 , %xmm7
vmovddup -13 * SIZE(BO), %xmm10
vmulpd %xmm4 , %xmm10, %xmm4
@@ -198,8 +198,8 @@
vmulpd %xmm1 , %xmm8 , %xmm1
vmovddup -15 * SIZE(BO), %xmm9
- vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2
- vfnmaddpd %xmm3 , %xmm1 , %xmm9 , %xmm3
+ vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2
+ vfnmaddpd %xmm3 , %xmm1 , %xmm9 , %xmm3
vmovddup -13 * SIZE(BO), %xmm10
vmulpd %xmm2 , %xmm10, %xmm2
@@ -242,7 +242,7 @@
vmulpd %xmm0 , %xmm8 , %xmm0
vmovddup -15 * SIZE(BO), %xmm9
- vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2
+ vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2
vmovddup -13 * SIZE(BO), %xmm10
vmulpd %xmm2 , %xmm10, %xmm2
@@ -253,7 +253,7 @@
vmovups %xmm0 , -16 * SIZE(AO)
vmovups %xmm2 , -14 * SIZE(AO)
-
+
.endm
@@ -278,7 +278,7 @@
vmulsd %xmm2 , %xmm8 , %xmm2
vmovsd -15 * SIZE(BO), %xmm9
- vfnmaddsd %xmm0 , %xmm2 , %xmm9 , %xmm0
+ vfnmaddsd %xmm0 , %xmm2 , %xmm9 , %xmm0
vmovsd -13 * SIZE(BO), %xmm10
vmulsd %xmm0 , %xmm10, %xmm0
@@ -336,7 +336,7 @@
vmovups %xmm1 , -14 * SIZE(AO)
vmovups %xmm2 , -12 * SIZE(AO)
vmovups %xmm3 , -10 * SIZE(AO)
-
+
.endm
@@ -428,7 +428,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
@@ -566,7 +566,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
/*********************************************************************************/
@@ -634,7 +634,7 @@
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
- ALIGN_4
+ ALIGN_4
/*********************************************************************************/
@@ -700,7 +700,7 @@
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
- ALIGN_4
+ ALIGN_4
/********************************************************************************/
.L70:
testq $1, M
@@ -763,8 +763,8 @@
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
- ALIGN_4
-
+ ALIGN_4
+
.L79:
addq $2, KK // number of values in B # only for RN Kernel
@@ -847,7 +847,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
/*****************************************************************************/
.L90_A:
@@ -909,7 +909,7 @@
leaq (AO, %rax, 4), AO
addq %rax, BO
- ALIGN_4
+ ALIGN_4
/*************************************************************************************/
.L100:
@@ -972,7 +972,7 @@
leaq (AO, %rax, 2), AO
addq %rax, BO
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -1034,7 +1034,7 @@
addq %rax, AO
addq %rax, BO
- ALIGN_4
+ ALIGN_4
.L119:
@@ -1045,7 +1045,7 @@
ALIGN_4
-
+
.L999:
movq (%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/gemm_beta.S b/kernel/x86_64/gemm_beta.S
index 461df50..09df2b7 100644
--- a/kernel/x86_64/gemm_beta.S
+++ b/kernel/x86_64/gemm_beta.S
@@ -118,7 +118,7 @@
#ifdef OPTERON
prefetchw 32 * SIZE(C1)
#endif
-
+
MOVSD %xmm0, 0 * SIZE(C1)
MOVSD %xmm0, 1 * SIZE(C1)
MOVSD %xmm0, 2 * SIZE(C1)
diff --git a/kernel/x86_64/gemm_kernel_2x8_nehalem.S b/kernel/x86_64/gemm_kernel_2x8_nehalem.S
index 24e66d7..7e4b0d8 100644
--- a/kernel/x86_64/gemm_kernel_2x8_nehalem.S
+++ b/kernel/x86_64/gemm_kernel_2x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -97,7 +97,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -142,7 +142,7 @@
#endif
movlps %xmm0, ALPHA
-
+
subq $-16 * SIZE, A
subq $-16 * SIZE, B
@@ -156,7 +156,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
@@ -202,7 +202,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
xorps %xmm1, %xmm1
xorps %xmm2, %xmm2
@@ -241,7 +241,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -256,7 +256,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm12
@@ -577,7 +577,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -597,7 +597,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -611,7 +611,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -626,7 +626,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -773,7 +773,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -822,7 +822,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -844,7 +844,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -859,7 +859,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -1037,7 +1037,7 @@
decq I
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1057,7 +1057,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1071,7 +1071,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1086,7 +1086,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -1190,7 +1190,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1235,7 +1235,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -1253,7 +1253,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1268,7 +1268,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -1387,7 +1387,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1407,7 +1407,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1419,7 +1419,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1434,7 +1434,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
mulpd %xmm0, %xmm1
movddup -15 * SIZE(AO), %xmm0
addpd %xmm1, %xmm8
@@ -1516,7 +1516,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1560,7 +1560,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -1577,7 +1577,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1592,7 +1592,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -1683,7 +1683,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L80:
testq $1, M
@@ -1703,7 +1703,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
#ifndef TRMMKERNEL
movaps -16 * SIZE(AO), %xmm0
@@ -1739,7 +1739,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
mulpd %xmm0, %xmm1
#ifndef TRMMKERNEL
movaps -14 * SIZE(AO), %xmm0
@@ -1818,7 +1818,7 @@
#endif
movsd %xmm8, (CO1)
- ALIGN_4
+ ALIGN_4
.L999:
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/gemm_kernel_4x2_atom.S b/kernel/x86_64/gemm_kernel_4x2_atom.S
index 47b16ce..e5f2e91 100644
--- a/kernel/x86_64/gemm_kernel_4x2_atom.S
+++ b/kernel/x86_64/gemm_kernel_4x2_atom.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -90,7 +90,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -134,27 +134,27 @@
#endif
movsd %xmm0, ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
#endif
-
+#endif
+
leaq (, LDC, SIZE), LDC
movq N, J
sarq $1, J
jle .L40
ALIGN_4
-
+
.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -165,7 +165,7 @@
movq K, %rax
salq $BASE_SHIFT + 1, %rax
leaq (B, %rax), BB
-
+
movq M, I
sarq $2, I
jle .L20
@@ -182,7 +182,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -214,7 +214,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -237,7 +237,7 @@
addsd %xmm7, %xmm14
movsd 3 * SIZE(AO), %xmm7
mulsd %xmm3, %xmm2
-
+
addsd %xmm6, %xmm15
movaps %xmm4, %xmm6
mulsd %xmm1, %xmm4
@@ -499,7 +499,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -521,7 +521,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -711,7 +711,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm7, %xmm7
@@ -728,7 +728,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -846,7 +846,7 @@
addq $1, KK
#endif
ALIGN_4
-
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -864,7 +864,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1
addq LDC, C
@@ -887,7 +887,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm9, %xmm9
@@ -911,7 +911,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1097,7 +1097,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -1114,7 +1114,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1250,7 +1250,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm5, %xmm5
@@ -1269,7 +1269,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1355,7 +1355,7 @@
movsd %xmm8, 0 * SIZE(CO1)
ALIGN_4
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/gemm_kernel_4x4_barcelona.S b/kernel/x86_64/gemm_kernel_4x4_barcelona.S
index f7015c0..9a29a80 100644
--- a/kernel/x86_64/gemm_kernel_4x4_barcelona.S
+++ b/kernel/x86_64/gemm_kernel_4x4_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -295,7 +295,7 @@
movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\
movapd %xmm0, %xmm2 ;\
addq $8 * SIZE, %rax ;\
-
+
#define KERNEL_SUB1(xx) \
mulpd %xmm1, %xmm0 ;\
mulpd -14 * SIZE(AO), %xmm1 ;\
@@ -400,7 +400,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
@@ -458,7 +458,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq N, J
sarq $2, J # j = (n >> 2)
@@ -468,13 +468,13 @@
.L01:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 2), CO2 # coffset2 = c + ldc
-
+
leaq (C, LDC, 4), C # c += 4 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq A, AO # aoffset = a
@@ -497,7 +497,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -526,7 +526,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -784,7 +784,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $3, M
@@ -804,7 +804,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -821,7 +821,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -981,7 +981,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -996,7 +996,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1013,7 +1013,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1129,13 +1129,13 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $4, KK
#endif
-
+
movq BO, B
decq J # j --
@@ -1154,7 +1154,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1179,7 +1179,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm1
movddup -15 * SIZE(BO), %xmm5
@@ -1202,7 +1202,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1359,7 +1359,7 @@
addq $4 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1376,7 +1376,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1392,7 +1392,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1480,7 +1480,7 @@
.L69:
addpd %xmm10, %xmm8
addpd %xmm11, %xmm9
-
+
#ifndef TRMMKERNEL
movupd (CO1), %xmm0
movupd (CO2), %xmm2
@@ -1514,7 +1514,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1531,7 +1531,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1547,7 +1547,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1646,8 +1646,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
@@ -1667,7 +1667,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -1687,7 +1687,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movapd -8 * SIZE(AO), %xmm2
xorps %xmm8, %xmm8
@@ -1706,7 +1706,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1824,7 +1824,7 @@
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -1841,7 +1841,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm0
xorps %xmm8, %xmm8
@@ -1857,7 +1857,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1958,7 +1958,7 @@
addq $2 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -1975,7 +1975,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1987,7 +1987,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2063,7 +2063,7 @@
movsd %xmm8, (CO1)
ALIGN_4
-
+
.L999:
movq (%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/gemm_kernel_4x4_core2.S b/kernel/x86_64/gemm_kernel_4x4_core2.S
index fa79fe0..2f2ddc8 100644
--- a/kernel/x86_64/gemm_kernel_4x4_core2.S
+++ b/kernel/x86_64/gemm_kernel_4x4_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -91,7 +91,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -142,7 +142,7 @@
unpcklpd %xmm0, %xmm0
movapd %xmm0, ALPHA
-
+
subq $-16 * SIZE, A
subq $-16 * SIZE, B
@@ -156,7 +156,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq N, J
@@ -168,11 +168,11 @@
.L01:
/* Copying to Sub Buffer */
leaq 16 * SIZE + BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movapd -16 * SIZE(B), %xmm0
movapd -8 * SIZE(B), %xmm4
@@ -182,7 +182,7 @@
NOBRANCH
jle .L05
ALIGN_3
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
prefetcht0 (PREFETCH_R + 8) * SIZE(B)
@@ -274,7 +274,7 @@
BRANCH
jne .L06
ALIGN_4
-
+
.L10:
leaq (PREFETCH_R + 0) * SIZE(B), BB
@@ -300,7 +300,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
prefetcht2 (BB)
@@ -334,7 +334,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -349,7 +349,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PADDING;
addpd %xmm2, %xmm10
movaps -15 * SIZE(BO), %xmm2
@@ -597,7 +597,7 @@
BRANCH
jg .L11
jmp .L20
- ALIGN_4
+ ALIGN_4
.L18x:
#ifndef TRMMKERNEL
@@ -665,7 +665,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -683,7 +683,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movapd -16 * SIZE(AO), %xmm0
@@ -701,7 +701,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -716,7 +716,7 @@
jle .L25
ALIGN_4
-.L21:
+.L21:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm8
movapd -16 * SIZE(BO), %xmm2
@@ -872,7 +872,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
ALIGN_4
-
+
.L30:
testq $1, M
BRANCH
@@ -889,7 +889,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movsd -16 * SIZE(AO), %xmm0
@@ -908,7 +908,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -923,7 +923,7 @@
jle .L35
ALIGN_4
-.L31:
+.L31:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addsd %xmm2, %xmm8
movsd -16 * SIZE(BO), %xmm2
@@ -1088,11 +1088,11 @@
.L41:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $3, %rax
@@ -1100,7 +1100,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L42:
movddup -16 * SIZE(B), %xmm8
movddup -15 * SIZE(B), %xmm9
@@ -1146,7 +1146,7 @@
subq $1, %rax
jne .L44
ALIGN_4
-
+
.L45:
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -1169,7 +1169,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1193,7 +1193,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1207,7 +1207,7 @@
jle .L55
ALIGN_4
-.L51:
+.L51:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm8
@@ -1369,7 +1369,7 @@
addq $4 * SIZE, CO2
subq $1, I
jg .L50
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1386,7 +1386,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movapd -16 * SIZE(AO), %xmm0
@@ -1406,7 +1406,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1420,7 +1420,7 @@
jle .L65
ALIGN_4
-.L61:
+.L61:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm8
@@ -1531,7 +1531,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
ALIGN_4
-
+
.L70:
testq $1, M
jle .L79
@@ -1547,7 +1547,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movsd -15 * SIZE(AO), %xmm1
@@ -1566,7 +1566,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1580,7 +1580,7 @@
jle .L75
ALIGN_4
-.L71:
+.L71:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addsd %xmm2, %xmm8
@@ -1702,11 +1702,11 @@
.L81:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $4, %rax
@@ -1714,7 +1714,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L82:
movddup -16 * SIZE(B), %xmm8
movddup -15 * SIZE(B), %xmm9
@@ -1757,7 +1757,7 @@
subq $1, %rax
jne .L84
ALIGN_4
-
+
.L85:
movq C, CO1
movq A, AO
@@ -1779,7 +1779,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movapd -16 * SIZE(BO), %xmm4
@@ -1801,7 +1801,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1815,7 +1815,7 @@
jle .L95
ALIGN_4
-.L91:
+.L91:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm0, %xmm8
@@ -1927,7 +1927,7 @@
addq $4 * SIZE, CO1
subq $1, I
jg .L90
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -1944,7 +1944,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movapd -16 * SIZE(BO), %xmm4
@@ -1965,7 +1965,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1979,7 +1979,7 @@
jle .L105
ALIGN_4
-.L101:
+.L101:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm0, %xmm8
@@ -2068,7 +2068,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
ALIGN_4
-
+
.L110:
testq $1, M
jle .L999
@@ -2084,7 +2084,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movsd -16 * SIZE(BO), %xmm4
@@ -2105,7 +2105,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2119,7 +2119,7 @@
jle .L115
ALIGN_4
-.L111:
+.L111:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm0, %xmm8
diff --git a/kernel/x86_64/gemm_kernel_4x4_penryn.S b/kernel/x86_64/gemm_kernel_4x4_penryn.S
index 3179c7d..56611e5 100644
--- a/kernel/x86_64/gemm_kernel_4x4_penryn.S
+++ b/kernel/x86_64/gemm_kernel_4x4_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -121,7 +121,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -166,7 +166,7 @@
#endif
movlps %xmm0, ALPHA
-
+
subq $-16 * SIZE, A
subq $-17 * SIZE, B
@@ -180,7 +180,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
@@ -223,7 +223,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
xorpd %xmm3, %xmm3
@@ -255,7 +255,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -270,7 +270,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm3, %xmm11
movaps -15 * SIZE(BO), %xmm3
@@ -543,7 +543,7 @@
BRANCH
jg .L11
jmp .L20
- ALIGN_4
+ ALIGN_4
.L18x:
#ifndef TRMMKERNEL
@@ -611,7 +611,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -631,7 +631,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -17 * SIZE(BO), %xmm2
@@ -652,7 +652,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -667,7 +667,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
addpd %xmm3, %xmm11
movaps -15 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -831,7 +831,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -851,7 +851,7 @@
leaq (, %rax, SIZE), %rax
addq %rax, AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -17 * SIZE(BO), %xmm2
@@ -867,7 +867,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -882,7 +882,7 @@
jle .L35
ALIGN_4
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -1002,7 +1002,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1031,7 +1031,7 @@
movq OFFSET, %rax
movq %rax, KK
#endif
-
+
movq K, %rax
salq $BASE_SHIFT + 1, %rax
leaq (B, %rax), BB
@@ -1055,7 +1055,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
PREFETCHB -16 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -1076,7 +1076,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1091,7 +1091,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
@@ -1265,7 +1265,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1285,7 +1285,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1299,7 +1299,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1314,7 +1314,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -1431,7 +1431,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1451,7 +1451,7 @@
leaq (, %rax, SIZE), %rax
addq %rax, AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -17 * SIZE(BO), %xmm2
@@ -1464,7 +1464,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1479,7 +1479,7 @@
jle .L75
ALIGN_4
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -1570,7 +1570,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1613,7 +1613,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
addq %rax, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -1630,7 +1630,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1645,7 +1645,7 @@
jle .L95
ALIGN_4
-.L92:
+.L92:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -1777,7 +1777,7 @@
decq I
BRANCH
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -1797,7 +1797,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
addq %rax, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1809,7 +1809,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1824,7 +1824,7 @@
jle .L105
ALIGN_4
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -1923,13 +1923,13 @@
#endif
addq $2 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
BRANCH
jle .L999
-
+
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
@@ -1942,7 +1942,7 @@
leaq (, %rax, SIZE), %rax
addq %rax, AO
addq %rax, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movsd -17 * SIZE(BO), %xmm2
@@ -1955,7 +1955,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1970,7 +1970,7 @@
jle .L115
ALIGN_4
-.L112:
+.L112:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulsd %xmm0, %xmm2
@@ -2041,7 +2041,7 @@
#endif
movlpd %xmm8, 0 * SIZE(CO1)
- ALIGN_4
+ ALIGN_4
.L999:
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/gemm_kernel_4x4_sse2.S b/kernel/x86_64/gemm_kernel_4x4_sse2.S
index 1060197..bc317da 100644
--- a/kernel/x86_64/gemm_kernel_4x4_sse2.S
+++ b/kernel/x86_64/gemm_kernel_4x4_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -328,11 +328,11 @@
movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm6, %xmm15 ;\
movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
-#endif
+#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -398,7 +398,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq N, J
sarq $2, J # j = (n >> 2)
@@ -410,17 +410,17 @@
leaq 16 * SIZE + BUFFER, BO
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_3
-
+
#define RPREFETCHSIZE (8 * 7 + 4)
#define WPREFETCHSIZE (8 * 8 + 4)
@@ -528,7 +528,7 @@
subq $1, %rax
jne .L04
ALIGN_3
-
+
.L10:
movq A, AO # aoffset = a
@@ -551,7 +551,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
@@ -582,7 +582,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -964,7 +964,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_3
+ ALIGN_3
.L20:
testq $3, M
@@ -986,7 +986,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1005,7 +1005,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1212,7 +1212,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_3
+ ALIGN_3
.L30:
testq $1, M
@@ -1231,7 +1231,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1250,7 +1250,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1445,8 +1445,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_3
-
+ ALIGN_3
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1468,17 +1468,17 @@
.L41:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $2, %rax
jle .L43
ALIGN_3
-
+
.L42:
PREFETCH 56 * SIZE(B)
@@ -1536,7 +1536,7 @@
decq %rax
jne .L44
ALIGN_3
-
+
.L50:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1559,7 +1559,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1583,7 +1583,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1794,7 +1794,7 @@
addq $4 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L51
- ALIGN_3
+ ALIGN_3
.L60:
testq $2, M
@@ -1813,7 +1813,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1832,7 +1832,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1976,7 +1976,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_3
+ ALIGN_3
.L70:
testq $1, M
@@ -1995,7 +1995,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2014,7 +2014,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2150,8 +2150,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_3
-
+ ALIGN_3
+
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2167,17 +2167,17 @@
.L81:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $3, %rax
jle .L83
ALIGN_3
-
+
.L82:
PREFETCH 56 * SIZE(B)
@@ -2232,7 +2232,7 @@
decq %rax
jne .L84
ALIGN_3
-
+
.L90:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2254,7 +2254,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2275,7 +2275,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2415,7 +2415,7 @@
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L91
- ALIGN_3
+ ALIGN_3
.L100:
testq $2, M
@@ -2434,7 +2434,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2450,7 +2450,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2554,7 +2554,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK
#endif
- ALIGN_3
+ ALIGN_3
.L110:
testq $1, M
@@ -2573,7 +2573,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2589,7 +2589,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2673,7 +2673,7 @@
#endif
movsd %xmm8, 0 * SIZE(CO1)
ALIGN_3
-
+
.L999:
movq %rbx, %rsp
diff --git a/kernel/x86_64/gemm_kernel_4x4_sse3.S b/kernel/x86_64/gemm_kernel_4x4_sse3.S
index 8cbe6ed..ae153fe 100644
--- a/kernel/x86_64/gemm_kernel_4x4_sse3.S
+++ b/kernel/x86_64/gemm_kernel_4x4_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -333,7 +333,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -377,27 +377,27 @@
#endif
movsd %xmm0, ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
#endif
-
+#endif
+
leaq (, LDC, SIZE), LDC
movq N, J
sarq $2, J # j = (n >> 2)
jle .L40
ALIGN_4
-
+
.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -406,7 +406,7 @@
movq K, %rax
salq $BASE_SHIFT + 2, %rax
leaq (B, %rax), BB
-
+
movq M, I
sarq $2, I # i = (m >> 2)
jle .L20
@@ -423,7 +423,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -456,7 +456,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -473,7 +473,7 @@
NOBRANCH
je .L15
-.L1X:
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -1076,7 +1076,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1092,7 +1092,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1300,7 +1300,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1318,7 +1318,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1334,7 +1334,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1472,8 +1472,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1493,7 +1493,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1519,7 +1519,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -1546,7 +1546,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1766,7 +1766,7 @@
addq $4 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1784,7 +1784,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1800,7 +1800,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1944,7 +1944,7 @@
#endif
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1962,7 +1962,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1978,7 +1978,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2081,8 +2081,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2099,7 +2099,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1
movq A, AO
@@ -2120,7 +2120,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2142,7 +2142,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2280,7 +2280,7 @@
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -2298,7 +2298,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2314,7 +2314,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2419,7 +2419,7 @@
#endif
addq $2 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -2437,7 +2437,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2458,7 +2458,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2530,8 +2530,8 @@
#endif
movsd %xmm0, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S
index 4d81405..0745628 100644
--- a/kernel/x86_64/gemm_kernel_4x8_nano.S
+++ b/kernel/x86_64/gemm_kernel_4x8_nano.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
@@ -56,7 +56,7 @@
#define CO1 %r15
#define CO2 %rbp
#define BB %r12
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -90,7 +90,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -145,13 +145,13 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
subq $-32 * SIZE, A
@@ -166,10 +166,10 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
leaq 32 * SIZE + BUFFER, BO
-
+
movaps 0 * SIZE(B), %xmm1
movaps 4 * SIZE(B), %xmm3
movaps 8 * SIZE(B), %xmm5
@@ -179,7 +179,7 @@
sarq $1, %rax
jle .L03
ALIGN_4
-
+
.L02:
PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
@@ -237,7 +237,7 @@
addq $ 8 * SIZE, B
subq $-16 * SIZE, BO
ALIGN_4
-
+
.L10:
movq C, CO1
leaq (C, LDC, 4), CO2
@@ -262,7 +262,7 @@
salq $BASE_SHIFT + 1, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
leaq (LDC, LDC, 2), %rax
@@ -295,7 +295,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -568,7 +568,7 @@
addq $4 * SIZE, CO2
decq I
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -585,7 +585,7 @@
salq $BASE_SHIFT + 1, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movddup -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -599,7 +599,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -760,7 +760,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
ALIGN_4
-
+
.L30:
testq $1, M
je .L39
@@ -777,7 +777,7 @@
leaq (AO, %rax, 1), AO
addq %rax, %rax
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -791,7 +791,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -972,10 +972,10 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
leaq 32 * SIZE + BUFFER, BO
-
+
movaps 0 * SIZE(B), %xmm1
movaps 4 * SIZE(B), %xmm3
movaps 8 * SIZE(B), %xmm5
@@ -985,7 +985,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
@@ -1043,7 +1043,7 @@
decq %rax
jne .L45
ALIGN_4
-
+
.L50:
movq C, CO1
leaq (C, LDC, 2), CO2
@@ -1066,7 +1066,7 @@
salq $BASE_SHIFT + 1, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -32 * SIZE(BO), %xmm1
@@ -1085,7 +1085,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1253,7 +1253,7 @@
addq $4 * SIZE, CO2
decq I
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1270,7 +1270,7 @@
salq $BASE_SHIFT + 1, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1284,7 +1284,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1399,7 +1399,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
ALIGN_4
-
+
.L70:
testq $1, M
je .L79
@@ -1415,7 +1415,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1427,7 +1427,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1558,10 +1558,10 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
leaq 32 * SIZE + BUFFER, BO
-
+
movaps 0 * SIZE(B), %xmm1
movaps 4 * SIZE(B), %xmm3
@@ -1569,7 +1569,7 @@
sarq $2, %rax
jle .L83
ALIGN_4
-
+
.L82:
pshufd $0x50, %xmm1, %xmm0
movaps %xmm0, -32 * SIZE(BO)
@@ -1609,7 +1609,7 @@
decq %rax
jne .L85
ALIGN_4
-
+
.L90:
movq C, CO1
leaq (C, LDC), CO2
@@ -1632,7 +1632,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -32 * SIZE(BO), %xmm1
@@ -1647,7 +1647,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1766,7 +1766,7 @@
addq $4 * SIZE, CO2
decq I
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -1783,7 +1783,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1794,7 +1794,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1891,7 +1891,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
ALIGN_4
-
+
.L110:
testq $1, M
je .L119
@@ -1907,7 +1907,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1918,7 +1918,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2031,10 +2031,10 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
leaq 32 * SIZE + BUFFER, BO
-
+
movsd 0 * SIZE(B), %xmm1
movhps 2 * SIZE(B), %xmm1
@@ -2042,7 +2042,7 @@
sarq $2, %rax
jle .L123
ALIGN_4
-
+
.L122:
pshufd $0x50, %xmm1, %xmm0
movaps %xmm0, -32 * SIZE(BO)
@@ -2076,7 +2076,7 @@
decq %rax
jne .L125
ALIGN_4
-
+
.L130:
movq C, CO1
movq A, AO
@@ -2098,7 +2098,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movddup -32 * SIZE(BO), %xmm1
@@ -2111,7 +2111,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2208,7 +2208,7 @@
addq $4 * SIZE, CO1
decq I
jg .L131
- ALIGN_4
+ ALIGN_4
.L140:
testq $2, M
@@ -2225,7 +2225,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2236,7 +2236,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2330,7 +2330,7 @@
addq $2 * SIZE, CO1
ALIGN_4
-
+
.L150:
testq $1, M
je .L999
@@ -2346,7 +2346,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2357,7 +2357,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
diff --git a/kernel/x86_64/gemm_kernel_4x8_nehalem.S b/kernel/x86_64/gemm_kernel_4x8_nehalem.S
index 5d02ac6..549ea13 100644
--- a/kernel/x86_64/gemm_kernel_4x8_nehalem.S
+++ b/kernel/x86_64/gemm_kernel_4x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %rbp
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -97,7 +97,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -143,7 +143,7 @@
unpcklps %xmm0, %xmm0
movlps %xmm0, ALPHA
-
+
subq $-32 * SIZE, A
subq $-32 * SIZE, B
@@ -157,7 +157,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
@@ -203,7 +203,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
leaq (LDC, LDC, 2), %rax
@@ -242,7 +242,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -257,7 +257,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm12
@@ -528,7 +528,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -547,7 +547,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -566,7 +566,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -581,7 +581,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -757,7 +757,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -776,7 +776,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -789,7 +789,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -804,7 +804,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -932,8 +932,8 @@
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $8, KK
@@ -980,7 +980,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -1002,7 +1002,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1017,7 +1017,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -1193,7 +1193,7 @@
decq I
BRANCH
jg .L41
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -1212,7 +1212,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1226,7 +1226,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1241,7 +1241,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -1354,7 +1354,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1373,7 +1373,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -1385,7 +1385,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1400,7 +1400,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -1494,8 +1494,8 @@
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $4, KK
@@ -1538,7 +1538,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -1555,7 +1555,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1570,7 +1570,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -1687,7 +1687,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -1706,7 +1706,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1720,7 +1720,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1735,7 +1735,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
addps %xmm1, %xmm8
movsd -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -1825,7 +1825,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -1844,7 +1844,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -1856,7 +1856,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1871,7 +1871,7 @@
jle .L95
ALIGN_3
-.L92:
+.L92:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movsd -32 * SIZE(BO), %xmm2
@@ -1959,8 +1959,8 @@
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
@@ -2002,7 +2002,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -2015,7 +2015,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2030,7 +2030,7 @@
jle .L105
ALIGN_3
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -2124,7 +2124,7 @@
decq I
BRANCH
jg .L101
- ALIGN_4
+ ALIGN_4
.L110:
testq $2, M
@@ -2143,7 +2143,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2154,7 +2154,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2169,7 +2169,7 @@
jle .L115
ALIGN_3
-.L112:
+.L112:
addps %xmm1, %xmm8
movss -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -2255,7 +2255,7 @@
#endif
addq $2 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L120:
testq $1, M
@@ -2274,7 +2274,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm2, %xmm2
movss -32 * SIZE(AO), %xmm0
@@ -2285,7 +2285,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2300,7 +2300,7 @@
jle .L125
ALIGN_3
-.L122:
+.L122:
addss %xmm2, %xmm8
movss -32 * SIZE(BO), %xmm2
mulss %xmm0, %xmm2
@@ -2366,8 +2366,8 @@
#endif
movss %xmm8, (CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/gemm_kernel_8x4_barcelona.S b/kernel/x86_64/gemm_kernel_8x4_barcelona.S
index becd195..1849565 100644
--- a/kernel/x86_64/gemm_kernel_8x4_barcelona.S
+++ b/kernel/x86_64/gemm_kernel_8x4_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,14 +49,14 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define CO2 %r12
#define BB %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -295,7 +295,7 @@
movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\
movaps %xmm0, %xmm2 ;\
addq $16 * SIZE, %rax
-
+
#define KERNEL_SUB1(xx) \
mulps %xmm1, %xmm0 ;\
mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\
@@ -403,7 +403,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -458,13 +458,13 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm12, OFFSET
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
subq $-32 * SIZE, A
@@ -479,16 +479,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -575,7 +575,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -600,7 +600,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -32 * SIZE(BO), %xmm1
@@ -629,7 +629,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -876,7 +876,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -893,7 +893,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -913,7 +913,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1124,7 +1124,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -1141,7 +1141,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -1161,7 +1161,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1368,7 +1368,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1386,7 +1386,7 @@
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -1406,7 +1406,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1611,8 +1611,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1629,16 +1629,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L53
ALIGN_4
-
+
.L52:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -1701,7 +1701,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1724,7 +1724,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -1749,7 +1749,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1970,7 +1970,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -1988,7 +1988,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2008,7 +2008,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2154,7 +2154,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -2171,7 +2171,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -2191,7 +2191,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2341,7 +2341,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -2358,7 +2358,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -2378,7 +2378,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2524,8 +2524,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2542,16 +2542,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L103
ALIGN_4
-
+
.L102:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -2608,7 +2608,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2630,7 +2630,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2654,7 +2654,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2806,7 +2806,7 @@
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -2823,7 +2823,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2841,7 +2841,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2952,7 +2952,7 @@
#endif
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -2969,7 +2969,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -2987,7 +2987,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3099,7 +3099,7 @@
#endif
addq $2 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -3116,7 +3116,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -3134,7 +3134,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3221,8 +3221,8 @@
addss %xmm8, %xmm0
#endif
movss %xmm0, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq %rbx, %rsp
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/gemm_kernel_8x4_core2.S b/kernel/x86_64/gemm_kernel_8x4_core2.S
index 285d644..c31dc90 100644
--- a/kernel/x86_64/gemm_kernel_8x4_core2.S
+++ b/kernel/x86_64/gemm_kernel_8x4_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -91,7 +91,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -142,7 +142,7 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, ALPHA
-
+
subq $-32 * SIZE, A
subq $-32 * SIZE, B
@@ -151,7 +151,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq OLD_M, M
@@ -168,18 +168,18 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq 32 * SIZE + BUFFER, BO
-
+
movaps -32 * SIZE(B), %xmm3
movq K, %rax
sarq $2, %rax
jle .L05
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
movaps -28 * SIZE(B), %xmm7
@@ -261,7 +261,7 @@
subq $1, %rax
jne .L06
ALIGN_4
-
+
.L10:
leaq (PREFETCH_R + 0) * SIZE(B), BB
@@ -286,7 +286,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movaps -32 * SIZE(AO), %xmm0
@@ -319,7 +319,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -333,7 +333,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
addps %xmm2, %xmm10
movaps -32 * SIZE(BO), %xmm2
addps %xmm3, %xmm14
@@ -603,7 +603,7 @@
addq $8 * SIZE, CO2
subq $1, I
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -620,7 +620,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -632,7 +632,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -646,7 +646,7 @@
jle .L25
ALIGN_4
-.L21:
+.L21:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -806,7 +806,7 @@
addq $4 * SIZE, CO2
subq $1, I
ALIGN_4
-
+
.L30:
testq $2, M
jle .L40
@@ -822,7 +822,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -839,7 +839,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -853,7 +853,7 @@
jle .L35
ALIGN_4
-.L31:
+.L31:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movsd -32 * SIZE(AO), %xmm0
@@ -1019,7 +1019,7 @@
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1036,7 +1036,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1050,7 +1050,7 @@
jle .L45
ALIGN_4
-.L41:
+.L41:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movss -32 * SIZE(AO), %xmm0
@@ -1216,11 +1216,11 @@
.L51:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $3, %rax
@@ -1228,7 +1228,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L52:
movaps -32 * SIZE(B), %xmm3
movaps -28 * SIZE(B), %xmm7
@@ -1284,7 +1284,7 @@
subq $1, %rax
jne .L54
ALIGN_4
-
+
.L55:
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -1307,7 +1307,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1322,7 +1322,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1336,7 +1336,7 @@
jle .L65
ALIGN_4
-.L61:
+.L61:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -1504,7 +1504,7 @@
addq $8 * SIZE, CO2
subq $1, I
jg .L60
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -1521,7 +1521,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1533,7 +1533,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1547,7 +1547,7 @@
jle .L75
ALIGN_4
-.L71:
+.L71:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -1658,7 +1658,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
ALIGN_4
-
+
.L80:
testq $2, M
jle .L90
@@ -1674,7 +1674,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1686,7 +1686,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1700,7 +1700,7 @@
jle .L85
ALIGN_4
-.L81:
+.L81:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movsd -32 * SIZE(AO), %xmm0
@@ -1824,7 +1824,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1836,7 +1836,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1850,7 +1850,7 @@
jle .L95
ALIGN_4
-.L91:
+.L91:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
@@ -1975,11 +1975,11 @@
.L101:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $4, %rax
@@ -1987,7 +1987,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L102:
movss -32 * SIZE(B), %xmm0
movss -31 * SIZE(B), %xmm1
@@ -2041,7 +2041,7 @@
subq $1, %rax
jne .L104
ALIGN_4
-
+
.L105:
movq C, CO1
movq A, AO
@@ -2063,7 +2063,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2077,7 +2077,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2091,7 +2091,7 @@
jle .L115
ALIGN_4
-.L111:
+.L111:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -2210,7 +2210,7 @@
addq $8 * SIZE, CO1
subq $1, I
jg .L110
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -2227,7 +2227,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2239,7 +2239,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2253,7 +2253,7 @@
jle .L125
ALIGN_4
-.L121:
+.L121:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -2344,7 +2344,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
ALIGN_4
-
+
.L130:
testq $2, M
jle .L140
@@ -2360,7 +2360,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2372,7 +2372,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2386,7 +2386,7 @@
jle .L135
ALIGN_4
-.L131:
+.L131:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movsd -32 * SIZE(AO), %xmm0
@@ -2487,7 +2487,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2499,7 +2499,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2513,7 +2513,7 @@
jle .L145
ALIGN_4
-.L141:
+.L141:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movss -32 * SIZE(AO), %xmm0
diff --git a/kernel/x86_64/gemm_kernel_8x4_penryn.S b/kernel/x86_64/gemm_kernel_8x4_penryn.S
index 68ca5fc..b381de9 100644
--- a/kernel/x86_64/gemm_kernel_8x4_penryn.S
+++ b/kernel/x86_64/gemm_kernel_8x4_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -97,7 +97,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -143,7 +143,7 @@
unpcklps %xmm0, %xmm0
movlps %xmm0, ALPHA
-
+
subq $-32 * SIZE, A
subq $-32 * SIZE, B
@@ -157,7 +157,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
@@ -200,7 +200,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 8), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorpd %xmm3, %xmm3
@@ -235,7 +235,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -250,7 +250,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH -32 * SIZE(PREA)
addps %xmm6, %xmm10
addps %xmm3, %xmm14
@@ -665,7 +665,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -684,7 +684,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -704,7 +704,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -719,7 +719,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
addps %xmm6, %xmm10
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x39, %xmm2, %xmm7
@@ -890,7 +890,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -909,7 +909,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -928,7 +928,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -943,7 +943,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm0, %xmm1
@@ -1064,7 +1064,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1083,7 +1083,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1095,7 +1095,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1110,7 +1110,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm0, %xmm1
@@ -1257,7 +1257,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 8), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
prefetcht2 -32 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -1284,7 +1284,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1299,7 +1299,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1471,7 +1471,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $4, M
@@ -1490,7 +1490,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -1507,7 +1507,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1522,7 +1522,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1638,7 +1638,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L70:
testq $2, M
@@ -1657,7 +1657,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -1670,7 +1670,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1685,7 +1685,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1780,7 +1780,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L80:
testq $1, M
@@ -1799,7 +1799,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1811,7 +1811,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1826,7 +1826,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm0, %xmm1
@@ -1958,7 +1958,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 8), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1974,7 +1974,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1989,7 +1989,7 @@
jle .L95
ALIGN_3
-.L92:
+.L92:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm2, %xmm3
@@ -2106,7 +2106,7 @@
decq I
BRANCH
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $4, M
@@ -2125,7 +2125,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -2137,7 +2137,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2152,7 +2152,7 @@
jle .L105
ALIGN_3
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm2, %xmm3
@@ -2241,7 +2241,7 @@
#endif
addq $4 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L110:
testq $2, M
@@ -2260,7 +2260,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -2273,7 +2273,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2288,7 +2288,7 @@
jle .L115
ALIGN_3
-.L112:
+.L112:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm2, %xmm3
@@ -2374,7 +2374,7 @@
#endif
addq $2 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L120:
testq $1, M
@@ -2393,7 +2393,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -2405,7 +2405,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2420,7 +2420,7 @@
jle .L125
ALIGN_3
-.L122:
+.L122:
mulss %xmm0, %xmm2
movss -31 * SIZE(AO), %xmm0
addss %xmm2, %xmm8
diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S
index 218cb04..c4ef1f8 100644
--- a/kernel/x86_64/gemm_kernel_8x4_sse.S
+++ b/kernel/x86_64/gemm_kernel_8x4_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
@@ -56,7 +56,7 @@
#define CO1 %r15
#define CO2 %rbp
#define BB %r12
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -273,7 +273,7 @@
addps %xmm5, %xmm14 ;\
movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addps %xmm6, %xmm15 ;\
- movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
+ movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#define KERNEL5(xx) \
mulps %xmm0, %xmm1 ;\
@@ -336,7 +336,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -381,7 +381,7 @@
#endif
EMMS
-
+
movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
@@ -393,13 +393,13 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
subq $-32 * SIZE, A
@@ -414,11 +414,11 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movd 0 * SIZE(B), %mm0
movq K, %rax
@@ -427,7 +427,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
@@ -510,7 +510,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -535,7 +535,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -32 * SIZE(BO), %xmm1
@@ -565,7 +565,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -937,7 +937,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -954,7 +954,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -974,7 +974,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1185,7 +1185,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -1202,7 +1202,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -1222,7 +1222,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1441,7 +1441,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1459,7 +1459,7 @@
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -1479,7 +1479,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1684,8 +1684,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1702,16 +1702,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L53
ALIGN_4
-
+
.L52:
#if defined(PENTIUM4) || defined(GENERIC)
movss 0 * SIZE(B), %xmm0
@@ -1767,7 +1767,7 @@
punpckldq %mm5, %mm5
punpckldq %mm6, %mm6
punpckldq %mm7, %mm7
-
+
movq %mm0, 0 * SIZE(BO)
movq %mm0, 2 * SIZE(BO)
movq %mm1, 4 * SIZE(BO)
@@ -1830,7 +1830,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1853,7 +1853,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -1878,7 +1878,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2099,7 +2099,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -2117,7 +2117,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2137,7 +2137,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2283,7 +2283,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -2300,7 +2300,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -2320,7 +2320,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2476,7 +2476,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -2493,7 +2493,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -2513,7 +2513,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2659,8 +2659,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2677,16 +2677,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L103
ALIGN_4
-
+
.L102:
#if defined(PENTIUM4) || defined(GENERIC)
@@ -2743,7 +2743,7 @@
punpckldq %mm5, %mm5
punpckldq %mm6, %mm6
punpckldq %mm7, %mm7
-
+
movq %mm0, 0 * SIZE(BO)
movq %mm0, 2 * SIZE(BO)
movq %mm1, 4 * SIZE(BO)
@@ -2795,7 +2795,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2817,7 +2817,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2841,7 +2841,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2993,7 +2993,7 @@
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -3010,7 +3010,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -3028,7 +3028,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3139,7 +3139,7 @@
#endif
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -3156,7 +3156,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -3174,7 +3174,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3289,7 +3289,7 @@
#endif
addq $2 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -3306,7 +3306,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -3324,7 +3324,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3411,8 +3411,8 @@
addss %xmm8, %xmm0
#endif
movss %xmm0, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq %rbx, %rsp
diff --git a/kernel/x86_64/gemm_kernel_8x4_sse3.S b/kernel/x86_64/gemm_kernel_8x4_sse3.S
index c7954fe..c853e46 100644
--- a/kernel/x86_64/gemm_kernel_8x4_sse3.S
+++ b/kernel/x86_64/gemm_kernel_8x4_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -54,7 +54,7 @@
#define CO1 %r14
#define CO2 %r15
#define BB %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -328,7 +328,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -381,13 +381,13 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
leaq (, LDC, SIZE), LDC
@@ -400,16 +400,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movddup 0 * SIZE(B), %xmm0
movddup 2 * SIZE(B), %xmm1
@@ -458,7 +458,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -487,7 +487,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -518,7 +518,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -534,7 +534,7 @@
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1 (64 * 0)
KERNEL2 (64 * 0)
KERNEL3 (64 * 0)
@@ -860,7 +860,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -877,7 +877,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -897,7 +897,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1110,7 +1110,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -1127,7 +1127,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
movddup 8 * SIZE(AO), %xmm10
@@ -1144,7 +1144,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1307,7 +1307,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1324,7 +1324,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -1339,7 +1339,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1483,8 +1483,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1501,16 +1501,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L53
ALIGN_4
-
+
.L52:
movddup 0 * SIZE(B), %xmm0
movddup 2 * SIZE(B), %xmm1
@@ -1556,7 +1556,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1579,7 +1579,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1601,7 +1601,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1823,7 +1823,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -1840,7 +1840,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movsldup 0 * SIZE(BO), %xmm9
@@ -1857,7 +1857,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2002,7 +2002,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -2019,7 +2019,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
movddup 8 * SIZE(AO), %xmm10
@@ -2034,7 +2034,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2152,7 +2152,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -2169,7 +2169,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -2184,7 +2184,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2306,8 +2306,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2323,16 +2323,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L103
ALIGN_4
-
+
.L102:
movss 0 * SIZE(B), %xmm0
@@ -2385,7 +2385,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2407,7 +2407,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -2428,7 +2428,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2583,7 +2583,7 @@
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -2601,7 +2601,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -2622,7 +2622,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2731,7 +2731,7 @@
#endif
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -2748,7 +2748,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 0 * SIZE(BO), %xmm9
@@ -2765,7 +2765,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2870,7 +2870,7 @@
#endif
addq $2 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -2887,7 +2887,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -2904,7 +2904,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2990,8 +2990,8 @@
addss %xmm8, %xmm0
#endif
movss %xmm0, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq %rbx, %rsp
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/gemm_ncopy_2.S b/kernel/x86_64/gemm_ncopy_2.S
index 06a0fea..b069f9c 100644
--- a/kernel/x86_64/gemm_ncopy_2.S
+++ b/kernel/x86_64/gemm_ncopy_2.S
@@ -86,7 +86,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
diff --git a/kernel/x86_64/gemm_ncopy_2_bulldozer.S b/kernel/x86_64/gemm_ncopy_2_bulldozer.S
index 02d72f0..1911d3c 100644
--- a/kernel/x86_64/gemm_ncopy_2_bulldozer.S
+++ b/kernel/x86_64/gemm_ncopy_2_bulldozer.S
@@ -73,7 +73,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S
index cac647f..7192cec 100644
--- a/kernel/x86_64/gemm_ncopy_4.S
+++ b/kernel/x86_64/gemm_ncopy_4.S
@@ -114,7 +114,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
diff --git a/kernel/x86_64/gemm_ncopy_4_opteron.S b/kernel/x86_64/gemm_ncopy_4_opteron.S
index e5cbd62..ea39f89 100644
--- a/kernel/x86_64/gemm_ncopy_4_opteron.S
+++ b/kernel/x86_64/gemm_ncopy_4_opteron.S
@@ -87,7 +87,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
@@ -361,7 +361,7 @@
.L999:
EMMS
-
+
#ifdef WINDOWS_ABI
movups 0(%rsp), %xmm6
movups 16(%rsp), %xmm7
diff --git a/kernel/x86_64/gemm_tcopy_2.S b/kernel/x86_64/gemm_tcopy_2.S
index 190cebb..f35427b 100644
--- a/kernel/x86_64/gemm_tcopy_2.S
+++ b/kernel/x86_64/gemm_tcopy_2.S
@@ -100,7 +100,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
@@ -174,7 +174,7 @@
movapd %xmm0, 0 * SIZE(BO)
movapd %xmm1, 2 * SIZE(BO)
#endif
-
+
leaq (BO, M8, 2), BO
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
diff --git a/kernel/x86_64/gemm_tcopy_2_bulldozer.S b/kernel/x86_64/gemm_tcopy_2_bulldozer.S
index b8d61b0..d755204 100644
--- a/kernel/x86_64/gemm_tcopy_2_bulldozer.S
+++ b/kernel/x86_64/gemm_tcopy_2_bulldozer.S
@@ -86,7 +86,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
@@ -202,7 +202,7 @@
leaq (BO, M8, 2), BO
#endif
-
+
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
decq I
@@ -246,7 +246,7 @@
leaq (BO, M8, 2), BO
#endif
-
+
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
ALIGN_4
@@ -257,7 +257,7 @@
jle .L14
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
- vmovsd 0 * SIZE(AO2), %xmm1
+ vmovsd 0 * SIZE(AO2), %xmm1
vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
@@ -268,7 +268,7 @@
vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
#endif
-
+
leaq (BO, M8, 2), BO
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S
index c230816..ba7714b 100644
--- a/kernel/x86_64/gemm_tcopy_4.S
+++ b/kernel/x86_64/gemm_tcopy_4.S
@@ -130,7 +130,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
diff --git a/kernel/x86_64/gemm_tcopy_4_opteron.S b/kernel/x86_64/gemm_tcopy_4_opteron.S
index 105fe3b..e8207ac 100644
--- a/kernel/x86_64/gemm_tcopy_4_opteron.S
+++ b/kernel/x86_64/gemm_tcopy_4_opteron.S
@@ -104,7 +104,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
diff --git a/kernel/x86_64/iamax.S b/kernel/x86_64/iamax.S
index 27637c5..79e1bae 100644
--- a/kernel/x86_64/iamax.S
+++ b/kernel/x86_64/iamax.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1
#define X ARG2
#define INCX ARG3
@@ -76,7 +76,7 @@
FLD (X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
addq INCX, X
decq M
@@ -89,7 +89,7 @@
sarq $3, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -97,7 +97,7 @@
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -108,7 +108,7 @@
FLD 1 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -119,7 +119,7 @@
FLD 2 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -130,7 +130,7 @@
FLD 3 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -141,7 +141,7 @@
FLD 4 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -152,7 +152,7 @@
FLD 5 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -163,7 +163,7 @@
FLD 6 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -174,7 +174,7 @@
FLD 7 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -199,7 +199,7 @@
.L21:
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -219,12 +219,12 @@
sarq $3, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -236,7 +236,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -248,7 +248,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -260,7 +260,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -272,7 +272,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -284,7 +284,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -296,7 +296,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -308,7 +308,7 @@
FLD 0 * SIZE(X)
addq INCX, X
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
@@ -331,7 +331,7 @@
.L61:
FLD 0 * SIZE(X)
#ifdef USE_ABS
- fabs
+ fabs
#endif
fcomi
FMOV %st(1), %st(0)
diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S
index 8b7de07..f22e34a 100644
--- a/kernel/x86_64/iamax_sse.S
+++ b/kernel/x86_64/iamax_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -47,12 +47,12 @@
#define I ARG4
#define XX %r10
#define MM %r11
-
+
#ifdef USE_MIN
#define maxps minps
#define maxss minss
#endif
-
+
#include "l1param.h"
PROLOGUE
@@ -127,7 +127,7 @@
sarq $4, I
jle .L15
ALIGN_4
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -193,7 +193,7 @@
#endif
maxps %xmm6, %xmm2
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L17:
testq $2, M
@@ -206,7 +206,7 @@
#endif
maxps %xmm7, %xmm3
addq $2 * SIZE, X
-
+
.L18:
testq $1, M
je .L20
@@ -276,7 +276,7 @@
sarq $3, I
jle .L25
ALIGN_4
-
+
.L23:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -384,7 +384,7 @@
incq RET
comiss %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L26:
testq $2, M
@@ -404,7 +404,7 @@
comiss %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L27:
incq RET
jmp .L999
@@ -416,7 +416,7 @@
sarq $4, I
jle .L35
ALIGN_4
-
+
.L31:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -490,7 +490,7 @@
#endif
maxps %xmm6, %xmm2
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L37:
testq $2, M
@@ -503,7 +503,7 @@
#endif
maxps %xmm7, %xmm3
addq $2 * SIZE, X
-
+
.L38:
testq $1, M
je .L40
@@ -535,7 +535,7 @@
sarq $3, I
jle .L45
ALIGN_4
-
+
.L43:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -645,7 +645,7 @@
incq RET
comiss %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L46:
testq $2, M
@@ -665,7 +665,7 @@
comiss %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L47:
incq RET
jmp .L999
@@ -676,7 +676,7 @@
sarq $3, I
jle .L85
ALIGN_4
-
+
.L81:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -776,7 +776,7 @@
andps %xmm15, %xmm7
#endif
maxss %xmm7, %xmm3
- ALIGN_3
+ ALIGN_3
.L86:
testq $2, M
@@ -796,7 +796,7 @@
#endif
maxss %xmm5, %xmm1
ALIGN_3
-
+
.L87:
testq $1, M
je .L90
@@ -822,7 +822,7 @@
sarq $3, I
jle .L95
ALIGN_4
-
+
.L93:
movss 0 * SIZE(X), %xmm1
addq INCX, X
@@ -985,7 +985,7 @@
incq RET
comiss %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L96:
testq $2, M
@@ -1006,7 +1006,7 @@
comiss %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L97:
incq RET
ALIGN_3
diff --git a/kernel/x86_64/iamax_sse2.S b/kernel/x86_64/iamax_sse2.S
index c17a81a..6808f19 100644
--- a/kernel/x86_64/iamax_sse2.S
+++ b/kernel/x86_64/iamax_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -47,14 +47,14 @@
#define I ARG4
#define XX %r10
#define MM %r11
-
+
#ifdef USE_MIN
#define maxpd minpd
#define maxsd minsd
#endif
#include "l1param.h"
-
+
PROLOGUE
PROFCODE
@@ -114,7 +114,7 @@
sarq $4, I
jle .L15
ALIGN_4
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -226,7 +226,7 @@
#endif
maxpd %xmm5, %xmm1
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L17:
testq $2, M
@@ -238,7 +238,7 @@
#endif
maxpd %xmm6, %xmm2
addq $2 * SIZE, X
-
+
.L18:
testq $1, M
je .L20
@@ -284,7 +284,7 @@
sarq $3, I
jle .L25
ALIGN_4
-
+
.L22:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -407,7 +407,7 @@
incq RET
comisd %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L27:
testq $2, M
@@ -427,7 +427,7 @@
comisd %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L28:
incq RET
jmp .L999
@@ -566,7 +566,7 @@
#endif
maxpd %xmm5, %xmm1
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L57:
testq $2, M
@@ -579,7 +579,7 @@
#endif
maxpd %xmm6, %xmm2
addq $2 * SIZE, X
-
+
.L58:
testq $1, M
je .L60
@@ -608,7 +608,7 @@
sarq $3, I
jle .L65
ALIGN_4
-
+
.L62:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -735,7 +735,7 @@
incq RET
comisd %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L67:
testq $2, M
@@ -755,7 +755,7 @@
comisd %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L68:
incq RET
jmp .L999
@@ -766,7 +766,7 @@
sarq $4, I
jle .L85
ALIGN_4
-
+
.L81:
movsd 0 * SIZE(X), %xmm4
addq INCX, X
@@ -909,7 +909,7 @@
andpd %xmm15, %xmm5
#endif
maxpd %xmm5, %xmm1
- ALIGN_3
+ ALIGN_3
.L87:
testq $2, M
@@ -924,7 +924,7 @@
#endif
maxpd %xmm6, %xmm2
ALIGN_3
-
+
.L88:
testq $1, M
je .L90
@@ -960,7 +960,7 @@
sarq $3, I
jle .L95
ALIGN_4
-
+
.L92:
movsd 0 * SIZE(X), %xmm1
addq INCX, X
@@ -1101,7 +1101,7 @@
incq RET
comisd %xmm0, %xmm4
je .L999
- ALIGN_3
+ ALIGN_3
.L97:
testq $2, M
@@ -1122,7 +1122,7 @@
comisd %xmm0, %xmm2
je .L999
ALIGN_3
-
+
.L98:
incq RET
ALIGN_3
diff --git a/kernel/x86_64/izamax.S b/kernel/x86_64/izamax.S
index a77b06d..c066acd 100644
--- a/kernel/x86_64/izamax.S
+++ b/kernel/x86_64/izamax.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1
#define X ARG2
#define INCX ARG3
@@ -75,9 +75,9 @@
movq $1, RET
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
addq INCX, X
decq M
@@ -90,16 +90,16 @@
sarq $2, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -109,9 +109,9 @@
incq NUM
FLD 2 * SIZE(X)
- fabs
+ fabs
FLD 3 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -121,9 +121,9 @@
incq NUM
FLD 4 * SIZE(X)
- fabs
+ fabs
FLD 5 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -133,9 +133,9 @@
incq NUM
FLD 6 * SIZE(X)
- fabs
+ fabs
FLD 7 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -158,9 +158,9 @@
.L21:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -180,12 +180,12 @@
sarq $2, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addq INCX, X
faddp %st, %st(1)
fcomi
@@ -196,9 +196,9 @@
incq NUM
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addq INCX, X
faddp %st, %st(1)
fcomi
@@ -209,9 +209,9 @@
incq NUM
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addq INCX, X
faddp %st, %st(1)
fcomi
@@ -222,9 +222,9 @@
incq NUM
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addq INCX, X
faddp %st, %st(1)
fcomi
@@ -247,9 +247,9 @@
.L61:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
diff --git a/kernel/x86_64/izamax_sse.S b/kernel/x86_64/izamax_sse.S
index 2dfeb93..e273b8c 100644
--- a/kernel/x86_64/izamax_sse.S
+++ b/kernel/x86_64/izamax_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -47,12 +47,12 @@
#define I ARG4
#define XX %r10
#define MM %r11
-
+
#ifdef USE_MIN
#define maxps minps
#define maxss minss
#endif
-
+
#include "l1param.h"
PROLOGUE
@@ -91,7 +91,7 @@
sarq $3, I
jle .L35
ALIGN_4
-
+
.L31:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -172,7 +172,7 @@
maxss %xmm4, %xmm0
maxss %xmm6, %xmm1
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L37:
testq $1, M
@@ -203,7 +203,7 @@
sarq $2, I
jle .L45
ALIGN_4
-
+
.L41:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -301,7 +301,7 @@
incq RET
comiss %xmm0, %xmm3
je .L999
- ALIGN_3
+ ALIGN_3
.L47:
incq RET
@@ -313,7 +313,7 @@
sarq $3, I
jle .L75
ALIGN_4
-
+
.L71:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -384,7 +384,7 @@
andps %xmm15, %xmm6
addps %xmm6, %xmm4
maxps %xmm4, %xmm0
- ALIGN_3
+ ALIGN_3
.L76:
testq $2, M
@@ -405,7 +405,7 @@
maxss %xmm4, %xmm0
maxss %xmm6, %xmm1
ALIGN_3
-
+
.L77:
testq $1, M
je .L80
@@ -435,7 +435,7 @@
sarq $2, I
jle .L85
ALIGN_4
-
+
.L81:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -540,7 +540,7 @@
incq RET
comiss %xmm0, %xmm3
je .L999
- ALIGN_3
+ ALIGN_3
.L87:
incq RET
diff --git a/kernel/x86_64/izamax_sse2.S b/kernel/x86_64/izamax_sse2.S
index 4046082..c656a65 100644
--- a/kernel/x86_64/izamax_sse2.S
+++ b/kernel/x86_64/izamax_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -47,7 +47,7 @@
#define I ARG4
#define XX %r10
#define MM %r11
-
+
#ifdef USE_MIN
#define maxpd minpd
#define maxsd minsd
@@ -188,7 +188,7 @@
andpd %xmm15, %xmm5
addpd %xmm5, %xmm4
maxpd %xmm4, %xmm0
- ALIGN_3
+ ALIGN_3
.L27:
testq $1, M
@@ -323,7 +323,7 @@
incq RET
comisd %xmm0, %xmm3
je .L999
- ALIGN_3
+ ALIGN_3
.L36:
incq RET
@@ -335,7 +335,7 @@
sarq $3, I
jle .L65
ALIGN_4
-
+
.L61:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -437,7 +437,7 @@
andpd %xmm15, %xmm5
addpd %xmm5, %xmm4
maxpd %xmm4, %xmm2
- ALIGN_3
+ ALIGN_3
.L67:
testq $1, M
@@ -583,7 +583,7 @@
incq RET
comisd %xmm0, %xmm3
je .L999
- ALIGN_3
+ ALIGN_3
.L76:
incq RET
diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S
index d375e8e..e9be126 100644
--- a/kernel/x86_64/nrm2.S
+++ b/kernel/x86_64/nrm2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -49,7 +49,7 @@
PROLOGUE
PROFCODE
-
+
fldz
testq M, M
jle .L999
@@ -68,7 +68,7 @@
sarq $3, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -129,7 +129,7 @@
sarq $3, I
jle .L60
ALIGN_4
-
+
.L50:
FLD (X)
addq INCX, X
diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S
index 37762ab..33b1ee4 100644
--- a/kernel/x86_64/nrm2_sse.S
+++ b/kernel/x86_64/nrm2_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -49,7 +49,7 @@
PROLOGUE
PROFCODE
-
+
SAVEREGISTERS
pxor %xmm0, %xmm0
@@ -67,7 +67,7 @@
testq $SIZE, X
je .L05
-
+
movss 0 * SIZE(X), %xmm4
cvtss2sd %xmm4, %xmm6
mulsd %xmm6, %xmm6
@@ -81,7 +81,7 @@
movq M, I
sarq $3, I
jle .L14
-
+
movsd 0 * SIZE(X), %xmm4
movsd 2 * SIZE(X), %xmm5
movsd 4 * SIZE(X), %xmm6
@@ -181,7 +181,7 @@
sarq $3, I
jle .L44
ALIGN_4
-
+
.L41:
movss (X), %xmm4
addq INCX, X
diff --git a/kernel/x86_64/qdot.S b/kernel/x86_64/qdot.S
index c958fc5..a48a04f 100644
--- a/kernel/x86_64/qdot.S
+++ b/kernel/x86_64/qdot.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86_64/qgemm_kernel_2x2.S b/kernel/x86_64/qgemm_kernel_2x2.S
index 9db145b..99db396 100644
--- a/kernel/x86_64/qgemm_kernel_2x2.S
+++ b/kernel/x86_64/qgemm_kernel_2x2.S
@@ -46,7 +46,7 @@
#define B ARG5
#define C ARG6
#define LDC %r10
-
+
#define I %r12
#define J %r13
#define AO %r14
@@ -73,7 +73,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -89,10 +89,10 @@
negq %rax
movq %rax, KK
#endif
-
+
addq $8 * SIZE, A
addq $8 * SIZE, B
-
+
salq $BASE_SHIFT, LDC
movq N, %rax
@@ -105,7 +105,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq A, AO
@@ -128,7 +128,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
fldz
fldz
@@ -148,7 +148,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -174,7 +174,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -192,7 +192,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -212,7 +212,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -230,7 +230,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -266,7 +266,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -347,7 +347,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq ( B, %rax, 2), BO
-#endif
+#endif
fldz
fldz
@@ -357,7 +357,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -496,13 +496,13 @@
.L30:
movq N, %rax
- testq $1, %rax
+ testq $1, %rax
je .L999
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq A, AO
@@ -524,7 +524,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq ( B, %rax, 1), BO
-#endif
+#endif
fldz
fldz
@@ -540,7 +540,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -681,7 +681,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq ( B, %rax, 1), BO
-#endif
+#endif
fldz
@@ -690,7 +690,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
diff --git a/kernel/x86_64/qgemv_n.S b/kernel/x86_64/qgemv_n.S
index 28415ec..630d03f 100644
--- a/kernel/x86_64/qgemv_n.S
+++ b/kernel/x86_64/qgemv_n.S
@@ -41,9 +41,9 @@
#include "l2param.h"
#define P 32
-
+
#define STACKSIZE 80
-
+
#define ALPHA 8 + STACKSIZE(%rsp)
#define OLD_INCX 24 + STACKSIZE(%rsp)
#define OLD_Y 32 + STACKSIZE(%rsp)
@@ -71,7 +71,7 @@
#define XP %r15
/* #define BUFFER %r15 */
#define MIN_N %rbx
-
+
PROLOGUE
PROFCODE
@@ -175,7 +175,7 @@
ALIGN_2
.L48:
- movq A, A1 # a_offset = a
+ movq A, A1 # a_offset = a
fldz
addq $4 * SIZE, A # a += 4
fldz
@@ -239,7 +239,7 @@
FLD 0 * SIZE(A1) # at1 = *(a_offset + 0)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(2) # ct1 += at1
-
+
FLD 1 * SIZE(A1) # at1 = *(a_offset + 1)
fmul %st(1), %st # at1 *= bt1
faddp %st, %st(3) # ct2 += at1
diff --git a/kernel/x86_64/qgemv_t.S b/kernel/x86_64/qgemv_t.S
index 9402f21..d7c9cd2 100644
--- a/kernel/x86_64/qgemv_t.S
+++ b/kernel/x86_64/qgemv_t.S
@@ -42,7 +42,7 @@
#define STACKSIZE 80
#define P 4096
-
+
#define ALPHA 8 + STACKSIZE(%rsp)
#define OLD_INCX 24 + STACKSIZE(%rsp)
#define OLD_Y 32 + STACKSIZE(%rsp)
@@ -70,7 +70,7 @@
#define X1 %r13
#define Y1 %r14
#define MIN_M %rbx
-
+
PROLOGUE
PROFCODE
diff --git a/kernel/x86_64/qtrsm_kernel_LN_2x2.S b/kernel/x86_64/qtrsm_kernel_LN_2x2.S
index 7093eba..536042e 100644
--- a/kernel/x86_64/qtrsm_kernel_LN_2x2.S
+++ b/kernel/x86_64/qtrsm_kernel_LN_2x2.S
@@ -46,7 +46,7 @@
#define B ARG5
#define C ARG6
#define LDC %r10
-
+
#define I %r12
#define J %r13
#define AO %r14
@@ -73,7 +73,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -89,10 +89,10 @@
negq %rax
movq %rax, KK
#endif
-
+
addq $8 * SIZE, A
addq $8 * SIZE, B
-
+
salq $BASE_SHIFT, LDC
#ifdef LN
@@ -118,7 +118,7 @@
movq OFFSET, %rax
negq %rax
movq %rax, KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -160,7 +160,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -187,7 +187,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
@@ -410,7 +410,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
@@ -447,7 +447,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -465,7 +465,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -485,7 +485,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -503,7 +503,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -540,7 +540,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -759,7 +759,7 @@
.L30:
movq N, %rax
- testq $1, %rax
+ testq $1, %rax
je .L999
#if defined(LT) || defined(RN)
@@ -787,7 +787,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -814,7 +814,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
fldz
@@ -989,7 +989,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
diff --git a/kernel/x86_64/qtrsm_kernel_LT_2x2.S b/kernel/x86_64/qtrsm_kernel_LT_2x2.S
index d2a05a1..6e94976 100644
--- a/kernel/x86_64/qtrsm_kernel_LT_2x2.S
+++ b/kernel/x86_64/qtrsm_kernel_LT_2x2.S
@@ -46,7 +46,7 @@
#define B ARG5
#define C ARG6
#define LDC %r10
-
+
#define I %r12
#define J %r13
#define AO %r14
@@ -73,7 +73,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -89,10 +89,10 @@
negq %rax
movq %rax, KK
#endif
-
+
addq $8 * SIZE, A
addq $8 * SIZE, B
-
+
salq $BASE_SHIFT, LDC
#ifdef LN
@@ -118,7 +118,7 @@
movq OFFSET, %rax
negq %rax
movq %rax, KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -160,7 +160,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -187,7 +187,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
@@ -224,7 +224,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -242,7 +242,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -262,7 +262,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -280,7 +280,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -317,7 +317,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -532,7 +532,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
@@ -759,7 +759,7 @@
.L30:
movq N, %rax
- testq $1, %rax
+ testq $1, %rax
je .L999
#if defined(LT) || defined(RN)
@@ -787,7 +787,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -814,7 +814,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
@@ -1047,7 +1047,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
fldz
diff --git a/kernel/x86_64/qtrsm_kernel_RT_2x2.S b/kernel/x86_64/qtrsm_kernel_RT_2x2.S
index 288aa07..caa7de1 100644
--- a/kernel/x86_64/qtrsm_kernel_RT_2x2.S
+++ b/kernel/x86_64/qtrsm_kernel_RT_2x2.S
@@ -46,7 +46,7 @@
#define B ARG5
#define C ARG6
#define LDC %r10
-
+
#define I %r12
#define J %r13
#define AO %r14
@@ -73,7 +73,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -89,10 +89,10 @@
negq %rax
movq %rax, KK
#endif
-
+
addq $8 * SIZE, A
addq $8 * SIZE, B
-
+
salq $BASE_SHIFT, LDC
#ifdef LN
@@ -118,7 +118,7 @@
movq OFFSET, %rax
negq %rax
movq %rax, KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -127,7 +127,7 @@
#endif
movq N, %rax
- testq $1, %rax
+ testq $1, %rax
je .L30
#if defined(LT) || defined(RN)
@@ -155,7 +155,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -182,7 +182,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
@@ -415,7 +415,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
fldz
@@ -624,7 +624,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -651,7 +651,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
@@ -688,7 +688,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -706,7 +706,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -726,7 +726,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -744,7 +744,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -781,7 +781,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -996,7 +996,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
diff --git a/kernel/x86_64/rot.S b/kernel/x86_64/rot.S
index 05e5aeb..6b2ad7f 100644
--- a/kernel/x86_64/rot.S
+++ b/kernel/x86_64/rot.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N ARG1
#define X ARG2
#define INCX ARG3
@@ -80,7 +80,7 @@
sarq $2, I
jle .L15
ALIGN_4
-
+
.L10:
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -212,7 +212,7 @@
sarq $2, I
jle .L55
ALIGN_4
-
+
.L51:
FLD 0 * SIZE(X)
FLD 0 * SIZE(Y)
diff --git a/kernel/x86_64/rot_sse.S b/kernel/x86_64/rot_sse.S
index cb7e1b3..6e37292 100644
--- a/kernel/x86_64/rot_sse.S
+++ b/kernel/x86_64/rot_sse.S
@@ -65,8 +65,8 @@
SAVEREGISTERS
- leaq (, INCX, SIZE), INCX
- leaq (, INCY, SIZE), INCY
+ leaq (, INCX, SIZE), INCX
+ leaq (, INCY, SIZE), INCY
pshufd $0x0, %xmm0, C
pshufd $0x0, %xmm1, S
@@ -153,7 +153,7 @@
movaps 4 * SIZE(X), %xmm2
movaps 8 * SIZE(X), %xmm8
movaps 12 * SIZE(X), %xmm10
-
+
decq %rax
jle .L12
ALIGN_3
diff --git a/kernel/x86_64/rot_sse2.S b/kernel/x86_64/rot_sse2.S
index 5029403..aa5852c 100644
--- a/kernel/x86_64/rot_sse2.S
+++ b/kernel/x86_64/rot_sse2.S
@@ -65,8 +65,8 @@
SAVEREGISTERS
- leaq (, INCX, SIZE), INCX
- leaq (, INCY, SIZE), INCY
+ leaq (, INCX, SIZE), INCX
+ leaq (, INCY, SIZE), INCY
pshufd $0x44, %xmm0, C
pshufd $0x44, %xmm1, S
@@ -123,7 +123,7 @@
movaps 2 * SIZE(X), %xmm2
movaps 4 * SIZE(X), %xmm8
movaps 6 * SIZE(X), %xmm10
-
+
decq %rax
jle .L12
ALIGN_3
diff --git a/kernel/x86_64/scal_atom.S b/kernel/x86_64/scal_atom.S
index ecc687c..11350ea 100644
--- a/kernel/x86_64/scal_atom.S
+++ b/kernel/x86_64/scal_atom.S
@@ -61,11 +61,11 @@
movq 40(%rsp), X
movq 48(%rsp), INCX
- movaps %xmm3, %xmm0
+ movaps %xmm3, %xmm0
#endif
SAVEREGISTERS
-
+
testq M, M
jle .L999
@@ -218,7 +218,7 @@
mulsd %xmm0, %xmm3
mulsd %xmm0, %xmm4
- decq I
+ decq I
jle .L112
ALIGN_4
diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S
index 9c8dd9d..b92688d 100644
--- a/kernel/x86_64/scal_sse.S
+++ b/kernel/x86_64/scal_sse.S
@@ -61,11 +61,11 @@
movq 40(%rsp), X
movq 48(%rsp), INCX
- movaps %xmm3, %xmm0
+ movaps %xmm3, %xmm0
#endif
SAVEREGISTERS
-
+
testq M, M
jle .L999
@@ -285,7 +285,7 @@
movaps %xmm0, %xmm8
mulps -4 * SIZE(X), %xmm8
- decq I
+ decq I
jle .L112
ALIGN_4
@@ -341,9 +341,9 @@
movaps %xmm6, -12 * SIZE(X)
movaps %xmm7, -8 * SIZE(X)
movaps %xmm8, -4 * SIZE(X)
-
+
#else
-
+
movaps -32 * SIZE(X), %xmm1
movaps -28 * SIZE(X), %xmm2
movaps -24 * SIZE(X), %xmm3
@@ -352,7 +352,7 @@
movaps -12 * SIZE(X), %xmm6
movaps -8 * SIZE(X), %xmm7
movaps -4 * SIZE(X), %xmm8
- decq I
+ decq I
jle .L112
ALIGN_4
diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S
index 3823b1f..20dd7fa 100644
--- a/kernel/x86_64/scal_sse2.S
+++ b/kernel/x86_64/scal_sse2.S
@@ -61,11 +61,11 @@
movq 40(%rsp), X
movq 48(%rsp), INCX
- movaps %xmm3, %xmm0
+ movaps %xmm3, %xmm0
#endif
SAVEREGISTERS
-
+
testq M, M
jle .L999
@@ -75,7 +75,7 @@
comisd %xmm0, %xmm1
jne .L100 # Alpha != ZERO
jp .L100 # For Alpha = NaN
-
+
/* Alpha == ZERO */
cmpq $SIZE, INCX
jne .L50
@@ -270,7 +270,7 @@
movaps %xmm0, %xmm8
mulpd -2 * SIZE(X), %xmm8
- decq I
+ decq I
jle .L112
ALIGN_4
@@ -336,7 +336,7 @@
movaps -4 * SIZE(X), %xmm7
movaps -2 * SIZE(X), %xmm8
- decq I
+ decq I
jle .L112
ALIGN_4
diff --git a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
index 2a034f0..9cc2718 100644
--- a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
+++ b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
@@ -78,8 +78,8 @@
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
+#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
diff --git a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S
index dcfed6b..7c42f1e 100644
--- a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S
+++ b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S
@@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
+#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S
index 6c3cda0..1f9f886 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S
+++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S
@@ -90,8 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -101,7 +100,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
-#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_sandy.S
similarity index 90%
copy from kernel/x86_64/sgemm_kernel_16x4_haswell.S
copy to kernel/x86_64/sgemm_kernel_16x4_sandy.S
index 6c3cda0..e6c4126 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S
+++ b/kernel/x86_64/sgemm_kernel_16x4_sandy.S
@@ -25,30 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
-/*********************************************************************
-* 2013/11/13 Saar
-* BLASTEST : OK
-* CTEST : OK
-* TEST : OK
-*
-* 2013/10/28 Saar
-* Parameter:
-* SGEMM_DEFAULT_UNROLL_N 4
-* SGEMM_DEFAULT_UNROLL_M 16
-* SGEMM_DEFAULT_P 768
-* SGEMM_DEFAULT_Q 384
-* A_PR1 512
-* B_PR1 512
-*
-*
-* Performance at 9216x9216x9216:
-* 1 thread: 86 GFLOPS (SANDYBRIDGE: 59) (MKL: 83)
-* 2 threads: 157 GFLOPS (SANDYBRIDGE: 116) (MKL: 155)
-* 3 threads: 235 GFLOPS (SANDYBRIDGE: 165) (MKL: 230)
-* 4 threads: 288 GFLOPS (SANDYBRIDGE: 223) (MKL: 267)
-*
-*********************************************************************/
-
#define ASSEMBLER
#include "common.h"
@@ -90,8 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -101,7 +76,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
-#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
@@ -129,21 +103,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACK_TOUCH
#endif
-#if defined(BULLDOZER)
-
-#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
-
-#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0
-
-#else
-
-#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0
-
-#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0
-
-#endif
-
-
#define A_PR1 512
#define B_PR1 512
@@ -156,16 +115,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
- VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
- VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
- VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
- VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
+ vmulps %ymm2 , %ymm0 , %ymm12
+ vmulps %ymm2 , %ymm1 , %ymm13
+ vmulps %ymm3 , %ymm0 , %ymm14
+ vmulps %ymm3 , %ymm1 , %ymm15
+ vaddps %ymm12, %ymm4 , %ymm4
+ vaddps %ymm13, %ymm5 , %ymm5
+ vaddps %ymm14, %ymm6 , %ymm6
+ vaddps %ymm15, %ymm7 , %ymm7
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
- VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
- VFMADD231PS_( %ymm9,%ymm2,%ymm1 )
- VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
- VFMADD231PS_( %ymm11,%ymm3,%ymm1 )
+ vmulps %ymm2 , %ymm0 , %ymm12
+ vmulps %ymm2 , %ymm1 , %ymm13
+ vmulps %ymm3 , %ymm0 , %ymm14
+ vmulps %ymm3 , %ymm1 , %ymm15
+ vaddps %ymm12, %ymm8 , %ymm8
+ vaddps %ymm13, %ymm9 , %ymm9
+ vaddps %ymm14, %ymm10, %ymm10
+ vaddps %ymm15, %ymm11, %ymm11
addq $ 4 , BI
addq $ 16, %rax
.endm
@@ -227,12 +194,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
- VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
- VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+ vmulps %ymm2 , %ymm0 , %ymm12
+ vmulps %ymm3 , %ymm0 , %ymm14
+ vaddps %ymm12, %ymm4 , %ymm4
+ vaddps %ymm14, %ymm6 , %ymm6
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
- VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
- VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
+ vmulps %ymm2 , %ymm0 , %ymm12
+ vmulps %ymm3 , %ymm0 , %ymm14
+ vaddps %ymm12, %ymm8 , %ymm8
+ vaddps %ymm14, %ymm10, %ymm10
addq $ 4 , BI
addq $ 8 , %rax
.endm
@@ -271,12 +242,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
- VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
+ vmulps %xmm2 , %xmm0 , %xmm12
+ vmulps %xmm3 , %xmm0 , %xmm14
+ vaddps %xmm12, %xmm4 , %xmm4
+ vaddps %xmm14, %xmm6 , %xmm6
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
- VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
+ vmulps %xmm2 , %xmm0 , %xmm12
+ vmulps %xmm3 , %xmm0 , %xmm14
+ vaddps %xmm12, %xmm8 , %xmm8
+ vaddps %xmm14, %xmm10, %xmm10
addq $ 4 , BI
addq $ 4 , %rax
.endm
@@ -315,16 +290,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
- VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
- VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
- VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
+ vmulss %xmm2 , %xmm0 , %xmm12
+ vmulss %xmm2 , %xmm1 , %xmm13
+ vmulss %xmm3 , %xmm0 , %xmm14
+ vmulss %xmm3 , %xmm1 , %xmm15
+ vaddss %xmm12, %xmm4 , %xmm4
+ vaddss %xmm13, %xmm5 , %xmm5
+ vaddss %xmm14, %xmm6 , %xmm6
+ vaddss %xmm15, %xmm7 , %xmm7
vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
- VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
- VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
- VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
+ vmulss %xmm2 , %xmm0 , %xmm12
+ vmulss %xmm2 , %xmm1 , %xmm13
+ vmulss %xmm3 , %xmm0 , %xmm14
+ vmulss %xmm3 , %xmm1 , %xmm15
+ vaddss %xmm12, %xmm8 , %xmm8
+ vaddss %xmm13, %xmm9 , %xmm9
+ vaddss %xmm14, %xmm10, %xmm10
+ vaddss %xmm15, %xmm11, %xmm11
addq $ 4 , BI
addq $ 2, %rax
.endm
@@ -380,12 +363,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
- VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+ vmulss %xmm2 , %xmm0 , %xmm12
+ vmulss %xmm3 , %xmm0 , %xmm14
+ vaddss %xmm12, %xmm4 , %xmm4
+ vaddss %xmm14, %xmm6 , %xmm6
vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
- VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
+ vmulss %xmm2 , %xmm0 , %xmm12
+ vmulss %xmm3 , %xmm0 , %xmm14
+ vaddss %xmm12, %xmm8 , %xmm8
+ vaddss %xmm14, %xmm10, %xmm10
addq $ 4 , BI
addq $ 1, %rax
.endm
@@ -428,10 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
- VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
- VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
- VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
- VFMADD231PS_( %ymm7,%ymm3,%ymm1 )
+ vmulps %ymm2 , %ymm0 , %ymm12
+ vmulps %ymm2 , %ymm1 , %ymm13
+ vmulps %ymm3 , %ymm0 , %ymm14
+ vmulps %ymm3 , %ymm1 , %ymm15
+ vaddps %ymm12, %ymm4 , %ymm4
+ vaddps %ymm13, %ymm5 , %ymm5
+ vaddps %ymm14, %ymm6 , %ymm6
+ vaddps %ymm15, %ymm7 , %ymm7
addq $ 2 , BI
addq $ 16, %rax
.endm
@@ -472,8 +463,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
- VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
- VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+ vmulps %ymm2 , %ymm0 , %ymm12
+ vmulps %ymm3 , %ymm0 , %ymm14
+ vaddps %ymm12, %ymm4 , %ymm4
+ vaddps %ymm14, %ymm6 , %ymm6
addq $ 2 , BI
addq $ 8 , %rax
.endm
@@ -505,8 +498,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
- VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
+ vmulps %xmm2 , %xmm0 , %xmm12
+ vmulps %xmm3 , %xmm0 , %xmm14
+ vaddps %xmm12, %xmm4 , %xmm4
+ vaddps %xmm14, %xmm6 , %xmm6
addq $ 2 , BI
addq $ 4 , %rax
.endm
@@ -538,10 +533,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
- VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
- VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
- VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
+ vmulss %xmm2 , %xmm0 , %xmm12
+ vmulss %xmm2 , %xmm1 , %xmm13
+ vmulss %xmm3 , %xmm0 , %xmm14
+ vmulss %xmm3 , %xmm1 , %xmm15
+ vaddss %xmm12, %xmm4 , %xmm4
+ vaddss %xmm13, %xmm5 , %xmm5
+ vaddss %xmm14, %xmm6 , %xmm6
+ vaddss %xmm15, %xmm7 , %xmm7
addq $ 2 , BI
addq $ 2, %rax
.endm
@@ -581,8 +580,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
- VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
- VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+ vmulss %xmm2 , %xmm0 , %xmm12
+ vmulss %xmm3 , %xmm0 , %xmm14
+ vaddss %xmm12, %xmm4 , %xmm4
+ vaddss %xmm14, %xmm6 , %xmm6
addq $ 2 , BI
addq $ 1, %rax
.endm
@@ -617,8 +618,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
- VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
- VFMADD231PS_( %ymm5,%ymm2,%ymm1 )
+ vmulps %ymm2 , %ymm0 , %ymm12
+ vmulps %ymm2 , %ymm1 , %ymm13
+ vaddps %ymm12, %ymm4 , %ymm4
+ vaddps %ymm13, %ymm5 , %ymm5
addq $ 1 , BI
addq $ 16, %rax
.endm
@@ -648,7 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
- VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+ vmulps %ymm2 , %ymm0 , %ymm12
+ vaddps %ymm12, %ymm4 , %ymm4
addq $ 1 , BI
addq $ 8 , %rax
.endm
@@ -676,7 +680,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
- VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+ vmulps %xmm2 , %xmm0 , %xmm12
+ vaddps %xmm12, %xmm4 , %xmm4
addq $ 1 , BI
addq $ 4 , %rax
.endm
@@ -704,8 +709,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
- VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
- VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+ vmulss %xmm2 , %xmm0 , %xmm12
+ vmulss %xmm2 , %xmm1 , %xmm13
+ vaddss %xmm12, %xmm4 , %xmm4
+ vaddss %xmm13, %xmm5 , %xmm5
addq $ 1 , BI
addq $ 2 , %rax
.endm
@@ -735,7 +742,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL1x1_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
- VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+ vmulss %xmm2 , %xmm0 , %xmm12
+ vaddss %xmm12, %xmm4 , %xmm4
addq $ 1 , BI
addq $ 1 , %rax
.endm
diff --git a/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S b/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S
index 268d3ae..2194fd6 100644
--- a/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S
+++ b/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,13 +49,13 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define CO2 %r12
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -226,7 +226,7 @@
vmovups 100 * SIZE(BO, %rax, 8), %xmm3 ;\
vmovaps %xmm0, %xmm2 ;\
addq $16 * SIZE, %rax
-
+
#define KERNEL_SUB1(xx) \
mulps %xmm1, %xmm0 ;\
mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\
@@ -334,7 +334,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -389,13 +389,13 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm12, OFFSET
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
subq $-32 * SIZE, A
@@ -410,22 +410,22 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
- prefetcht0 192(B)
- prefetcht0 256(B)
- prefetcht0 192(BO)
- prefetcht0 256(BO)
+ prefetcht0 192(B)
+ prefetcht0 256(B)
+ prefetcht0 192(BO)
+ prefetcht0 256(BO)
movaps 0 * SIZE(B), %xmm3
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -506,7 +506,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -530,7 +530,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -32 * SIZE(BO), %xmm1
@@ -558,7 +558,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -745,7 +745,7 @@
vmulps %xmm7, %xmm8, %xmm8
vmulps %xmm7, %xmm9, %xmm9
- vmulps %xmm7, %xmm10, %xmm10
+ vmulps %xmm7, %xmm10, %xmm10
vmulps %xmm7, %xmm11, %xmm11
vmulps %xmm7, %xmm12,%xmm12
@@ -786,7 +786,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -803,7 +803,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -823,7 +823,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1024,7 +1024,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -1041,7 +1041,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -1061,7 +1061,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1265,7 +1265,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1283,7 +1283,7 @@
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -1303,7 +1303,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1505,8 +1505,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1523,16 +1523,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L53
ALIGN_4
-
+
.L52:
movaps 0 * SIZE(B), %xmm3
@@ -1592,7 +1592,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1615,7 +1615,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -1640,7 +1640,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1845,7 +1845,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -1863,7 +1863,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -1883,7 +1883,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2024,7 +2024,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -2041,7 +2041,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -2061,7 +2061,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2208,7 +2208,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -2225,7 +2225,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -2245,7 +2245,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2388,8 +2388,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2406,16 +2406,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L103
ALIGN_4
-
+
.L102:
@@ -2469,7 +2469,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2491,7 +2491,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2515,7 +2515,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2653,7 +2653,7 @@
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -2670,7 +2670,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2688,7 +2688,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2792,7 +2792,7 @@
#endif
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -2809,7 +2809,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -2827,7 +2827,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2936,7 +2936,7 @@
#endif
addq $2 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -2953,7 +2953,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -2971,7 +2971,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3055,8 +3055,8 @@
addss %xmm8, %xmm0
#endif
movss %xmm0, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq %rbx, %rsp
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S
index 20ddcaa..fb67dee 100644
--- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S
+++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S
@@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
-3. Neither the name of the ISCAS nor the names of its contributors may
-be used to endorse or promote products derived from this software
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -90,7 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PREFETCH0 prefetcht0
#define PREFETCH1 prefetcht0
#define PREFETCH2 prefetcht2
-#define PRESIZE 80
+#define PRESIZE 80
#define xvec0 %xmm0
#define xvec1 %xmm1
@@ -140,10 +140,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OR orq
#define JNE jne
#define JMP jmp
-#define NOP
+#define NOP
#define XOR xorpd
#undef MOVQ
-#define MOVQ movq
+#define MOVQ movq
#define XOR_SY vxorps
#define XOR_SX vxorps
@@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define EDUP_SX vmovsldup
#define ODUP_SX vmovshdup
-#define ADD_SY vaddps
+#define ADD_SY vaddps
#define ADD_SX vaddps
#define ADD1_DY vaddpd
@@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VPERMILP_SX vpermilps
#define BROAD_SY vbroadcastss
-#define BROAD_SX vbroadcastss
+#define BROAD_SX vbroadcastss
#define MOV_SY vmovaps
#define MOV_SX vmovaps
@@ -222,7 +222,7 @@ movq %r15, 40(%rsp);
movq ARG1, old_bm
movq ARG2, old_bn
movq ARG3, old_bk
- movq OLD_A, ba
+ movq OLD_A, ba
movq OLD_B, bb
movq OLD_C, C
movq old_ldc, ldc
@@ -275,7 +275,7 @@ ALIGN_4;
.L1_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -317,7 +317,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $8, %rax;
#endif
MOVQ %rax, kkk;
@@ -328,7 +328,7 @@ ALIGN_4;
.L2_bodyB:;
# Computing kernel
-#### Unroll times 1 ####
+#### Unroll times 1 ####
PREFETCH0 PRESIZE*SIZE(ptrba);
MUL_SY yvec0, yvec2, yvec6;
SHUF_SY $0x03, yvec2, yvec2, yvec4;
@@ -359,7 +359,7 @@ MUL_SY yvec0, yvec5, yvec7;
ADD_SY yvec10, yvec6, yvec10;
ADD_SY yvec8, yvec7, yvec8;
-#### Unroll times 2 ####
+#### Unroll times 2 ####
MUL_SY yvec1, yvec2, yvec6;
SHUF_SY $0x03, yvec2, yvec2, yvec4;
ODUP_SY 8*SIZE(ptrbb), yvec2
@@ -389,7 +389,7 @@ MUL_SY yvec1, yvec5, yvec7;
ADD_SY yvec10, yvec6, yvec10;
ADD_SY yvec8, yvec7, yvec8;
-#### Unroll times 3 ####
+#### Unroll times 3 ####
PREFETCH0 (PRESIZE+16)*SIZE(ptrba);
MUL_SY yvec0, yvec2, yvec6;
SHUF_SY $0x03, yvec2, yvec2, yvec4;
@@ -421,7 +421,7 @@ MUL_SY yvec0, yvec5, yvec7;
ADD_SY yvec10, yvec6, yvec10;
ADD_SY yvec8, yvec7, yvec8;
-#### Unroll times 4 ####
+#### Unroll times 4 ####
MUL_SY yvec1, yvec2, yvec6;
SHUF_SY $0x03, yvec2, yvec2, yvec4;
ODUP_SY 24*SIZE(ptrbb), yvec2
@@ -464,7 +464,7 @@ TEST $2, kkk;
JLE .L3_loopE;
ALIGN_4
.L3_loobB:
-#### Unroll times 1 ####
+#### Unroll times 1 ####
MUL_SY yvec0, yvec2, yvec6;
SHUF_SY $0x03, yvec2, yvec2, yvec4;
ODUP_SY 0*SIZE(ptrbb), yvec2
@@ -495,7 +495,7 @@ MUL_SY yvec0, yvec5, yvec7;
ADD_SY yvec10, yvec6, yvec10;
ADD_SY yvec8, yvec7, yvec8;
-#### Unroll times 2 ####
+#### Unroll times 2 ####
MUL_SY yvec1, yvec2, yvec6;
SHUF_SY $0x03, yvec2, yvec2, yvec4;
ODUP_SY 8*SIZE(ptrbb), yvec2
@@ -534,7 +534,7 @@ TEST $1, kkk;
JLE .L4_loopE;
ALIGN_4
.L4_loopB:;
-#### Unroll times 1 ####
+#### Unroll times 1 ####
MUL_SY yvec0, yvec2, yvec6;
SHUF_SY $0x03, yvec2, yvec2, yvec4;
ODUP_SY 0*SIZE(ptrbb), yvec2
@@ -802,8 +802,8 @@ JLE .L5_loopE;
ALIGN_4
.L5_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
-MOVQ bb, ptrbb;
-#else
+MOVQ bb, ptrbb;
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -832,7 +832,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $8, %rax;
#endif
MOVQ %rax, kkk;
@@ -872,7 +872,7 @@ ODUP_SX 8*SIZE(ptrbb), xvec3;
MUL_SX xvec0, xvec5, xvec5;
ADD_SX xvec5, xvec8, xvec8;
-#### Unroll time 2 ####
+#### Unroll time 2 ####
SHUF_SX $0x4e, xvec2, xvec4;
MUL_SX xvec1, xvec2, xvec2;
ADD_SX xvec2, xvec15, xvec15;
@@ -902,7 +902,7 @@ ODUP_SX 16*SIZE(ptrbb), xvec3;
MUL_SX xvec1, xvec5, xvec5;
ADD_SX xvec5, xvec8, xvec8;
-#### Unroll time 3 ####
+#### Unroll time 3 ####
SHUF_SX $0x4e, xvec2, xvec4;
MUL_SX xvec0, xvec2, xvec2;
ADD_SX xvec2, xvec15, xvec15;
@@ -933,7 +933,7 @@ MUL_SX xvec0, xvec5, xvec5;
ADD_SX xvec5, xvec8, xvec8;
ADDQ $16*SIZE, ptrba;
-#### Unroll time 4 ####
+#### Unroll time 4 ####
SHUF_SX $0x4e, xvec2, xvec4;
MUL_SX xvec1, xvec2, xvec2;
ADD_SX xvec2, xvec15, xvec15;
@@ -1005,7 +1005,7 @@ ODUP_SX 8*SIZE(ptrbb), xvec3;
MUL_SX xvec0, xvec5, xvec5;
ADD_SX xvec5, xvec8, xvec8;
-#### Unroll time 2 ####
+#### Unroll time 2 ####
ADDQ $8*SIZE, ptrba;
SHUF_SX $0x4e, xvec2, xvec4;
MUL_SX xvec1, xvec2, xvec2;
@@ -1099,7 +1099,7 @@ REVS_SX $0xe4, xvec7, xvec9, xvec9;
MOV_SX xvec10, xvec7;
REVS_SX $0xe4, xvec8, xvec10, xvec10;
REVS_SX $0xe4, xvec7, xvec8, xvec8;
-#### Testing Alignment ####
+#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
@@ -1200,8 +1200,8 @@ JLE .L6_loopE;
ALIGN_4
.L6_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
-MOVQ bb, ptrbb;
-#else
+MOVQ bb, ptrbb;
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -1220,7 +1220,7 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
@@ -1419,7 +1419,7 @@ ALIGN_4
.L7_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
@@ -1440,7 +1440,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $8, %rax;
#endif
MOVQ %rax, kkk;
@@ -1614,7 +1614,7 @@ ALIGN_4
.L21_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -1643,7 +1643,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -1907,7 +1907,7 @@ MUL_SX xvec7, xvec11, xvec11;
MUL_SX xvec7, xvec10, xvec10;
MUL_SX xvec7, xvec9, xvec9;
MUL_SX xvec7, xvec8, xvec8;
-#### Writing Back ####
+#### Writing Back ####
#ifndef TRMMKERNEL
LDL_SX 0*SIZE(C0), xvec0, xvec0;
LDH_SX 2*SIZE(C1), xvec0, xvec0;
@@ -1971,8 +1971,8 @@ JLE .L22_loopE;
ALIGN_4
.L22_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
-MOVQ bb, ptrbb;
-#else
+MOVQ bb, ptrbb;
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -1994,7 +1994,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -2188,8 +2188,8 @@ JLE .L23_loopE;
ALIGN_4
.L23_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
-MOVQ bb, ptrbb;
-#else
+MOVQ bb, ptrbb;
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -2205,7 +2205,7 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
@@ -2342,7 +2342,7 @@ ALIGN_4
.L24_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
@@ -2361,7 +2361,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -2489,7 +2489,7 @@ ALIGN_4
.L31_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -2507,11 +2507,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -2721,7 +2721,7 @@ ALIGN_4
.L32_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
@@ -2737,11 +2737,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -2873,7 +2873,7 @@ ALIGN_4
.L33_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
@@ -2891,11 +2891,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -3017,7 +3017,7 @@ ALIGN_4
.L34_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -3033,13 +3033,13 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $2, %rax;
-#endif
+#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;
@@ -3136,7 +3136,7 @@ addq $1*SIZE, ptrba;
addq $2*SIZE, ptrbb
.L343_loopE:
-#### Writing back ####
+#### Writing back ####
movss MEMALPHA, xvec7;
mulss xvec7, xvec15;
mulss xvec7, xvec14;
@@ -3186,7 +3186,7 @@ ALIGN_4
.L41_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax
LEAQ (, %rax, SIZE), %rax;
@@ -3201,11 +3201,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -3333,11 +3333,11 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk
@@ -3437,13 +3437,13 @@ ALIGN_4
.L43_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax
LEAQ (, %rax, SIZE), %rax
LEAQ (ptrba, %rax, 2), ptrba
ADDQ %rax, ptrbb;
-#endif
+#endif
XOR_SY yvec15, yvec15, yvec15;
XOR_SY yvec14, yvec14, yvec14;
#ifndef TRMMKERNEL
@@ -3452,7 +3452,7 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
@@ -3576,9 +3576,9 @@ TEST $1, bm;
JLE .L44_loopE;
ALIGN_4
.L44_bodyB:
-#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
-#else
+#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
@@ -3592,7 +3592,7 @@ MOVQ bk, k;
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
-#else
+#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
diff --git a/kernel/x86_64/sgemv_n.S b/kernel/x86_64/sgemv_n.S
index 7641056..8f64fe5 100644
--- a/kernel/x86_64/sgemv_n.S
+++ b/kernel/x86_64/sgemv_n.S
@@ -48,7 +48,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_A %rcx
@@ -66,7 +66,7 @@
#else
#define STACKSIZE 288
-
+
#define OLD_M %rcx
#define OLD_N %rdx
#define OLD_A 40 + STACKSIZE(%rsp)
@@ -210,9 +210,9 @@
jle .L999
subq $-32 * SIZE, A
-
+
movq BUFFER, Y1
-
+
pxor %xmm0, %xmm0
movq M, %rax
@@ -301,7 +301,7 @@
testq $SIZE, A1
je .L1X
-
+
movss -32 * SIZE(A1), %xmm4
movss -32 * SIZE(A1, LDA, 1), %xmm5
movss -32 * SIZE(A1, LDA, 2), %xmm6
@@ -907,7 +907,7 @@
testq $SIZE, A1
je .L2X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(A1, LDA), %xmm1
movss -32 * SIZE(A2), %xmm2
@@ -1301,7 +1301,7 @@
testq $SIZE, A1
je .L3X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(A1, LDA), %xmm1
movss -32 * SIZE(A2), %xmm2
@@ -1628,7 +1628,7 @@
testq $SIZE, A1
je .L4X
-
+
movss -32 * SIZE(Y1), %xmm8
movss -32 * SIZE(A1), %xmm0
@@ -1885,7 +1885,7 @@
testq $SIZE, A1
je .L5X
-
+
movss -32 * SIZE(Y1), %xmm8
movss -32 * SIZE(A1), %xmm0
@@ -2066,7 +2066,7 @@
.L100:
testq $2 * SIZE - 1, LDA
jne .L200
-
+
cmpq $4, N
jl .L110
ALIGN_3
@@ -2105,7 +2105,7 @@
testq $SIZE, A1
je .L10X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(A1, LDA), %xmm1
movss -32 * SIZE(A2), %xmm2
@@ -2513,7 +2513,7 @@
testq $SIZE, A1
je .L11X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(A1, LDA), %xmm1
movss -32 * SIZE(A2), %xmm2
@@ -2846,7 +2846,7 @@
testq $SIZE, A1
je .L12X
-
+
movss -32 * SIZE(Y1), %xmm8
movss -32 * SIZE(A1), %xmm0
@@ -3116,7 +3116,7 @@
testq $SIZE, A1
je .L13X
-
+
movss -32 * SIZE(Y1), %xmm8
movss -32 * SIZE(A1), %xmm0
@@ -3332,7 +3332,7 @@
testq $SIZE, A1
je .L20X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(A1, LDA), %xmm1
movss -32 * SIZE(A2), %xmm2
@@ -3776,7 +3776,7 @@
testq $SIZE, A1
je .L21X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(A1, LDA), %xmm1
movss -32 * SIZE(A2), %xmm2
@@ -4139,7 +4139,7 @@
testq $SIZE, A1
je .L22X
-
+
movss -32 * SIZE(Y1), %xmm9
movss -32 * SIZE(A1), %xmm0
@@ -4423,7 +4423,7 @@
testq $SIZE, A1
je .L23X
-
+
movss -32 * SIZE(Y1), %xmm8
movss -32 * SIZE(A1), %xmm0
@@ -4637,7 +4637,7 @@
testq $SIZE, A1
je .L30X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(A1, LDA), %xmm1
movss -32 * SIZE(A2), %xmm2
@@ -5080,7 +5080,7 @@
testq $SIZE, A1
je .L31X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(A1, LDA), %xmm1
movss -32 * SIZE(A2), %xmm2
@@ -5439,7 +5439,7 @@
testq $SIZE, A1
je .L32X
-
+
movss -32 * SIZE(Y1), %xmm9
movss -32 * SIZE(A1), %xmm0
@@ -5722,7 +5722,7 @@
testq $SIZE, A1
je .L33X
-
+
movss -32 * SIZE(Y1), %xmm8
movss -32 * SIZE(A1), %xmm0
diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S
index c2cb6b9..33e2fa8 100644
--- a/kernel/x86_64/sgemv_t.S
+++ b/kernel/x86_64/sgemv_t.S
@@ -43,11 +43,11 @@
#undef GEMV_UNROLL
#define GEMV_UNROLL 4
#endif
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_A %rcx
@@ -64,7 +64,7 @@
#else
#define STACKSIZE 288
-
+
#define OLD_M %rcx
#define OLD_N %rdx
#define OLD_A 40 + STACKSIZE(%rsp)
@@ -161,7 +161,7 @@
subq M,MMM
jge .L00t
ALIGN_4
-
+
movq MMM,%rax
addq M,%rax
jle .L999x
@@ -213,7 +213,7 @@
#endif
movq BUFFER, X1
-
+
movq M, I
sarq $3, I
jle .L05
@@ -307,7 +307,7 @@
testq $SIZE, A1
je .L1X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -991,7 +991,7 @@
testq $SIZE, A1
je .L2X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -1018,26 +1018,26 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -1279,29 +1279,29 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -1415,7 +1415,7 @@
testq $SIZE, A1
je .L3X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -1442,26 +1442,26 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -1665,29 +1665,29 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
addq $2 * SIZE, A1
addq $2 * SIZE, A2
@@ -1782,7 +1782,7 @@
testq $SIZE, A1
je .L4X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -1803,14 +1803,14 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
@@ -1972,17 +1972,17 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
@@ -2054,7 +2054,7 @@
testq $SIZE, A1
je .L5X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -2072,7 +2072,7 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
@@ -2194,11 +2194,11 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
@@ -2270,7 +2270,7 @@
testq $SIZE, A1
je .L10X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -2297,26 +2297,26 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -2582,29 +2582,29 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -2715,7 +2715,7 @@
testq $SIZE, A1
je .L11X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -2739,20 +2739,20 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
@@ -2964,23 +2964,23 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
@@ -3075,7 +3075,7 @@
testq $SIZE, A1
je .L12X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -3096,14 +3096,14 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
@@ -3277,17 +3277,17 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
@@ -3358,7 +3358,7 @@
testq $SIZE, A1
je .L13X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -3376,7 +3376,7 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
@@ -3497,11 +3497,11 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
@@ -3571,7 +3571,7 @@
testq $SIZE, A1
je .L20X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -3598,26 +3598,26 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -3927,29 +3927,29 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -4060,7 +4060,7 @@
testq $SIZE, A1
je .L21X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -4084,20 +4084,20 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
@@ -4364,23 +4364,23 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
@@ -4478,7 +4478,7 @@
testq $SIZE, A1
je .L22X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -4499,14 +4499,14 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
@@ -4693,17 +4693,17 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
@@ -4791,7 +4791,7 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
@@ -4912,11 +4912,11 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
@@ -4983,7 +4983,7 @@
testq $SIZE, A1
je .L30X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -5010,26 +5010,26 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -5339,29 +5339,29 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
#ifdef movsd
xorps %xmm3, %xmm3
-#endif
+#endif
movsd -32 * SIZE(A2, LDA), %xmm3
mulps %xmm4, %xmm3
addps %xmm3, %xmm11
@@ -5475,7 +5475,7 @@
testq $SIZE, A1
je .L31X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -5499,20 +5499,20 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
@@ -5774,23 +5774,23 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A1, LDA), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
#ifdef movsd
xorps %xmm2, %xmm2
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm2
mulps %xmm4, %xmm2
addps %xmm2, %xmm10
@@ -5884,7 +5884,7 @@
testq $SIZE, A1
je .L32X
-
+
movss -32 * SIZE(A1), %xmm0
movss -32 * SIZE(X1), %xmm4
mulss %xmm4, %xmm0
@@ -5905,14 +5905,14 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
@@ -6098,17 +6098,17 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
#ifdef movsd
xorps %xmm1, %xmm1
-#endif
+#endif
movsd -32 * SIZE(A2), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm9
@@ -6196,7 +6196,7 @@
#ifdef movsd
xorps %xmm0, %xmm0
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
@@ -6317,11 +6317,11 @@
#ifdef movsd
xorps %xmm0, %xmm0
-#endif
+#endif
movsd -32 * SIZE(A1), %xmm0
#ifdef movsd
xorps %xmm4, %xmm4
-#endif
+#endif
movsd -32 * SIZE(X1), %xmm4
mulps %xmm4, %xmm0
addps %xmm0, %xmm8
diff --git a/kernel/x86_64/swap.S b/kernel/x86_64/swap.S
index 50a7fb5..9529724 100644
--- a/kernel/x86_64/swap.S
+++ b/kernel/x86_64/swap.S
@@ -60,7 +60,7 @@
PROLOGUE
PROFCODE
-
+
#ifndef WINDOWS_ABI
#ifndef XDOUBLE
movq 8(%rsp), INCY
diff --git a/kernel/x86_64/swap_sse.S b/kernel/x86_64/swap_sse.S
index 5702870..dc964da 100644
--- a/kernel/x86_64/swap_sse.S
+++ b/kernel/x86_64/swap_sse.S
@@ -81,7 +81,7 @@
subq $-32 * SIZE, X
subq $-32 * SIZE, Y
-
+
cmpq $3, M
jle .L16
@@ -307,7 +307,7 @@
.L20:
movaps -33 * SIZE(X), %xmm0
movaps -32 * SIZE(Y), %xmm1
-
+
movss %xmm1, -32 * SIZE(X)
pshufd $0x39, %xmm1, %xmm3
movlps %xmm3, -31 * SIZE(X)
@@ -791,7 +791,7 @@
.L40:
movaps -35 * SIZE(X), %xmm0
movaps -32 * SIZE(Y), %xmm1
-
+
movss %xmm1, -32 * SIZE(X)
subq $3, M
diff --git a/kernel/x86_64/swap_sse2.S b/kernel/x86_64/swap_sse2.S
index 5f16419..e9260b9 100644
--- a/kernel/x86_64/swap_sse2.S
+++ b/kernel/x86_64/swap_sse2.S
@@ -97,7 +97,7 @@
.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
-
+
testq $SIZE, X
jne .L20
diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S
index 5083d0b..cda0b47 100644
--- a/kernel/x86_64/symv_L_sse.S
+++ b/kernel/x86_64/symv_L_sse.S
@@ -97,7 +97,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 80
-
+
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
@@ -105,14 +105,14 @@
#define M ARG1
#define N ARG2
#define A ARG3
-#define LDA ARG4
+#define LDA ARG4
#define X ARG5
-#define INCX ARG6
+#define INCX ARG6
#else
#define STACKSIZE 256
-
+
#define OLD_LDA 40 + STACKSIZE(%rsp)
#define OLD_X 48 + STACKSIZE(%rsp)
#define OLD_INCX 56 + STACKSIZE(%rsp)
diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S
index a8bbb1c..0afc1e8 100644
--- a/kernel/x86_64/symv_L_sse2.S
+++ b/kernel/x86_64/symv_L_sse2.S
@@ -97,7 +97,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 80
-
+
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
@@ -105,14 +105,14 @@
#define M ARG1
#define N ARG2
#define A ARG3
-#define LDA ARG4
+#define LDA ARG4
#define X ARG5
-#define INCX ARG6
+#define INCX ARG6
#else
#define STACKSIZE 256
-
+
#define OLD_LDA 40 + STACKSIZE(%rsp)
#define OLD_X 48 + STACKSIZE(%rsp)
#define OLD_INCX 56 + STACKSIZE(%rsp)
diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S
index 47af772..691012c 100644
--- a/kernel/x86_64/symv_U_sse.S
+++ b/kernel/x86_64/symv_U_sse.S
@@ -97,7 +97,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 80
-
+
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
@@ -105,14 +105,14 @@
#define M ARG1
#define IS ARG2
#define A ARG3
-#define LDA ARG4
+#define LDA ARG4
#define X ARG5
-#define INCX ARG6
+#define INCX ARG6
#else
#define STACKSIZE 256
-
+
#define OLD_LDA 40 + STACKSIZE(%rsp)
#define OLD_X 48 + STACKSIZE(%rsp)
#define OLD_INCX 56 + STACKSIZE(%rsp)
diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S
index 57d8c2a..8ecbb39 100644
--- a/kernel/x86_64/symv_U_sse2.S
+++ b/kernel/x86_64/symv_U_sse2.S
@@ -97,7 +97,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 80
-
+
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
@@ -105,14 +105,14 @@
#define M ARG1
#define IS ARG2
#define A ARG3
-#define LDA ARG4
+#define LDA ARG4
#define X ARG5
-#define INCX ARG6
+#define INCX ARG6
#else
#define STACKSIZE 256
-
+
#define OLD_LDA 40 + STACKSIZE(%rsp)
#define OLD_X 48 + STACKSIZE(%rsp)
#define OLD_INCX 56 + STACKSIZE(%rsp)
@@ -213,7 +213,7 @@
movq IS, TEMP
imulq LDA, TEMP
addq TEMP, A
-
+
unpcklpd ALPHA, ALPHA
movq BUFFER, XX
diff --git a/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S
index d70bede..8deff20 100644
--- a/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S
+++ b/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -94,7 +94,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -181,7 +181,7 @@
movq K, %rax
salq $3 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 8), %rax
subq %rax, C
#endif
@@ -196,7 +196,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -226,7 +226,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -246,7 +246,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -681,7 +681,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
movq M, I
@@ -705,7 +705,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -747,7 +747,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm12
@@ -1373,7 +1373,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L29:
#ifdef LN
@@ -1412,7 +1412,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -1427,7 +1427,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1453,7 +1453,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1473,7 +1473,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -1699,7 +1699,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L40:
movq M, I
@@ -1723,7 +1723,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -1751,7 +1751,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -2092,7 +2092,7 @@
decq I
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L49:
#ifdef LN
@@ -2127,7 +2127,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -2142,7 +2142,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2168,7 +2168,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -2186,7 +2186,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
mulpd %xmm0, %xmm1
movddup -15 * SIZE(AO), %xmm0
addpd %xmm1, %xmm8
@@ -2342,7 +2342,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
movq M, I
@@ -2366,7 +2366,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -2390,7 +2390,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -2636,7 +2636,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -2649,7 +2649,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2675,7 +2675,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movhps -15 * SIZE(AO), %xmm0
@@ -2695,7 +2695,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
mulpd %xmm0, %xmm1
movsd -14 * SIZE(AO), %xmm0
movhps -13 * SIZE(AO), %xmm0
@@ -2816,7 +2816,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L80:
@@ -2841,7 +2841,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -2864,7 +2864,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -3025,7 +3025,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L89:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_LN_4x2_atom.S b/kernel/x86_64/trsm_kernel_LN_4x2_atom.S
index 6ba2fc4..a192674 100644
--- a/kernel/x86_64/trsm_kernel_LN_4x2_atom.S
+++ b/kernel/x86_64/trsm_kernel_LN_4x2_atom.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -90,7 +90,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -146,7 +146,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -158,7 +158,7 @@
sarq $1, J
jle .L40
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -170,7 +170,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -185,7 +185,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 1, %rax
@@ -214,7 +214,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm7, %xmm7
@@ -411,7 +411,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -432,7 +432,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -738,7 +738,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
movq M, I
@@ -761,7 +761,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -807,7 +807,7 @@
addsd %xmm7, %xmm14
movsd 3 * SIZE(AO), %xmm7
mulsd %xmm3, %xmm2
-
+
addsd %xmm6, %xmm15
PREFETCH (PREFETCHSIZE + 0) * SIZE(BO)
movaps %xmm4, %xmm6
@@ -1337,7 +1337,7 @@
movq K, %rax
salq $0 + BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -1350,7 +1350,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1374,7 +1374,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm5, %xmm5
@@ -1528,7 +1528,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -1548,7 +1548,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -1741,7 +1741,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
movq M, I
@@ -1764,7 +1764,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm9, %xmm9
@@ -2066,7 +2066,7 @@
decq I # i --
jg .L41
- ALIGN_4
+ ALIGN_4
.L69:
#ifdef LN
@@ -2086,7 +2086,7 @@
subq $1, KK
#endif
ALIGN_2
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S
index 4cdaff3..69278bb 100644
--- a/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S
+++ b/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -300,7 +300,7 @@
movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\
movapd %xmm0, %xmm2 ;\
addq $8 * SIZE, %rax
-
+
#define KERNEL_SUB1(xx) \
mulpd %xmm1, %xmm0 ;\
mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\
@@ -405,7 +405,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
@@ -470,7 +470,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -508,7 +508,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 2, %rax
@@ -541,7 +541,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -806,7 +806,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -833,7 +833,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1170,7 +1170,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
movq M, I
@@ -1198,7 +1198,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
movddup -16 * SIZE(BO), %xmm1
@@ -1741,7 +1741,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
@@ -1794,7 +1794,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1825,7 +1825,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 1), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1999,7 +1999,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -2024,7 +2024,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2253,7 +2253,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
movq M, I
@@ -2281,7 +2281,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm1
movddup -15 * SIZE(BO), %xmm5
@@ -2622,8 +2622,8 @@
decq I # i --
jg .L51
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2670,7 +2670,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2698,7 +2698,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2833,13 +2833,13 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
je .L110
-#ifdef LN
+#ifdef LN
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, AORIG
@@ -2857,7 +2857,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm0
pxor %xmm8, %xmm8
@@ -3045,7 +3045,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
movq M, I
@@ -3072,7 +3072,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -3341,7 +3341,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -3360,7 +3360,7 @@
subq $1, KK
#endif
ALIGN_4
-
+
.L999:
movq (%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_core2.S b/kernel/x86_64/trsm_kernel_LN_4x4_core2.S
index fc5284a..3a16bfc 100644
--- a/kernel/x86_64/trsm_kernel_LN_4x4_core2.S
+++ b/kernel/x86_64/trsm_kernel_LN_4x4_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -83,7 +83,7 @@
#define AORIG 32(%rsp)
#define BORIG 40(%rsp)
#define BUFFER 128(%rsp)
-
+
#define PREFETCH_R (8 * 4 + 0)
#define PREFETCH_W (PREFETCH_R)
@@ -92,7 +92,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -161,7 +161,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -180,7 +180,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq 16 * SIZE + BUFFER, BO
@@ -196,7 +196,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 4), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -212,7 +212,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
movapd -16 * SIZE(B), %xmm0
@@ -241,7 +241,7 @@
unpckhpd %xmm6, %xmm6
movddup %xmm7, %xmm15
unpckhpd %xmm7, %xmm7
-
+
prefetcht0 (PREFETCH_W + 0) * SIZE(BO)
movapd %xmm8, -16 * SIZE(BO)
movapd %xmm0, -14 * SIZE(BO)
@@ -303,7 +303,7 @@
subq $1, %rax
jne .L04
ALIGN_4
-
+
.L10:
leaq (PREFETCH_R + 0) * SIZE(B), BB
@@ -348,7 +348,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -647,7 +647,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -674,7 +674,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1040,7 +1040,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
movq M, I
@@ -1068,7 +1068,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
prefetcht2 0 * SIZE(BB)
@@ -1114,7 +1114,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm10
@@ -1739,9 +1739,9 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
+
-
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1777,12 +1777,12 @@
.L41:
/* Copying to Sub Buffer */
-
+
#ifdef LN
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -1798,7 +1798,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1814,7 +1814,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -1864,7 +1864,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1907,7 +1907,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2113,8 +2113,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L60:
testq $2, M
je .L70
@@ -2140,7 +2140,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2385,7 +2385,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
movq M, I
@@ -2413,7 +2413,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2797,7 +2797,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
@@ -2833,7 +2833,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -2849,7 +2849,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2865,7 +2865,7 @@
sarq $3, %rax
jle .L83
ALIGN_4
-
+
.L82:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -2913,7 +2913,7 @@
subq $1, %rax
jne .L84
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2954,7 +2954,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -3124,7 +3124,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -3151,7 +3151,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -3352,7 +3352,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
movq M, I
@@ -3380,7 +3380,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
#ifdef LN
@@ -3683,7 +3683,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -3707,7 +3707,7 @@
#endif
ALIGN_4
-
+
.L999:
movq %r15, %rsp
diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S b/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S
index 09f9122..3bc7ae1 100644
--- a/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S
+++ b/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -95,7 +95,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -180,7 +180,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -195,7 +195,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 2, %rax
@@ -225,7 +225,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -247,7 +247,7 @@
jle .L35
ALIGN_4
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -481,7 +481,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -503,7 +503,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm3, %xmm3
@@ -526,7 +526,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
addpd %xmm3, %xmm11
movaps -14 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -857,7 +857,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
movq M, I
@@ -881,7 +881,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -936,7 +936,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
addpd %xmm3, %xmm11
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -14 * SIZE(BO), %xmm3
@@ -1666,7 +1666,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
@@ -1705,7 +1705,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -1720,7 +1720,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 1, %rax
@@ -1730,7 +1730,7 @@
movq OFFSET, %rax
movq %rax, KK
#endif
-
+
testq $1, M
BRANCH
jle .L60
@@ -1750,7 +1750,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -1769,7 +1769,7 @@
jle .L75
ALIGN_4
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -1935,7 +1935,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1957,7 +1957,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1977,7 +1977,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -2196,7 +2196,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
movq M, I
@@ -2220,7 +2220,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -2257,7 +2257,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
mulpd %xmm0, %xmm2
@@ -2596,7 +2596,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
@@ -2631,7 +2631,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -2644,7 +2644,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2654,7 +2654,7 @@
testq $1, M
BRANCH
jle .L90
-
+
#ifdef LN
movq K, %rax
salq $BASE_SHIFT, %rax
@@ -2669,7 +2669,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movsd -16 * SIZE(BO), %xmm2
@@ -2688,7 +2688,7 @@
jle .L115
ALIGN_4
-.L112:
+.L112:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulsd %xmm0, %xmm2
@@ -2828,7 +2828,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L90:
testq $2, M
@@ -2850,7 +2850,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2869,7 +2869,7 @@
jle .L105
ALIGN_4
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -3049,7 +3049,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
movq M, I
@@ -3073,7 +3073,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -3100,7 +3100,7 @@
jle .L95
ALIGN_4
-.L92:
+.L92:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -3377,7 +3377,7 @@
decq I
BRANCH
jg .L91
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S b/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S
index ca0bfbd..c846080 100644
--- a/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S
+++ b/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -212,10 +212,10 @@
movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
addpd %xmm14, %xmm7 ;\
movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
-
+
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -284,7 +284,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -303,7 +303,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -319,7 +319,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 4), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -337,7 +337,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCHNTA 40 * SIZE(B)
@@ -406,7 +406,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -449,7 +449,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -791,8 +791,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L20:
testq $2, M
je .L30
@@ -818,7 +818,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1255,7 +1255,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
@@ -1284,7 +1284,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(BO), %xmm9
movapd 2 * SIZE(BO), %xmm11
@@ -1329,7 +1329,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -1968,7 +1968,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
@@ -2005,12 +2005,12 @@
.L41:
/* Copying to Sub Buffer */
-
+
#ifdef LN
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -2026,7 +2026,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2042,7 +2042,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
PREFETCH 56 * SIZE(B)
@@ -2105,7 +2105,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2148,7 +2148,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2379,7 +2379,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -2406,7 +2406,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2688,7 +2688,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
movq M, I
@@ -2716,7 +2716,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3163,7 +3163,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
@@ -3199,7 +3199,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -3215,7 +3215,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -3231,7 +3231,7 @@
sarq $3, %rax
jle .L83
ALIGN_4
-
+
.L82:
PREFETCH 56 * SIZE(B)
@@ -3291,7 +3291,7 @@
decq %rax
jne .L84
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movq A, AO
@@ -3332,7 +3332,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3511,7 +3511,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -3538,7 +3538,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3748,7 +3748,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
movq M, I
@@ -3776,7 +3776,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -4095,7 +4095,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S b/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S
index 66a5e40..fedeb5a 100644
--- a/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S
+++ b/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -333,7 +333,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -390,7 +390,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -402,7 +402,7 @@
sarq $2, J # j = (n >> 2)
jle .L40
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -414,7 +414,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -429,7 +429,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 2, %rax
@@ -460,7 +460,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -756,8 +756,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L30:
testq $2, M
BRANCH
@@ -780,7 +780,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1159,7 +1159,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
movq M, I
@@ -1183,7 +1183,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -1233,7 +1233,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -2121,7 +2121,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -2136,7 +2136,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 1, %rax
@@ -2167,7 +2167,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2362,9 +2362,9 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
-.L70:
+.L70:
testq $2, M
je .L60
ALIGN_4
@@ -2384,7 +2384,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2631,7 +2631,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
movq M, I
@@ -2654,7 +2654,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -3060,9 +3060,9 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
+
-
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -3097,7 +3097,7 @@
movq K, %rax
salq $0 + BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -3110,7 +3110,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -3136,7 +3136,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3289,7 +3289,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $2, M
@@ -3311,7 +3311,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3505,7 +3505,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L100:
movq M, I
@@ -3528,7 +3528,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3823,7 +3823,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -3843,7 +3843,7 @@
subq $1, KK
#endif
ALIGN_2
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S
index 28c2ca0..8717fc3 100644
--- a/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S
+++ b/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -94,7 +94,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -180,7 +180,7 @@
movq K, %rax
salq $3 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 8), %rax
subq %rax, C
#endif
@@ -195,7 +195,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -224,7 +224,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -243,7 +243,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -713,7 +713,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -734,7 +734,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -759,7 +759,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -904,7 +904,7 @@
movaps %xmm8, %xmm4
shufps $0x88, %xmm9, %xmm8
shufps $0xdd, %xmm9, %xmm4
-
+
movaps %xmm10, %xmm5
shufps $0x88, %xmm11, %xmm10
shufps $0xdd, %xmm11, %xmm5
@@ -1327,7 +1327,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
movq M, I
@@ -1351,7 +1351,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
prefetchnta -32 * SIZE(BB)
subq $-16 * SIZE, BB
@@ -1393,7 +1393,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm12
@@ -2195,8 +2195,8 @@
decq I
BRANCH
jg .L11
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2233,7 +2233,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -2248,7 +2248,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2273,7 +2273,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -2291,7 +2291,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -2522,7 +2522,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -2543,7 +2543,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2563,7 +2563,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -2661,7 +2661,7 @@
movaps %xmm8, %xmm4
shufps $0x88, %xmm9, %xmm8
shufps $0xdd, %xmm9, %xmm4
-
+
movaps -32 * SIZE(BO), %xmm0
movaps -28 * SIZE(BO), %xmm1
@@ -2840,7 +2840,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
movq M, I
@@ -2864,7 +2864,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -2892,7 +2892,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -3301,7 +3301,7 @@
decq I
BRANCH
jg .L41
- ALIGN_4
+ ALIGN_4
.L69:
#ifdef LN
@@ -3335,7 +3335,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -3350,7 +3350,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -3375,7 +3375,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -3393,7 +3393,7 @@
jle .L95
ALIGN_3
-.L92:
+.L92:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movsd -32 * SIZE(BO), %xmm2
@@ -3556,7 +3556,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -3577,7 +3577,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -3597,7 +3597,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
addps %xmm1, %xmm8
movsd -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -3677,7 +3677,7 @@
#if defined(LN) || defined(LT)
pshufd $0xd8, %xmm8, %xmm8
-
+
movaps -32 * SIZE(BO), %xmm0
#else
movaps -32 * SIZE(AO), %xmm0
@@ -3782,7 +3782,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L90:
movq M, I
@@ -3806,7 +3806,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -3829,7 +3829,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -4110,8 +4110,8 @@
decq I
BRANCH
jg .L71
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -4144,7 +4144,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -4157,7 +4157,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -4182,7 +4182,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movss -32 * SIZE(AO), %xmm0
@@ -4199,7 +4199,7 @@
jle .L125
ALIGN_3
-.L122:
+.L122:
addss %xmm2, %xmm8
movss -32 * SIZE(BO), %xmm2
mulss %xmm0, %xmm2
@@ -4325,7 +4325,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $2, M
@@ -4346,7 +4346,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -4363,7 +4363,7 @@
jle .L115
ALIGN_3
-.L112:
+.L112:
addps %xmm1, %xmm8
movss -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -4549,7 +4549,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -4568,7 +4568,7 @@
jle .L105
ALIGN_3
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -4798,7 +4798,7 @@
decq I
BRANCH
jg .L101
- ALIGN_4
+ ALIGN_4
.L129:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S
index 552dbac..a318747 100644
--- a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S
+++ b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -107,11 +107,11 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
EMMS
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -179,7 +179,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -198,10 +198,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $2 + BASE_SHIFT, %rax
@@ -214,7 +214,7 @@
salq $2 + BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -230,7 +230,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -312,7 +312,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -352,7 +352,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -731,7 +731,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -756,7 +756,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 8 * SIZE(AO), %xmm10
@@ -1179,7 +1179,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $4, M
@@ -1204,7 +1204,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1700,7 +1700,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L40:
movq M, I
@@ -1728,7 +1728,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(BO), %xmm9
movaps 4 * SIZE(BO), %xmm11
@@ -1762,7 +1762,7 @@
sarq $2, %rax
je .L15
ALIGN_4
-
+
.L12:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
@@ -2574,7 +2574,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L49:
#ifdef LN
@@ -2608,10 +2608,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + BASE_SHIFT, %rax
@@ -2624,7 +2624,7 @@
salq $1 + BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2640,7 +2640,7 @@
sarq $2, %rax
jle .L53
ALIGN_4
-
+
.L52:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -2697,7 +2697,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2736,7 +2736,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -2987,7 +2987,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $2, M
@@ -3012,7 +3012,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 8 * SIZE(AO), %xmm10
@@ -3306,7 +3306,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L80:
testq $4, M
@@ -3331,7 +3331,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -3695,7 +3695,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L90:
movq M, I
@@ -3723,7 +3723,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -4427,7 +4427,7 @@
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
@@ -4459,10 +4459,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $BASE_SHIFT, %rax
@@ -4475,7 +4475,7 @@
salq $BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -4548,7 +4548,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
#if defined(LT) || defined(RN)
movq A, AO
@@ -4586,7 +4586,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -4753,7 +4753,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L120:
testq $2, M
@@ -4778,7 +4778,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 8 * SIZE(AO), %xmm10
@@ -5001,7 +5001,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L130:
testq $4, M
@@ -5026,7 +5026,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -5311,7 +5311,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L140:
movq M, I
@@ -5339,7 +5339,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -5895,7 +5895,7 @@
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L149:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S
index b04299a..39ed586 100644
--- a/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S
+++ b/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -94,7 +94,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -136,7 +136,7 @@
movq OLD_LDC, LDC
movq OLD_OFFSET, KK
-
+
leaq (, LDC, SIZE), LDC
movq KK, OFFSET
@@ -181,7 +181,7 @@
movq K, %rax
salq $3 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 8), %rax
subq %rax, C
#endif
@@ -196,7 +196,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -228,7 +228,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -270,7 +270,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm12
@@ -898,7 +898,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -920,7 +920,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -940,7 +940,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -1376,7 +1376,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L29:
#ifdef LN
@@ -1415,7 +1415,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -1430,7 +1430,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1458,7 +1458,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -1486,7 +1486,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -1827,7 +1827,7 @@
decq I
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1849,7 +1849,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1869,7 +1869,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -2095,7 +2095,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L49:
#ifdef LN
@@ -2130,7 +2130,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -2145,7 +2145,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2173,7 +2173,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -2197,7 +2197,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -2430,7 +2430,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -2448,7 +2448,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
mulpd %xmm0, %xmm1
movddup -15 * SIZE(AO), %xmm0
addpd %xmm1, %xmm8
@@ -2604,7 +2604,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L69:
#ifdef LN
@@ -2639,7 +2639,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -2652,7 +2652,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2680,7 +2680,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -2703,7 +2703,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -2864,7 +2864,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L80:
testq $1, M
@@ -2886,7 +2886,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movhps -15 * SIZE(AO), %xmm0
@@ -2906,7 +2906,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
mulpd %xmm0, %xmm1
movsd -14 * SIZE(AO), %xmm0
movhps -13 * SIZE(AO), %xmm0
@@ -3027,7 +3027,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L89:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_LT_4x2_atom.S b/kernel/x86_64/trsm_kernel_LT_4x2_atom.S
index c6ad0a2..04b7e2d 100644
--- a/kernel/x86_64/trsm_kernel_LT_4x2_atom.S
+++ b/kernel/x86_64/trsm_kernel_LT_4x2_atom.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -90,7 +90,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -146,7 +146,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -158,7 +158,7 @@
sarq $1, J
jle .L40
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -170,7 +170,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -185,7 +185,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 1, %rax
@@ -216,7 +216,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -262,7 +262,7 @@
addsd %xmm7, %xmm14
movsd 3 * SIZE(AO), %xmm7
mulsd %xmm3, %xmm2
-
+
addsd %xmm6, %xmm15
PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
movaps %xmm4, %xmm6
@@ -775,7 +775,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -1081,7 +1081,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1102,7 +1102,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm7, %xmm7
@@ -1299,8 +1299,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1337,7 +1337,7 @@
movq K, %rax
salq $0 + BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -1350,7 +1350,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1377,7 +1377,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm9, %xmm9
@@ -1679,7 +1679,7 @@
decq I # i --
jg .L41
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -1699,7 +1699,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -1892,7 +1892,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1912,7 +1912,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm5, %xmm5
@@ -2066,7 +2066,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L69:
#ifdef LN
@@ -2086,7 +2086,7 @@
subq $1, KK
#endif
ALIGN_2
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S
index b133bcf..b371200 100644
--- a/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S
+++ b/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -300,7 +300,7 @@
movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\
movapd %xmm0, %xmm2 ;\
addq $8 * SIZE, %rax
-
+
#define KERNEL_SUB1(xx) \
mulpd %xmm1, %xmm0 ;\
mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\
@@ -405,7 +405,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
@@ -470,7 +470,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -508,7 +508,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 2, %rax
@@ -544,7 +544,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
movddup -16 * SIZE(BO), %xmm1
@@ -1087,7 +1087,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $3, M
@@ -1117,7 +1117,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1454,7 +1454,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1479,7 +1479,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1744,8 +1744,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1797,7 +1797,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1829,7 +1829,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm1
movddup -15 * SIZE(BO), %xmm5
@@ -2170,7 +2170,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -2195,7 +2195,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2424,7 +2424,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -2451,7 +2451,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 1), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2625,8 +2625,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2673,7 +2673,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2704,7 +2704,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2973,13 +2973,13 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
je .L110
-#ifdef LN
+#ifdef LN
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, AORIG
@@ -2997,7 +2997,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm0
pxor %xmm8, %xmm8
@@ -3185,7 +3185,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -3211,7 +3211,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -3346,7 +3346,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -3366,7 +3366,7 @@
#endif
ALIGN_4
-
+
.L999:
movq (%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_core2.S b/kernel/x86_64/trsm_kernel_LT_4x4_core2.S
index 7864ec5..c5fdc52 100644
--- a/kernel/x86_64/trsm_kernel_LT_4x4_core2.S
+++ b/kernel/x86_64/trsm_kernel_LT_4x4_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -83,7 +83,7 @@
#define AORIG 32(%rsp)
#define BORIG 40(%rsp)
#define BUFFER 128(%rsp)
-
+
#define PREFETCH_R (8 * 4 + 0)
#define PREFETCH_W (PREFETCH_R)
@@ -92,7 +92,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -161,7 +161,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -180,7 +180,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq 16 * SIZE + BUFFER, BO
@@ -196,7 +196,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 4), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -212,7 +212,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
movapd -16 * SIZE(B), %xmm0
@@ -241,7 +241,7 @@
unpckhpd %xmm6, %xmm6
movddup %xmm7, %xmm15
unpckhpd %xmm7, %xmm7
-
+
prefetcht0 (PREFETCH_W + 0) * SIZE(BO)
movapd %xmm8, -16 * SIZE(BO)
movapd %xmm0, -14 * SIZE(BO)
@@ -303,7 +303,7 @@
subq $1, %rax
jne .L04
ALIGN_4
-
+
.L10:
leaq (PREFETCH_R + 0) * SIZE(B), BB
@@ -349,7 +349,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
prefetcht2 0 * SIZE(BB)
@@ -384,7 +384,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm10
@@ -1009,7 +1009,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $3, M
@@ -1039,7 +1039,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1405,7 +1405,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1432,7 +1432,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1731,8 +1731,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1768,12 +1768,12 @@
.L41:
/* Copying to Sub Buffer */
-
+
#ifdef LN
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -1789,7 +1789,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1805,7 +1805,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -1855,7 +1855,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1899,7 +1899,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2283,7 +2283,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -2310,7 +2310,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2555,7 +2555,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -2582,7 +2582,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2788,8 +2788,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2824,7 +2824,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -2840,7 +2840,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2856,7 +2856,7 @@
sarq $3, %rax
jle .L83
ALIGN_4
-
+
.L82:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -2904,7 +2904,7 @@
subq $1, %rax
jne .L84
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2946,7 +2946,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
#ifdef LN
@@ -3249,7 +3249,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -3276,7 +3276,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -3477,7 +3477,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -3504,7 +3504,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -3674,7 +3674,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -3698,7 +3698,7 @@
#endif
ALIGN_4
-
+
.L999:
movq %r15, %rsp
diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S b/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S
index 77fc0c5..e186b94 100644
--- a/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S
+++ b/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -95,7 +95,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -180,7 +180,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -195,7 +195,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 2, %rax
@@ -227,7 +227,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -282,7 +282,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
addpd %xmm3, %xmm11
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -14 * SIZE(BO), %xmm3
@@ -1012,7 +1012,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -1034,7 +1034,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm3, %xmm3
@@ -1057,7 +1057,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
addpd %xmm3, %xmm11
movaps -14 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -1388,7 +1388,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1410,7 +1410,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -1432,7 +1432,7 @@
jle .L35
ALIGN_4
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -1666,7 +1666,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
@@ -1705,7 +1705,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -1720,7 +1720,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 1, %rax
@@ -1730,7 +1730,7 @@
movq OFFSET, %rax
movq %rax, KK
#endif
-
+
movq M, I
sarq $2, I # i = (m >> 2)
NOBRANCH
@@ -1752,7 +1752,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -1788,7 +1788,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
mulpd %xmm0, %xmm2
@@ -2127,7 +2127,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -2149,7 +2149,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2169,7 +2169,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -2388,7 +2388,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -2410,7 +2410,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -2429,7 +2429,7 @@
jle .L75
ALIGN_4
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -2595,7 +2595,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
@@ -2630,7 +2630,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -2643,7 +2643,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2671,7 +2671,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -2698,7 +2698,7 @@
jle .L95
ALIGN_4
-.L92:
+.L92:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -2975,7 +2975,7 @@
decq I
BRANCH
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -2997,7 +2997,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -3016,7 +3016,7 @@
jle .L105
ALIGN_4
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -3196,13 +3196,13 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
BRANCH
jle .L119
-
+
#ifdef LN
movq K, %rax
salq $BASE_SHIFT, %rax
@@ -3217,7 +3217,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movsd -16 * SIZE(BO), %xmm2
@@ -3236,7 +3236,7 @@
jle .L115
ALIGN_4
-.L112:
+.L112:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulsd %xmm0, %xmm2
@@ -3376,7 +3376,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S b/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S
index d50c8d5..583fb47 100644
--- a/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S
+++ b/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -99,7 +99,7 @@
#define PREFETCHSIZE (8 * 4 + 4)
#endif
-#ifdef OPTERON
+#ifdef OPTERON
#define movsd movlpd
#endif
@@ -216,10 +216,10 @@
movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
addpd %xmm14, %xmm7 ;\
movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
-
+
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -288,7 +288,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -307,7 +307,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -323,7 +323,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 4), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -341,7 +341,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCHNTA 40 * SIZE(B)
@@ -410,7 +410,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -454,7 +454,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(BO), %xmm9
movapd 2 * SIZE(BO), %xmm11
@@ -490,7 +490,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -576,7 +576,7 @@
sarq $3, %rax
je .L15
-.L12:
+.L12:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -1159,7 +1159,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $3, M
@@ -1189,7 +1189,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1626,7 +1626,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1653,7 +1653,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1995,8 +1995,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2032,12 +2032,12 @@
.L41:
/* Copying to Sub Buffer */
-
+
#ifdef LN
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -2053,7 +2053,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2069,7 +2069,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
PREFETCH 56 * SIZE(B)
@@ -2132,7 +2132,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2176,7 +2176,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2618,7 +2618,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -2645,7 +2645,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2927,7 +2927,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -2954,7 +2954,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3185,8 +3185,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -3221,7 +3221,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -3237,7 +3237,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -3253,7 +3253,7 @@
sarq $3, %rax
jle .L83
ALIGN_4
-
+
.L82:
PREFETCH 56 * SIZE(B)
@@ -3313,7 +3313,7 @@
decq %rax
jne .L84
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movq A, AO
@@ -3355,7 +3355,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3670,7 +3670,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -3697,7 +3697,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3907,7 +3907,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -3934,7 +3934,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -4113,7 +4113,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -4137,7 +4137,7 @@
#endif
ALIGN_4
-
+
.L999:
movq %rbx, %rsp
diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S b/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S
index 266f442..3856c72 100644
--- a/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S
+++ b/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -333,7 +333,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -389,7 +389,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -401,7 +401,7 @@
sarq $2, J # j = (n >> 2)
jle .L40
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -413,7 +413,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -428,7 +428,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 2, %rax
@@ -459,7 +459,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -498,7 +498,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -1369,7 +1369,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1748,7 +1748,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1771,7 +1771,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2067,8 +2067,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2105,7 +2105,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -2120,7 +2120,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 1, %rax
@@ -2151,7 +2151,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -2557,7 +2557,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -2579,7 +2579,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2826,7 +2826,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -2848,7 +2848,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3043,8 +3043,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -3079,7 +3079,7 @@
movq K, %rax
salq $0 + BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -3093,7 +3093,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -3120,7 +3120,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3415,7 +3415,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -3437,7 +3437,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3631,7 +3631,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -3653,7 +3653,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm9
pxor %xmm0, %xmm0
@@ -3806,7 +3806,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -3826,7 +3826,7 @@
subq $1, KK
#endif
ALIGN_2
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S
index 917f8f9..28f38bd 100644
--- a/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S
+++ b/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -94,7 +94,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -180,7 +180,7 @@
movq K, %rax
salq $3 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 8), %rax
subq %rax, C
#endif
@@ -195,7 +195,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -227,7 +227,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
prefetchnta -32 * SIZE(BB)
subq $-16 * SIZE, BB
@@ -269,7 +269,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm12
@@ -1071,7 +1071,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -1092,7 +1092,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1117,7 +1117,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -1262,7 +1262,7 @@
movaps %xmm8, %xmm4
shufps $0x88, %xmm9, %xmm8
shufps $0xdd, %xmm9, %xmm4
-
+
movaps %xmm10, %xmm5
shufps $0x88, %xmm11, %xmm10
shufps $0xdd, %xmm11, %xmm5
@@ -1685,7 +1685,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1706,7 +1706,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -1725,7 +1725,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -2195,8 +2195,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2233,7 +2233,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -2248,7 +2248,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2276,7 +2276,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -2304,7 +2304,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -2713,7 +2713,7 @@
decq I
BRANCH
jg .L41
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -2734,7 +2734,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2754,7 +2754,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -2852,7 +2852,7 @@
movaps %xmm8, %xmm4
shufps $0x88, %xmm9, %xmm8
shufps $0xdd, %xmm9, %xmm4
-
+
movaps -32 * SIZE(BO), %xmm0
movaps -28 * SIZE(BO), %xmm1
@@ -3031,7 +3031,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -3052,7 +3052,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -3070,7 +3070,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -3301,8 +3301,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L69:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -3335,7 +3335,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -3350,7 +3350,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -3378,7 +3378,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -3401,7 +3401,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -3682,7 +3682,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -3703,7 +3703,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -3723,7 +3723,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
addps %xmm1, %xmm8
movsd -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -3803,7 +3803,7 @@
#if defined(LN) || defined(LT)
pshufd $0xd8, %xmm8, %xmm8
-
+
movaps -32 * SIZE(BO), %xmm0
#else
movaps -32 * SIZE(AO), %xmm0
@@ -3908,7 +3908,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -3929,7 +3929,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -3947,7 +3947,7 @@
jle .L95
ALIGN_3
-.L92:
+.L92:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movsd -32 * SIZE(BO), %xmm2
@@ -4110,8 +4110,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -4144,7 +4144,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -4157,7 +4157,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -4185,7 +4185,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -4204,7 +4204,7 @@
jle .L105
ALIGN_3
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -4434,7 +4434,7 @@
decq I
BRANCH
jg .L101
- ALIGN_4
+ ALIGN_4
.L110:
testq $2, M
@@ -4455,7 +4455,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -4472,7 +4472,7 @@
jle .L115
ALIGN_3
-.L112:
+.L112:
addps %xmm1, %xmm8
movss -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -4655,7 +4655,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movss -32 * SIZE(AO), %xmm0
@@ -4672,7 +4672,7 @@
jle .L125
ALIGN_3
-.L122:
+.L122:
addss %xmm2, %xmm8
movss -32 * SIZE(BO), %xmm2
mulss %xmm0, %xmm2
@@ -4798,8 +4798,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L129:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S
index 7727fd5..887c071 100644
--- a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S
+++ b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -53,7 +53,7 @@
#define BO %r14
#define CO1 %r15
#define CO2 %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -107,11 +107,11 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
EMMS
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -179,7 +179,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -198,10 +198,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $2 + BASE_SHIFT, %rax
@@ -214,7 +214,7 @@
salq $2 + BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -230,7 +230,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -312,7 +312,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -356,7 +356,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(BO), %xmm9
movaps 4 * SIZE(BO), %xmm11
@@ -390,7 +390,7 @@
sarq $2, %rax
je .L15
ALIGN_4
-
+
.L12:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
@@ -1202,7 +1202,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -1227,7 +1227,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1723,7 +1723,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -1748,7 +1748,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 8 * SIZE(AO), %xmm10
@@ -2171,7 +2171,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -2195,7 +2195,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -2574,8 +2574,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2608,10 +2608,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + BASE_SHIFT, %rax
@@ -2624,7 +2624,7 @@
salq $1 + BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2640,7 +2640,7 @@
sarq $2, %rax
jle .L53
ALIGN_4
-
+
.L52:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -2697,7 +2697,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2740,7 +2740,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -3444,7 +3444,7 @@
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -3469,7 +3469,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -3833,7 +3833,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -3858,7 +3858,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 8 * SIZE(AO), %xmm10
@@ -4152,7 +4152,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -4176,7 +4176,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -4426,8 +4426,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -4458,10 +4458,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $BASE_SHIFT, %rax
@@ -4474,7 +4474,7 @@
salq $BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -4547,7 +4547,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
#if defined(LT) || defined(RN)
movq A, AO
@@ -4589,7 +4589,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -5145,7 +5145,7 @@
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -5170,7 +5170,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -5455,7 +5455,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -5480,7 +5480,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 8 * SIZE(AO), %xmm10
@@ -5703,7 +5703,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -5727,7 +5727,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -5894,7 +5894,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L149:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S
index 8c7f92f..b8e75e7 100644
--- a/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S
+++ b/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -94,7 +94,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -178,7 +178,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -191,7 +191,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -219,7 +219,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -242,7 +242,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -403,7 +403,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L80:
testq $1, M
@@ -425,7 +425,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movhps -15 * SIZE(AO), %xmm0
@@ -445,7 +445,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
mulpd %xmm0, %xmm1
movsd -14 * SIZE(AO), %xmm0
movhps -13 * SIZE(AO), %xmm0
@@ -566,7 +566,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L89:
#ifdef LN
@@ -601,7 +601,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -616,7 +616,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -644,7 +644,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -668,7 +668,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -901,7 +901,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -919,7 +919,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
mulpd %xmm0, %xmm1
movddup -15 * SIZE(AO), %xmm0
addpd %xmm1, %xmm8
@@ -1075,7 +1075,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L69:
#ifdef LN
@@ -1110,7 +1110,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -1125,7 +1125,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1153,7 +1153,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -1181,7 +1181,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -1522,7 +1522,7 @@
decq I
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1544,7 +1544,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1564,7 +1564,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -1790,7 +1790,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L49:
#ifdef LN
@@ -1828,7 +1828,7 @@
movq K, %rax
salq $3 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 8), %rax
subq %rax, C
#endif
@@ -1843,7 +1843,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1875,7 +1875,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -1918,7 +1918,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm12
@@ -2546,7 +2546,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -2568,7 +2568,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -2588,7 +2588,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -3024,7 +3024,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L29:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_RT_4x2_atom.S b/kernel/x86_64/trsm_kernel_RT_4x2_atom.S
index ae49c38..9b5a937 100644
--- a/kernel/x86_64/trsm_kernel_RT_4x2_atom.S
+++ b/kernel/x86_64/trsm_kernel_RT_4x2_atom.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -90,7 +90,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -146,7 +146,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -168,7 +168,7 @@
movq K, %rax
salq $0 + BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -181,7 +181,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -208,7 +208,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm9, %xmm9
@@ -510,7 +510,7 @@
decq I # i --
jg .L41
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -530,7 +530,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -723,7 +723,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -743,7 +743,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm5, %xmm5
@@ -897,7 +897,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L69:
#ifdef LN
@@ -923,7 +923,7 @@
sarq $1, J
jle .L999
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -935,7 +935,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -950,7 +950,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 1, %rax
@@ -981,7 +981,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -1027,7 +1027,7 @@
addsd %xmm7, %xmm14
movsd 3 * SIZE(AO), %xmm7
mulsd %xmm3, %xmm2
-
+
addsd %xmm6, %xmm15
PREFETCH (PREFETCHSIZE + 0) * SIZE(BO)
movaps %xmm4, %xmm6
@@ -1540,7 +1540,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -1846,7 +1846,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1867,7 +1867,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm7, %xmm7
@@ -2064,8 +2064,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S
index 400f60e..08e92dc 100644
--- a/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S
+++ b/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -300,7 +300,7 @@
movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\
movapd %xmm0, %xmm2 ;\
addq $8 * SIZE, %rax
-
+
#define KERNEL_SUB1(xx) \
mulpd %xmm1, %xmm0 ;\
mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\
@@ -405,7 +405,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
@@ -470,7 +470,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -504,7 +504,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -535,7 +535,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -804,13 +804,13 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
je .L110
-#ifdef LN
+#ifdef LN
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, AORIG
@@ -828,7 +828,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm0
pxor %xmm8, %xmm8
@@ -1016,7 +1016,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -1040,7 +1040,7 @@
#if defined(LN) || defined(RT)
movq KK, %rax
leaq (BO, %rax, SIZE), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1175,7 +1175,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -1195,7 +1195,7 @@
#endif
ALIGN_4
-.L40:
+.L40:
testq $2, N
je .L80
@@ -1224,7 +1224,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1256,7 +1256,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm1
movddup -15 * SIZE(BO), %xmm5
@@ -1597,7 +1597,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1622,7 +1622,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1851,7 +1851,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1878,7 +1878,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 1), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2052,8 +2052,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2104,7 +2104,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 2, %rax
@@ -2141,7 +2141,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
movddup -16 * SIZE(BO), %xmm1
@@ -2683,7 +2683,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $3, M
@@ -2713,7 +2713,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -3050,7 +3050,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -3075,7 +3075,7 @@
movq KK, %rax
leaq (, %rax, SIZE), %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -3340,8 +3340,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_core2.S b/kernel/x86_64/trsm_kernel_RT_4x4_core2.S
index 89d07ce..64e0342 100644
--- a/kernel/x86_64/trsm_kernel_RT_4x4_core2.S
+++ b/kernel/x86_64/trsm_kernel_RT_4x4_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -83,7 +83,7 @@
#define AORIG 32(%rsp)
#define BORIG 40(%rsp)
#define BUFFER 128(%rsp)
-
+
#define PREFETCH_R (8 * 4 + 0)
#define PREFETCH_W (PREFETCH_R)
@@ -92,7 +92,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -161,7 +161,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -180,7 +180,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -196,7 +196,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -212,7 +212,7 @@
sarq $3, %rax
jle .L83
ALIGN_4
-
+
.L82:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -260,7 +260,7 @@
subq $1, %rax
jne .L84
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movq A, AO
@@ -302,7 +302,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
#ifdef LN
@@ -605,7 +605,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -632,7 +632,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -833,7 +833,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -860,7 +860,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1030,7 +1030,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -1061,12 +1061,12 @@
.L41:
/* Copying to Sub Buffer */
-
+
#ifdef LN
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -1082,7 +1082,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1098,7 +1098,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
movddup -16 * SIZE(B), %xmm0
movddup -15 * SIZE(B), %xmm1
@@ -1148,7 +1148,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1192,7 +1192,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1576,7 +1576,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1603,7 +1603,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1848,7 +1848,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1875,7 +1875,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2081,8 +2081,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2117,7 +2117,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq 16 * SIZE + BUFFER, BO
@@ -2133,7 +2133,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 4), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -2149,7 +2149,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
movapd -16 * SIZE(B), %xmm0
@@ -2178,7 +2178,7 @@
unpckhpd %xmm6, %xmm6
movddup %xmm7, %xmm15
unpckhpd %xmm7, %xmm7
-
+
prefetcht0 (PREFETCH_W + 0) * SIZE(BO)
movapd %xmm8, -16 * SIZE(BO)
movapd %xmm0, -14 * SIZE(BO)
@@ -2240,7 +2240,7 @@
subq $1, %rax
jne .L04
ALIGN_4
-
+
.L10:
leaq (PREFETCH_R + 0) * SIZE(B), BB
@@ -2286,7 +2286,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2332,7 +2332,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm10
@@ -2957,7 +2957,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $3, M
@@ -2987,7 +2987,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -3353,7 +3353,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -3380,7 +3380,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -3679,8 +3679,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S b/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S
index a575d4c..f95200a 100644
--- a/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S
+++ b/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -95,7 +95,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -177,7 +177,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -190,7 +190,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -218,7 +218,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -245,7 +245,7 @@
jle .L95
ALIGN_4
-.L92:
+.L92:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -522,7 +522,7 @@
decq I
BRANCH
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -544,7 +544,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -563,7 +563,7 @@
jle .L105
ALIGN_4
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -743,13 +743,13 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
BRANCH
jle .L119
-
+
#ifdef LN
movq K, %rax
salq $BASE_SHIFT, %rax
@@ -764,7 +764,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movsd -16 * SIZE(BO), %xmm2
@@ -783,7 +783,7 @@
jle .L115
ALIGN_4
-.L112:
+.L112:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulsd %xmm0, %xmm2
@@ -923,7 +923,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -957,7 +957,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -972,7 +972,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 1, %rax
@@ -983,7 +983,7 @@
movq OFFSET, %rax
movq %rax, KK
#endif
-
+
movq M, I
sarq $2, I # i = (m >> 2)
NOBRANCH
@@ -1005,7 +1005,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -1041,7 +1041,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
mulpd %xmm0, %xmm2
@@ -1380,7 +1380,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1402,7 +1402,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1422,7 +1422,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -1641,7 +1641,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1663,7 +1663,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -1682,7 +1682,7 @@
jle .L75
ALIGN_4
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -1848,7 +1848,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
@@ -1886,7 +1886,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -1901,7 +1901,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
salq $BASE_SHIFT + 2, %rax
@@ -1934,7 +1934,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -1989,7 +1989,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
addpd %xmm3, %xmm11
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -14 * SIZE(BO), %xmm3
@@ -2719,7 +2719,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -2741,7 +2741,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm3, %xmm3
@@ -2764,7 +2764,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
addpd %xmm3, %xmm11
movaps -14 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -3095,7 +3095,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -3117,7 +3117,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -3139,7 +3139,7 @@
jle .L35
ALIGN_4
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -3373,7 +3373,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S b/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S
index 07c978e..49a5fe6 100644
--- a/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S
+++ b/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -99,7 +99,7 @@
#define PREFETCHSIZE (8 * 4 + 4)
#endif
-#ifdef OPTERON
+#ifdef OPTERON
#define movsd movlpd
#endif
@@ -216,10 +216,10 @@
movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
addpd %xmm14, %xmm7 ;\
movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
-
+
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -288,7 +288,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -307,7 +307,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -323,7 +323,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -339,7 +339,7 @@
sarq $3, %rax
jle .L83
ALIGN_4
-
+
.L82:
PREFETCH 56 * SIZE(B)
@@ -399,7 +399,7 @@
decq %rax
jne .L84
ALIGN_4
-
+
.L90:
#if defined(LT) || defined(RN)
movq A, AO
@@ -441,7 +441,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -756,7 +756,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -783,7 +783,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -993,7 +993,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -1020,7 +1020,7 @@
movq KK, %rax
salq $0 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1199,7 +1199,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -1231,12 +1231,12 @@
.L41:
/* Copying to Sub Buffer */
-
+
#ifdef LN
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -1252,7 +1252,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1268,7 +1268,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
PREFETCH 56 * SIZE(B)
@@ -1331,7 +1331,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1375,7 +1375,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1817,7 +1817,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1844,7 +1844,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2126,7 +2126,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -2153,7 +2153,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2384,8 +2384,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2420,7 +2420,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -2436,7 +2436,7 @@
leaq (, %rax, SIZE), %rax
leaq (B, %rax, 4), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2454,7 +2454,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCHNTA 40 * SIZE(B)
@@ -2523,7 +2523,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2567,7 +2567,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(BO), %xmm9
movapd 2 * SIZE(BO), %xmm11
@@ -2601,7 +2601,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -3240,7 +3240,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $3, M
@@ -3270,7 +3270,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3707,7 +3707,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -3734,7 +3734,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -4076,8 +4076,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S b/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S
index f0e8bf9..b6c56e0 100644
--- a/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S
+++ b/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -331,7 +331,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -387,7 +387,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -409,7 +409,7 @@
movq K, %rax
salq $0 + BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -423,7 +423,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -450,7 +450,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -745,7 +745,7 @@
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -767,7 +767,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -961,7 +961,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -983,7 +983,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1136,7 +1136,7 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L119:
#ifdef LN
@@ -1172,7 +1172,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -1187,7 +1187,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1214,7 +1214,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1617,7 +1617,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1639,7 +1639,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1886,7 +1886,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1908,7 +1908,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2103,8 +2103,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2129,7 +2129,7 @@
sarq $2, J # j = (n >> 2)
jle .L999
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2141,7 +2141,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -2156,7 +2156,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2184,7 +2184,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2220,7 +2220,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -3093,7 +3093,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3472,7 +3472,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -3495,7 +3495,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -3791,8 +3791,8 @@
salq $0 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -3814,7 +3814,7 @@
jg .L10
ALIGN_4
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S
index ffac798..4942f46 100644
--- a/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S
+++ b/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -94,7 +94,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -176,7 +176,7 @@
movq K, %rax
salq $BASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -189,7 +189,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -217,7 +217,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -236,7 +236,7 @@
jle .L105
ALIGN_3
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -466,7 +466,7 @@
decq I
BRANCH
jg .L101
- ALIGN_4
+ ALIGN_4
.L110:
testq $2, M
@@ -487,7 +487,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -504,7 +504,7 @@
jle .L115
ALIGN_3
-.L112:
+.L112:
addps %xmm1, %xmm8
movss -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -687,7 +687,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movss -32 * SIZE(AO), %xmm0
@@ -704,7 +704,7 @@
jle .L125
ALIGN_3
-.L122:
+.L122:
addss %xmm2, %xmm8
movss -32 * SIZE(BO), %xmm2
mulss %xmm0, %xmm2
@@ -830,8 +830,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L129:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -864,7 +864,7 @@
movq K, %rax
salq $1 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -879,7 +879,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -907,7 +907,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -930,7 +930,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -1211,7 +1211,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -1232,7 +1232,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1252,7 +1252,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
addps %xmm1, %xmm8
movsd -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -1332,7 +1332,7 @@
#if defined(LN) || defined(LT)
pshufd $0xd8, %xmm8, %xmm8
-
+
movaps -32 * SIZE(BO), %xmm0
#else
movaps -32 * SIZE(AO), %xmm0
@@ -1437,7 +1437,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -1458,7 +1458,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -1476,7 +1476,7 @@
jle .L95
ALIGN_3
-.L92:
+.L92:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movsd -32 * SIZE(BO), %xmm2
@@ -1639,8 +1639,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1673,7 +1673,7 @@
movq K, %rax
salq $2 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -1688,7 +1688,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1716,7 +1716,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -1744,7 +1744,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -2153,7 +2153,7 @@
decq I
BRANCH
jg .L41
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -2174,7 +2174,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2194,7 +2194,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -2292,7 +2292,7 @@
movaps %xmm8, %xmm4
shufps $0x88, %xmm9, %xmm8
shufps $0xdd, %xmm9, %xmm4
-
+
movaps -32 * SIZE(BO), %xmm0
movaps -28 * SIZE(BO), %xmm1
@@ -2471,7 +2471,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -2492,7 +2492,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -2510,7 +2510,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -2741,8 +2741,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L69:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2779,7 +2779,7 @@
movq K, %rax
salq $3 + BASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 8), %rax
subq %rax, C
#endif
@@ -2794,7 +2794,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -2826,7 +2826,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
prefetchnta -32 * SIZE(BB)
subq $-16 * SIZE, BB
@@ -2868,7 +2868,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm12
@@ -3670,7 +3670,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -3691,7 +3691,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -3716,7 +3716,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -3861,7 +3861,7 @@
movaps %xmm8, %xmm4
shufps $0x88, %xmm9, %xmm8
shufps $0xdd, %xmm9, %xmm4
-
+
movaps %xmm10, %xmm5
shufps $0x88, %xmm11, %xmm10
shufps $0xdd, %xmm11, %xmm5
@@ -4284,7 +4284,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -4305,7 +4305,7 @@
leaq (B, %rax, 8), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -4324,7 +4324,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -4794,8 +4794,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S
index 6993649..c854b93 100644
--- a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S
+++ b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -53,7 +53,7 @@
#define BO %r14
#define CO1 %r15
#define CO2 %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -107,7 +107,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -177,7 +177,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -192,10 +192,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $BASE_SHIFT, %rax
@@ -208,7 +208,7 @@
salq $BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -281,7 +281,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
#if defined(LT) || defined(RN)
movq A, AO
@@ -323,7 +323,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -879,7 +879,7 @@
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -904,7 +904,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1189,7 +1189,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -1214,7 +1214,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
movhps 2 * SIZE(AO), %xmm8
@@ -1439,7 +1439,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -1463,7 +1463,7 @@
movq KK, %rax
salq $BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -1630,7 +1630,7 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L149:
#ifdef LN
@@ -1662,10 +1662,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + BASE_SHIFT, %rax
@@ -1678,7 +1678,7 @@
salq $1 + BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -1694,7 +1694,7 @@
sarq $2, %rax
jle .L53
ALIGN_4
-
+
.L52:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -1751,7 +1751,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1794,7 +1794,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -2498,7 +2498,7 @@
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -2523,7 +2523,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -2887,7 +2887,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -2912,7 +2912,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef movsd
xorps %xmm8, %xmm8
@@ -3224,7 +3224,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -3248,7 +3248,7 @@
movq KK, %rax
salq $1 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -3498,8 +3498,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -3534,10 +3534,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $2 + BASE_SHIFT, %rax
@@ -3550,7 +3550,7 @@
salq $2 + BASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -3566,7 +3566,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -3648,7 +3648,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -3692,7 +3692,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(BO), %xmm9
movaps 4 * SIZE(BO), %xmm11
@@ -3726,7 +3726,7 @@
sarq $2, %rax
je .L15
ALIGN_4
-
+
.L12:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
@@ -4538,7 +4538,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -4563,7 +4563,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -5059,7 +5059,7 @@
salq $2 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -5084,7 +5084,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
#ifdef movsd
xorps %xmm8, %xmm8
@@ -5513,7 +5513,7 @@
salq $1 + BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -5537,7 +5537,7 @@
movq KK, %rax
salq $2 + BASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -5916,8 +5916,8 @@
salq $BASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/xdot.S b/kernel/x86_64/xdot.S
index 966b499..ea97164 100644
--- a/kernel/x86_64/xdot.S
+++ b/kernel/x86_64/xdot.S
@@ -41,7 +41,7 @@
#define STACK 12
#define ARGS 0
-
+
#define RESULT 4 + STACK + ARGS(%esp)
#define STACK_N 8 + STACK + ARGS(%esp)
#define STACK_X 12 + STACK + ARGS(%esp)
diff --git a/kernel/x86_64/xgemm3m_kernel_2x2.S b/kernel/x86_64/xgemm3m_kernel_2x2.S
index 6d116a1..843fc24 100644
--- a/kernel/x86_64/xgemm3m_kernel_2x2.S
+++ b/kernel/x86_64/xgemm3m_kernel_2x2.S
@@ -46,7 +46,7 @@
#define B ARG5
#define C ARG6
#define LDC %r10
-
+
#define I %r12
#define J %r13
#define AO %r14
@@ -77,7 +77,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -93,10 +93,10 @@
negq %rax
movq %rax, KK
#endif
-
+
addq $8 * SIZE, A
addq $8 * SIZE, B
-
+
salq $ZBASE_SHIFT, LDC
movq N, %rax
@@ -109,7 +109,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq A, AO
@@ -132,7 +132,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
fldz
fldz
@@ -152,7 +152,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -178,7 +178,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -196,7 +196,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -216,7 +216,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -234,7 +234,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -270,7 +270,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -379,7 +379,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq ( B, %rax, 2), BO
-#endif
+#endif
fldz
fldz
@@ -389,7 +389,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -542,13 +542,13 @@
.L30:
movq N, %rax
- testq $1, %rax
+ testq $1, %rax
je .L999
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq A, AO
@@ -570,7 +570,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq ( B, %rax, 1), BO
-#endif
+#endif
fldz
fldz
@@ -586,7 +586,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -741,7 +741,7 @@
salq $BASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq ( B, %rax, 1), BO
-#endif
+#endif
fldz
@@ -750,7 +750,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
diff --git a/kernel/x86_64/xgemm_kernel_1x1.S b/kernel/x86_64/xgemm_kernel_1x1.S
index 164e618..e0cd1f1 100644
--- a/kernel/x86_64/xgemm_kernel_1x1.S
+++ b/kernel/x86_64/xgemm_kernel_1x1.S
@@ -46,7 +46,7 @@
#define B ARG5
#define C ARG6
#define LDC %r10
-
+
#define I %r12
#define J %r13
#define AO %r14
@@ -96,7 +96,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -112,15 +112,15 @@
negq %rax
movq %rax, KK
#endif
-
+
addq $8 * SIZE, A
addq $8 * SIZE, B
-
+
salq $ZBASE_SHIFT, LDC
cmpq $0, M
jle .L999
-
+
movq N, %rax
movq %rax, J
testq %rax, %rax
@@ -131,7 +131,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq A, AO
@@ -151,7 +151,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
fldz
fldz
@@ -169,7 +169,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -195,7 +195,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -213,7 +213,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -233,7 +233,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -251,7 +251,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -287,7 +287,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -332,7 +332,7 @@
FST 1 * SIZE(CO)
FST 0 * SIZE(CO)
#endif
-
+
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
diff --git a/kernel/x86_64/xgemv_n.S b/kernel/x86_64/xgemv_n.S
index db6d80a..cbde640 100644
--- a/kernel/x86_64/xgemv_n.S
+++ b/kernel/x86_64/xgemv_n.S
@@ -41,9 +41,9 @@
#include "l2param.h"
#define P 32
-
+
#define STACKSIZE 80
-
+
#define ALPHA_R 8 + STACKSIZE(%rsp)
#define ALPHA_I 24 + STACKSIZE(%rsp)
#define OLD_INCX 40 + STACKSIZE(%rsp)
@@ -71,7 +71,7 @@
#define Y1 %r14
#define XP %r15
#define MIN_N %rbx
-
+
PROLOGUE
PROFCODE
diff --git a/kernel/x86_64/xgemv_t.S b/kernel/x86_64/xgemv_t.S
index c09dcf0..31320f6 100644
--- a/kernel/x86_64/xgemv_t.S
+++ b/kernel/x86_64/xgemv_t.S
@@ -42,7 +42,7 @@
#define STACKSIZE 80
#define P 4096
-
+
#define ALPHA_R 8 + STACKSIZE(%rsp)
#define ALPHA_I 24 + STACKSIZE(%rsp)
#define OLD_INCX 40 + STACKSIZE(%rsp)
@@ -70,7 +70,7 @@
#define X1 %r13
#define Y1 %r14
#define MIN_M %rbx
-
+
PROLOGUE
PROFCODE
diff --git a/kernel/x86_64/xtrsm_kernel_LT_1x1.S b/kernel/x86_64/xtrsm_kernel_LT_1x1.S
index 86d4a74..a61a240 100644
--- a/kernel/x86_64/xtrsm_kernel_LT_1x1.S
+++ b/kernel/x86_64/xtrsm_kernel_LT_1x1.S
@@ -46,7 +46,7 @@
#define B ARG5
#define C ARG6
#define LDC %r10
-
+
#define I %r12
#define J %r13
#define AO %r14
@@ -59,7 +59,7 @@
#define KK %r11
#define AORIG 48(%rsp)
-
+
#ifdef OPTERON
#define PREFETCH prefetch
#define PREFETCHW prefetchw
@@ -89,7 +89,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -104,7 +104,7 @@
addq $8 * SIZE, A
addq $8 * SIZE, B
-
+
#ifdef LN
movq M, %rax
salq $ZBASE_SHIFT, %rax
@@ -128,7 +128,7 @@
movq OFFSET, %rax
negq %rax
movq %rax, KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -138,7 +138,7 @@
cmpq $0, M
jle .L999
-
+
movq N, %rax
movq %rax, J
testq %rax, %rax
@@ -170,7 +170,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, %rax
@@ -194,7 +194,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
fldz
fldz
@@ -229,7 +229,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -247,7 +247,7 @@
FLD -5 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -5 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -267,7 +267,7 @@
FLD -3 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -3 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -285,7 +285,7 @@
FLD -1 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -1 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -322,7 +322,7 @@
FLD -7 * SIZE(BO)
fmul %st, %st(2)
-
+
FLD -7 * SIZE(AO)
fmul %st, %st(2)
fmulp %st, %st(1)
@@ -341,7 +341,7 @@
.L18:
faddp %st, %st(3)
faddp %st, %st(1)
-
+
fxch %st(1)
#if defined(LN) || defined(RT)
@@ -421,7 +421,7 @@
FST 0 * SIZE(CO)
FST 1 * SIZE(CO)
-
+
#ifndef LN
addq $2 * SIZE, CO
#endif
diff --git a/kernel/x86_64/zamax.S b/kernel/x86_64/zamax.S
index 21d96b6..74e127e 100644
--- a/kernel/x86_64/zamax.S
+++ b/kernel/x86_64/zamax.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1
#define X ARG2
#define INCX ARG3
@@ -67,9 +67,9 @@
ffreep %st
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
addq INCX, X
decq M
@@ -82,16 +82,16 @@
sarq $2, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -99,9 +99,9 @@
ffreep %st
FLD 2 * SIZE(X)
- fabs
+ fabs
FLD 3 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -109,9 +109,9 @@
ffreep %st
FLD 4 * SIZE(X)
- fabs
+ fabs
FLD 5 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -119,9 +119,9 @@
ffreep %st
FLD 6 * SIZE(X)
- fabs
+ fabs
FLD 7 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -143,9 +143,9 @@
.L21:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
@@ -163,12 +163,12 @@
sarq $2, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addq INCX, X
faddp %st, %st(1)
fcomi
@@ -177,9 +177,9 @@
ffreep %st
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addq INCX, X
faddp %st, %st(1)
fcomi
@@ -188,9 +188,9 @@
ffreep %st
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addq INCX, X
faddp %st, %st(1)
fcomi
@@ -199,9 +199,9 @@
ffreep %st
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
addq INCX, X
faddp %st, %st(1)
fcomi
@@ -221,9 +221,9 @@
.L61:
FLD 0 * SIZE(X)
- fabs
+ fabs
FLD 1 * SIZE(X)
- fabs
+ fabs
faddp %st, %st(1)
fcomi
FMOV %st(1), %st(0)
diff --git a/kernel/x86_64/zamax_atom.S b/kernel/x86_64/zamax_atom.S
index 3f67574..8b4e144 100644
--- a/kernel/x86_64/zamax_atom.S
+++ b/kernel/x86_64/zamax_atom.S
@@ -38,13 +38,13 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#ifdef USE_MIN
#define maxsd minsd
#endif
@@ -103,7 +103,7 @@
decq I
jle .L13
ALIGN_4
-
+
.L12:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -183,7 +183,7 @@
maxsd %xmm5, %xmm0
maxsd %xmm7, %xmm1
- ALIGN_3
+ ALIGN_3
.L17:
testq $1, M
@@ -225,7 +225,7 @@
decq I
jle .L23
ALIGN_4
-
+
.L22:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -307,7 +307,7 @@
maxsd %xmm5, %xmm0
maxsd %xmm7, %xmm1
- ALIGN_3
+ ALIGN_3
.L27:
testq $1, M
diff --git a/kernel/x86_64/zamax_sse.S b/kernel/x86_64/zamax_sse.S
index 5566a35..5f8a1f1 100644
--- a/kernel/x86_64/zamax_sse.S
+++ b/kernel/x86_64/zamax_sse.S
@@ -38,18 +38,18 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#ifdef USE_MIN
#define maxps minps
#define maxss minss
#endif
-
+
#include "l1param.h"
PROLOGUE
@@ -83,7 +83,7 @@
sarq $3, I
jle .L35
ALIGN_4
-
+
.L31:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -164,7 +164,7 @@
maxss %xmm4, %xmm0
maxss %xmm6, %xmm1
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L37:
testq $1, M
@@ -185,7 +185,7 @@
sarq $3, I
jle .L45
ALIGN_4
-
+
.L41:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -256,7 +256,7 @@
andps %xmm15, %xmm6
addps %xmm6, %xmm4
maxps %xmm4, %xmm0
- ALIGN_3
+ ALIGN_3
.L46:
testq $2, M
@@ -277,7 +277,7 @@
maxss %xmm4, %xmm0
maxss %xmm6, %xmm1
ALIGN_3
-
+
.L47:
testq $1, M
je .L998
diff --git a/kernel/x86_64/zamax_sse2.S b/kernel/x86_64/zamax_sse2.S
index eb8fd43..bde290b 100644
--- a/kernel/x86_64/zamax_sse2.S
+++ b/kernel/x86_64/zamax_sse2.S
@@ -38,13 +38,13 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#ifdef USE_MIN
#define maxpd minpd
#define maxsd minsd
@@ -184,7 +184,7 @@
andpd %xmm15, %xmm5
addpd %xmm5, %xmm4
maxpd %xmm4, %xmm0
- ALIGN_3
+ ALIGN_3
.L37:
testq $1, M
@@ -205,7 +205,7 @@
sarq $3, I
jle .L45
ALIGN_4
-
+
.L41:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -309,7 +309,7 @@
andpd %xmm15, %xmm5
addpd %xmm5, %xmm4
maxpd %xmm4, %xmm2
- ALIGN_3
+ ALIGN_3
.L47:
testq $1, M
diff --git a/kernel/x86_64/zasum.S b/kernel/x86_64/zasum.S
index b94e49b..c372fc5 100644
--- a/kernel/x86_64/zasum.S
+++ b/kernel/x86_64/zasum.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1
#define X ARG2
#define INCX ARG3
@@ -68,7 +68,7 @@
sarq $2, I
jle .L20
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -132,7 +132,7 @@
sarq $2, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
fabs
diff --git a/kernel/x86_64/zasum_atom.S b/kernel/x86_64/zasum_atom.S
index ab83809..888dbbb 100644
--- a/kernel/x86_64/zasum_atom.S
+++ b/kernel/x86_64/zasum_atom.S
@@ -38,20 +38,20 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define I %rax
-
+
#include "l1param.h"
PROLOGUE
PROFCODE
SAVEREGISTERS
-
+
xorps %xmm0, %xmm0
testq M, M
@@ -102,7 +102,7 @@
decq I
jle .L11
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -257,7 +257,7 @@
pshufd $0x4e, %xmm5, %xmm13
addsd %xmm5, %xmm2
addsd %xmm13, %xmm3
- ALIGN_3
+ ALIGN_3
.L14:
testq $2, M
@@ -270,8 +270,8 @@
pshufd $0x4e, %xmm4, %xmm5
addsd %xmm4, %xmm2
addsd %xmm5, %xmm3
- ALIGN_3
-
+ ALIGN_3
+
.L15:
testq $1, M
je .L998
@@ -303,7 +303,7 @@
decq I
jle .L23
ALIGN_4
-
+
.L22:
andps %xmm15, %xmm4
addq INCX, X
@@ -379,7 +379,7 @@
addsd %xmm6, %xmm2
andps %xmm15, %xmm7
addsd %xmm7, %xmm3
- ALIGN_3
+ ALIGN_3
.L26:
testq $1, M
@@ -404,7 +404,7 @@
.L999:
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/zasum_sse.S b/kernel/x86_64/zasum_sse.S
index 7f3d3d1..44d6da5 100644
--- a/kernel/x86_64/zasum_sse.S
+++ b/kernel/x86_64/zasum_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -51,7 +51,7 @@
PROFCODE
SAVEREGISTERS
-
+
pxor %xmm0, %xmm0
testq M, M
jle .L999
@@ -64,7 +64,7 @@
pcmpeqb %xmm15, %xmm15
psrld $1, %xmm15
-
+
salq $ZBASE_SHIFT, INCX
cmpq $2 * SIZE, INCX
@@ -116,7 +116,7 @@
decq I
jle .L12
ALIGN_3
-
+
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -267,7 +267,7 @@
sarq $2, I
jle .L105
ALIGN_4
-
+
.L101:
movsd (X), %xmm4
addq INCX, X
@@ -314,19 +314,19 @@
#ifndef HAVE_SSE3
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
-
+
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
addss %xmm1, %xmm0
#else
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
-#endif
+#endif
ALIGN_4
.L999:
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/zasum_sse2.S b/kernel/x86_64/zasum_sse2.S
index 9d0ec2e..d1e076c 100644
--- a/kernel/x86_64/zasum_sse2.S
+++ b/kernel/x86_64/zasum_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -51,7 +51,7 @@
PROFCODE
SAVEREGISTERS
-
+
xorps %xmm0, %xmm0
testq M, M
jle .L999
@@ -105,7 +105,7 @@
decq I
jle .L11
ALIGN_4
-
+
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -213,7 +213,7 @@
addpd %xmm5, %xmm1
addq $4 * SIZE, X
- ALIGN_3
+ ALIGN_3
.L22:
testq $2, M
@@ -223,7 +223,7 @@
andps %xmm15, %xmm6
addpd %xmm6, %xmm3
addq $2 * SIZE, X
-
+
.L23:
testq $1, M
je .L998
@@ -243,7 +243,7 @@
sarq $2, I
jle .L60
ALIGN_4
-
+
.L50:
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
prefetcht0 PREFETCHSIZE * SIZE(X)
@@ -312,7 +312,7 @@
.L999:
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/zaxpy.S b/kernel/x86_64/zaxpy.S
index 266c147..1758ca9 100644
--- a/kernel/x86_64/zaxpy.S
+++ b/kernel/x86_64/zaxpy.S
@@ -68,10 +68,10 @@
salq $ZBASE_SHIFT, INCX
salq $ZBASE_SHIFT, INCY
-
+
testq M, M
jle .L40
-
+
cmpq $2 * SIZE, INCX
jne .L14
cmpq $2 * SIZE, INCY
@@ -332,5 +332,5 @@
ffreep %st(0)
ffreep %st(0)
ret
-
+
EPILOGUE
diff --git a/kernel/x86_64/zaxpy_atom.S b/kernel/x86_64/zaxpy_atom.S
index e623326..2fe2756 100644
--- a/kernel/x86_64/zaxpy_atom.S
+++ b/kernel/x86_64/zaxpy_atom.S
@@ -70,7 +70,7 @@
#endif
#else
movaps %xmm3, %xmm0
- movsd 40(%rsp), %xmm1
+ movsd 40(%rsp), %xmm1
movq 48(%rsp), X
movq 56(%rsp), INCX
@@ -79,7 +79,7 @@
#endif
SAVEREGISTERS
-
+
#ifndef CONJ
#define ADD1 subsd
#define ADD2 addsd
@@ -95,7 +95,7 @@
testq M, M
jle .L999
-
+
cmpq $2 * SIZE, INCX
jne .L20
cmpq $2 * SIZE, INCY
diff --git a/kernel/x86_64/zaxpy_sse.S b/kernel/x86_64/zaxpy_sse.S
index 42b920c..0a12e24 100644
--- a/kernel/x86_64/zaxpy_sse.S
+++ b/kernel/x86_64/zaxpy_sse.S
@@ -67,7 +67,7 @@
movq 8(%rsp), INCY
#else
movaps %xmm3, %xmm0
- movss 40(%rsp), %xmm1
+ movss 40(%rsp), %xmm1
movq 48(%rsp), X
movq 56(%rsp), INCX
@@ -76,13 +76,13 @@
#endif
SAVEREGISTERS
-
+
salq $ZBASE_SHIFT, INCX
salq $ZBASE_SHIFT, INCY
testq M, M
jle .L999
-
+
cmpq $2 * SIZE, INCX
jne .L100
cmpq $2 * SIZE, INCY
@@ -3113,7 +3113,7 @@
movsd %xmm8, (Y)
jmp .L999
ALIGN_3
-
+
.L200:
movq M, %rax
cmpq $0, %rax
@@ -3142,11 +3142,11 @@
movsd %xmm8, (Y)
addq INCY, Y
-
+
decq %rax
jg .L201
ALIGN_3
-
+
.L999:
xorq %rax, %rax
diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S
index 1b7e3a5..a7dd054 100644
--- a/kernel/x86_64/zaxpy_sse2.S
+++ b/kernel/x86_64/zaxpy_sse2.S
@@ -76,7 +76,7 @@
movq 8(%rsp), INCY
#else
movaps %xmm3, %xmm0
- movsd 40(%rsp), %xmm1
+ movsd 40(%rsp), %xmm1
movq 48(%rsp), X
movq 56(%rsp), INCX
@@ -85,18 +85,18 @@
#endif
SAVEREGISTERS
-
+
salq $ZBASE_SHIFT, INCX
salq $ZBASE_SHIFT, INCY
testq M, M
jle .L999
-
+
cmpq $2 * SIZE, INCX
jne .L50
cmpq $2 * SIZE, INCY
jne .L50
-
+
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
@@ -112,10 +112,10 @@
#endif
#ifndef CONJ
- shufps $0x0c, %xmm7, %xmm7
+ shufps $0x0c, %xmm7, %xmm7
xorpd %xmm7, ALPHA_I
#else
- shufps $0xc0, %xmm7, %xmm7
+ shufps $0xc0, %xmm7, %xmm7
xorpd %xmm7, ALPHA_R
#endif
@@ -1421,7 +1421,7 @@
je .L58
cmpq $0, INCY
je .L58
-
+
sarq $3, %rax
jle .L55
@@ -1775,7 +1775,7 @@
andq $1, %rax
jle .L999
-.L58:
+.L58:
MOVDDUP( 0 * SIZE, X, %xmm0)
MOVDDUP( 1 * SIZE, X, %xmm1)
@@ -1788,7 +1788,7 @@
movlpd %xmm8, 0 * SIZE(YY)
movhpd %xmm8, 1 * SIZE(YY)
-
+
decq %rax
jg .L58
ALIGN_3
diff --git a/kernel/x86_64/zcopy.S b/kernel/x86_64/zcopy.S
index d76426b..3cc4e18 100644
--- a/kernel/x86_64/zcopy.S
+++ b/kernel/x86_64/zcopy.S
@@ -50,7 +50,7 @@
#define INCY %r10
#define FLAG %r11
#endif
-
+
#include "l1param.h"
PROLOGUE
diff --git a/kernel/x86_64/zcopy_sse.S b/kernel/x86_64/zcopy_sse.S
index 91f283a..018a56f 100644
--- a/kernel/x86_64/zcopy_sse.S
+++ b/kernel/x86_64/zcopy_sse.S
@@ -65,7 +65,7 @@
#endif
SAVEREGISTERS
-
+
salq $ZBASE_SHIFT, INCX
salq $ZBASE_SHIFT, INCY
@@ -80,7 +80,7 @@
subq $-32 * SIZE, X
subq $-32 * SIZE, Y
addq M, M
-
+
testq $SIZE, Y
je .L05
diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S
index f968347..607b9b9 100644
--- a/kernel/x86_64/zdot.S
+++ b/kernel/x86_64/zdot.S
@@ -53,7 +53,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
movq 40(%rsp), INCY
#endif
diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S
index e2f153a..f53e04c 100644
--- a/kernel/x86_64/zdot_sse.S
+++ b/kernel/x86_64/zdot_sse.S
@@ -92,7 +92,7 @@
movsd -32 * SIZE(X), %xmm4
movsd -32 * SIZE(Y), %xmm0
- pshufd $0xb1, %xmm0, %xmm1
+ pshufd $0xb1, %xmm0, %xmm1
mulps %xmm4, %xmm0
mulps %xmm4, %xmm1
addq $2 * SIZE, X
@@ -126,7 +126,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movaps -16 * SIZE(Y), %xmm8
@@ -134,7 +134,7 @@
movaps -16 * SIZE(X), %xmm4
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movaps -12 * SIZE(Y), %xmm9
@@ -146,7 +146,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movaps -8 * SIZE(Y), %xmm10
@@ -154,7 +154,7 @@
movaps -8 * SIZE(X), %xmm6
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movaps -4 * SIZE(Y), %xmm11
@@ -166,7 +166,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movaps 0 * SIZE(Y), %xmm8
@@ -174,7 +174,7 @@
movaps 0 * SIZE(X), %xmm4
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movaps 4 * SIZE(Y), %xmm9
@@ -186,7 +186,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movaps 8 * SIZE(Y), %xmm10
@@ -194,7 +194,7 @@
movaps 8 * SIZE(X), %xmm6
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movaps 12 * SIZE(Y), %xmm11
@@ -210,7 +210,7 @@
ALIGN_3
.L12:
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movaps -16 * SIZE(Y), %xmm8
@@ -218,7 +218,7 @@
movaps -16 * SIZE(X), %xmm4
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movaps -12 * SIZE(Y), %xmm9
@@ -226,7 +226,7 @@
movaps -12 * SIZE(X), %xmm5
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movaps -8 * SIZE(Y), %xmm10
@@ -234,7 +234,7 @@
movaps -8 * SIZE(X), %xmm6
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movaps -4 * SIZE(Y), %xmm11
@@ -242,25 +242,25 @@
movaps -4 * SIZE(X), %xmm7
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
mulps %xmm6, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
mulps %xmm7, %xmm12
@@ -277,7 +277,7 @@
movaps -32 * SIZE(X), %xmm4
movaps -32 * SIZE(Y), %xmm8
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -286,7 +286,7 @@
movaps -28 * SIZE(X), %xmm5
movaps -28 * SIZE(Y), %xmm9
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
@@ -295,7 +295,7 @@
movaps -24 * SIZE(X), %xmm6
movaps -24 * SIZE(Y), %xmm10
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
mulps %xmm6, %xmm12
@@ -304,7 +304,7 @@
movaps -20 * SIZE(X), %xmm7
movaps -20 * SIZE(Y), %xmm11
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
mulps %xmm7, %xmm12
@@ -323,13 +323,13 @@
movaps -28 * SIZE(X), %xmm5
movaps -28 * SIZE(Y), %xmm9
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
@@ -346,7 +346,7 @@
movaps -32 * SIZE(X), %xmm4
movaps -32 * SIZE(Y), %xmm8
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -369,7 +369,7 @@
#endif
movsd -32 * SIZE(Y), %xmm8
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -410,7 +410,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -420,7 +420,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -434,7 +434,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -444,7 +444,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -458,7 +458,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -468,7 +468,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -482,7 +482,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -492,7 +492,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -510,7 +510,7 @@
.L22:
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -520,7 +520,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -530,7 +530,7 @@
addps %xmm12, %xmm1
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -540,7 +540,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -550,7 +550,7 @@
addps %xmm12, %xmm1
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -559,7 +559,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -567,7 +567,7 @@
addps %xmm12, %xmm1
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -575,7 +575,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -596,7 +596,7 @@
movaps -28 * SIZE(Y), %xmm10
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -607,7 +607,7 @@
movaps -24 * SIZE(Y), %xmm11
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -618,7 +618,7 @@
movaps -20 * SIZE(Y), %xmm8
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -626,7 +626,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -645,7 +645,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -656,7 +656,7 @@
movaps -28 * SIZE(Y), %xmm10
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -677,7 +677,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -698,7 +698,7 @@
#endif
movsd -32 * SIZE(X), %xmm4
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x59, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -745,7 +745,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movsd -16 * SIZE(Y), %xmm8
@@ -754,7 +754,7 @@
movaps -16 * SIZE(X), %xmm4
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movsd -12 * SIZE(Y), %xmm9
@@ -767,7 +767,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movsd -8 * SIZE(Y), %xmm10
@@ -776,7 +776,7 @@
movaps -8 * SIZE(X), %xmm6
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movsd -4 * SIZE(Y), %xmm11
@@ -789,7 +789,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movsd 0 * SIZE(Y), %xmm8
@@ -798,7 +798,7 @@
movaps 0 * SIZE(X), %xmm4
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movsd 4 * SIZE(Y), %xmm9
@@ -811,7 +811,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movsd 8 * SIZE(Y), %xmm10
@@ -820,7 +820,7 @@
movaps 8 * SIZE(X), %xmm6
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movsd 12 * SIZE(Y), %xmm11
@@ -837,7 +837,7 @@
ALIGN_3
.L32:
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movsd -16 * SIZE(Y), %xmm8
@@ -846,7 +846,7 @@
movaps -16 * SIZE(X), %xmm4
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movsd -12 * SIZE(Y), %xmm9
@@ -855,7 +855,7 @@
movaps -12 * SIZE(X), %xmm5
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movsd -8 * SIZE(Y), %xmm10
@@ -864,7 +864,7 @@
movaps -8 * SIZE(X), %xmm6
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movsd -4 * SIZE(Y), %xmm11
@@ -873,25 +873,25 @@
movaps -4 * SIZE(X), %xmm7
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
mulps %xmm6, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
mulps %xmm7, %xmm12
@@ -909,7 +909,7 @@
movsd -32 * SIZE(Y), %xmm8
movhps -30 * SIZE(Y), %xmm8
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -919,7 +919,7 @@
movsd -28 * SIZE(Y), %xmm9
movhps -26 * SIZE(Y), %xmm9
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
@@ -929,7 +929,7 @@
movsd -24 * SIZE(Y), %xmm10
movhps -22 * SIZE(Y), %xmm10
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
mulps %xmm6, %xmm12
@@ -939,7 +939,7 @@
movsd -20 * SIZE(Y), %xmm11
movhps -18 * SIZE(Y), %xmm11
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
mulps %xmm7, %xmm12
@@ -957,7 +957,7 @@
movsd -32 * SIZE(Y), %xmm8
movhps -30 * SIZE(Y), %xmm8
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -967,7 +967,7 @@
movsd -28 * SIZE(Y), %xmm9
movhps -26 * SIZE(Y), %xmm9
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
@@ -985,7 +985,7 @@
movsd -32 * SIZE(Y), %xmm8
movhps -30 * SIZE(Y), %xmm8
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -1008,7 +1008,7 @@
#endif
movsd -32 * SIZE(Y), %xmm8
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -1045,7 +1045,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1055,7 +1055,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1069,7 +1069,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1079,7 +1079,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1093,7 +1093,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1103,7 +1103,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1117,7 +1117,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1127,7 +1127,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1145,7 +1145,7 @@
.L42:
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1155,7 +1155,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1165,7 +1165,7 @@
addps %xmm12, %xmm1
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1175,7 +1175,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1185,7 +1185,7 @@
addps %xmm12, %xmm1
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1194,7 +1194,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1202,7 +1202,7 @@
addps %xmm12, %xmm1
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1210,7 +1210,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1231,7 +1231,7 @@
movaps -28 * SIZE(Y), %xmm10
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1242,7 +1242,7 @@
movaps -24 * SIZE(Y), %xmm11
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1253,7 +1253,7 @@
movaps -20 * SIZE(Y), %xmm8
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1261,7 +1261,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1280,7 +1280,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1291,7 +1291,7 @@
movaps -28 * SIZE(Y), %xmm10
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1312,7 +1312,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1335,7 +1335,7 @@
movss -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x03, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1368,7 +1368,7 @@
#endif
movsd -32 * SIZE(Y), %xmm4
- pshufd $0xb1, %xmm0, %xmm1
+ pshufd $0xb1, %xmm0, %xmm1
mulps %xmm4, %xmm0
mulps %xmm4, %xmm1
addq $2 * SIZE, X
@@ -1408,7 +1408,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1418,7 +1418,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1432,7 +1432,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1442,7 +1442,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1456,7 +1456,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1466,7 +1466,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1480,7 +1480,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1490,7 +1490,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1508,7 +1508,7 @@
.L52:
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1518,7 +1518,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1528,7 +1528,7 @@
addps %xmm12, %xmm1
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1538,7 +1538,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1548,7 +1548,7 @@
addps %xmm12, %xmm1
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1557,7 +1557,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1565,7 +1565,7 @@
addps %xmm12, %xmm1
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1573,7 +1573,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1594,7 +1594,7 @@
movaps -28 * SIZE(X), %xmm10
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1605,7 +1605,7 @@
movaps -24 * SIZE(X), %xmm11
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1616,7 +1616,7 @@
movaps -20 * SIZE(X), %xmm8
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x39, %xmm10, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1624,7 +1624,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x39, %xmm11, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1643,7 +1643,7 @@
movaps -32 * SIZE(X), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1654,7 +1654,7 @@
movaps -28 * SIZE(X), %xmm10
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x39, %xmm9, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1675,7 +1675,7 @@
movaps -32 * SIZE(X), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1696,7 +1696,7 @@
#endif
movsd -32 * SIZE(Y), %xmm4
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0xa9, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1733,7 +1733,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1743,7 +1743,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1757,7 +1757,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1767,7 +1767,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1781,7 +1781,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1791,7 +1791,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1805,7 +1805,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1815,7 +1815,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1833,7 +1833,7 @@
.L62:
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1843,7 +1843,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1853,7 +1853,7 @@
addps %xmm12, %xmm1
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1863,7 +1863,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1873,7 +1873,7 @@
addps %xmm12, %xmm1
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1882,7 +1882,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1890,7 +1890,7 @@
addps %xmm12, %xmm1
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1898,7 +1898,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1919,7 +1919,7 @@
movaps -28 * SIZE(X), %xmm10
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1930,7 +1930,7 @@
movaps -24 * SIZE(X), %xmm11
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -1941,7 +1941,7 @@
movaps -20 * SIZE(X), %xmm8
movss %xmm11, %xmm10
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
shufps $0x93, %xmm11, %xmm10
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
@@ -1949,7 +1949,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
shufps $0x93, %xmm8, %xmm11
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
@@ -1968,7 +1968,7 @@
movaps -32 * SIZE(X), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -1979,7 +1979,7 @@
movaps -28 * SIZE(X), %xmm10
movss %xmm10, %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
shufps $0x93, %xmm10, %xmm9
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
@@ -2000,7 +2000,7 @@
movaps -32 * SIZE(X), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm9, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -2023,7 +2023,7 @@
movss -32 * SIZE(X), %xmm9
movss %xmm9, %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
shufps $0x03, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
@@ -2046,7 +2046,7 @@
#endif
movsd -32 * SIZE(X), %xmm4
- pshufd $0xb1, %xmm0, %xmm1
+ pshufd $0xb1, %xmm0, %xmm1
mulps %xmm4, %xmm0
mulps %xmm4, %xmm1
addq $2 * SIZE, X
@@ -2095,7 +2095,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
movaps -16 * SIZE(Y), %xmm4
@@ -2104,7 +2104,7 @@
movhps -14 * SIZE(X), %xmm8
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
movaps -12 * SIZE(Y), %xmm5
@@ -2113,7 +2113,7 @@
movhps -10 * SIZE(X), %xmm9
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
movaps -8 * SIZE(Y), %xmm6
@@ -2122,7 +2122,7 @@
movhps -6 * SIZE(X), %xmm10
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
movaps -4 * SIZE(Y), %xmm7
@@ -2131,7 +2131,7 @@
movhps -2 * SIZE(X), %xmm11
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
movaps 0 * SIZE(Y), %xmm4
@@ -2140,7 +2140,7 @@
movhps 2 * SIZE(X), %xmm8
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
movaps 4 * SIZE(Y), %xmm5
@@ -2149,7 +2149,7 @@
movhps 6 * SIZE(X), %xmm9
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
movaps 8 * SIZE(Y), %xmm6
@@ -2158,7 +2158,7 @@
movhps 10 * SIZE(X), %xmm10
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
movaps 12 * SIZE(Y), %xmm7
@@ -2175,7 +2175,7 @@
ALIGN_3
.L52:
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
movaps -16 * SIZE(Y), %xmm4
@@ -2184,7 +2184,7 @@
movhps -14 * SIZE(X), %xmm8
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
movaps -12 * SIZE(Y), %xmm5
@@ -2193,7 +2193,7 @@
movhps -10 * SIZE(X), %xmm9
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
movaps -8 * SIZE(Y), %xmm6
@@ -2202,7 +2202,7 @@
movhps -6 * SIZE(X), %xmm10
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
movaps -4 * SIZE(Y), %xmm7
@@ -2211,25 +2211,25 @@
movhps -2 * SIZE(X), %xmm11
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
mulps %xmm9, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
mulps %xmm10, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
mulps %xmm11, %xmm12
@@ -2251,7 +2251,7 @@
movlps -28 * SIZE(X), %xmm9
movhps -26 * SIZE(X), %xmm9
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
@@ -2261,7 +2261,7 @@
movlps -24 * SIZE(X), %xmm10
movhps -22 * SIZE(X), %xmm10
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
mulps %xmm9, %xmm12
@@ -2271,13 +2271,13 @@
movlps -20 * SIZE(X), %xmm11
movhps -18 * SIZE(X), %xmm11
- pshufd $0xb1, %xmm6, %xmm12
+ pshufd $0xb1, %xmm6, %xmm12
mulps %xmm10, %xmm6
addps %xmm6, %xmm0
mulps %xmm10, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm7, %xmm12
+ pshufd $0xb1, %xmm7, %xmm12
mulps %xmm11, %xmm7
addps %xmm7, %xmm0
mulps %xmm11, %xmm12
@@ -2295,7 +2295,7 @@
movlps -32 * SIZE(X), %xmm8
movhps -30 * SIZE(X), %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
@@ -2305,7 +2305,7 @@
movlps -28 * SIZE(X), %xmm9
movhps -26 * SIZE(X), %xmm9
- pshufd $0xb1, %xmm5, %xmm12
+ pshufd $0xb1, %xmm5, %xmm12
mulps %xmm9, %xmm5
addps %xmm5, %xmm0
mulps %xmm9, %xmm12
@@ -2323,7 +2323,7 @@
movlps -32 * SIZE(X), %xmm8
movhps -30 * SIZE(X), %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
@@ -2347,7 +2347,7 @@
#endif
movsd -32 * SIZE(X), %xmm8
- pshufd $0xb1, %xmm4, %xmm12
+ pshufd $0xb1, %xmm4, %xmm12
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
@@ -2409,7 +2409,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2419,7 +2419,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2433,7 +2433,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2443,7 +2443,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2457,7 +2457,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2467,7 +2467,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2481,7 +2481,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2491,7 +2491,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2509,7 +2509,7 @@
.L72:
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2519,7 +2519,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2529,7 +2529,7 @@
addps %xmm12, %xmm3
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2539,7 +2539,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2549,7 +2549,7 @@
addps %xmm12, %xmm3
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2559,7 +2559,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2567,7 +2567,7 @@
addps %xmm12, %xmm3
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2575,7 +2575,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2594,7 +2594,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2605,7 +2605,7 @@
movaps -28 * SIZE(Y), %xmm10
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2616,7 +2616,7 @@
movaps -24 * SIZE(Y), %xmm11
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2627,7 +2627,7 @@
movaps -20 * SIZE(Y), %xmm8
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2648,7 +2648,7 @@
movaps -28 * SIZE(Y), %xmm10
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2656,7 +2656,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2678,7 +2678,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2698,7 +2698,7 @@
movss %xmm5, %xmm8
shufps $0x24, %xmm4, %xmm4
- pshufd $0x18, %xmm8, %xmm12
+ pshufd $0x18, %xmm8, %xmm12
shufps $0x24, %xmm8, %xmm8
mulps %xmm4, %xmm8
@@ -2748,7 +2748,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2759,7 +2759,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2774,7 +2774,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2785,7 +2785,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2800,7 +2800,7 @@
#endif
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2811,7 +2811,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2826,7 +2826,7 @@
#endif
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2837,7 +2837,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2856,7 +2856,7 @@
.L82:
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2867,7 +2867,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2878,7 +2878,7 @@
addps %xmm12, %xmm3
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2889,7 +2889,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2900,7 +2900,7 @@
addps %xmm12, %xmm3
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2911,7 +2911,7 @@
addps %xmm12, %xmm1
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2919,7 +2919,7 @@
addps %xmm12, %xmm3
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2927,7 +2927,7 @@
addps %xmm12, %xmm1
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -2947,7 +2947,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -2959,7 +2959,7 @@
movaps -28 * SIZE(Y), %xmm10
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -2971,7 +2971,7 @@
movaps -24 * SIZE(Y), %xmm11
movss %xmm11, %xmm10
- pshufd $0x1b, %xmm10, %xmm12
+ pshufd $0x1b, %xmm10, %xmm12
movss %xmm7, %xmm6
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
@@ -2983,7 +2983,7 @@
movaps -20 * SIZE(Y), %xmm8
movss %xmm8, %xmm11
- pshufd $0x1b, %xmm11, %xmm12
+ pshufd $0x1b, %xmm11, %xmm12
movss %xmm4, %xmm7
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
@@ -3003,7 +3003,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -3015,7 +3015,7 @@
movaps -28 * SIZE(Y), %xmm10
movss %xmm10, %xmm9
- pshufd $0x1b, %xmm9, %xmm12
+ pshufd $0x1b, %xmm9, %xmm12
movss %xmm6, %xmm5
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
@@ -3038,7 +3038,7 @@
movaps -32 * SIZE(Y), %xmm9
movss %xmm9, %xmm8
- pshufd $0x1b, %xmm8, %xmm12
+ pshufd $0x1b, %xmm8, %xmm12
movss %xmm5, %xmm4
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
@@ -3058,7 +3058,7 @@
movss %xmm5, %xmm8
shufps $0x24, %xmm4, %xmm4
- pshufd $0x18, %xmm8, %xmm12
+ pshufd $0x18, %xmm8, %xmm12
shufps $0x24, %xmm8, %xmm8
mulps %xmm4, %xmm8
@@ -3121,7 +3121,7 @@
ALIGN_3
.L203:
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movsd (Y), %xmm8
@@ -3135,7 +3135,7 @@
addq INCX, X
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movsd (Y), %xmm9
@@ -3149,7 +3149,7 @@
addq INCX, X
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movsd (Y), %xmm10
@@ -3163,7 +3163,7 @@
addq INCX, X
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movsd (Y), %xmm11
@@ -3177,7 +3177,7 @@
addq INCX, X
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movsd (Y), %xmm8
@@ -3191,7 +3191,7 @@
addq INCX, X
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movsd (Y), %xmm9
@@ -3205,7 +3205,7 @@
addq INCX, X
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movsd (Y), %xmm10
@@ -3219,7 +3219,7 @@
addq INCX, X
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movsd (Y), %xmm11
@@ -3239,7 +3239,7 @@
ALIGN_3
.L204:
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
movsd (Y), %xmm8
@@ -3253,7 +3253,7 @@
addq INCX, X
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
movsd (Y), %xmm9
@@ -3267,7 +3267,7 @@
addq INCX, X
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
movsd (Y), %xmm10
@@ -3281,7 +3281,7 @@
addq INCX, X
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
movsd (Y), %xmm11
@@ -3295,25 +3295,25 @@
addq INCX, X
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
addps %xmm12, %xmm3
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
mulps %xmm6, %xmm12
addps %xmm12, %xmm1
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
mulps %xmm7, %xmm12
@@ -3333,7 +3333,7 @@
movhps (Y), %xmm8
addq INCY, Y
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -3348,7 +3348,7 @@
movhps (Y), %xmm9
addq INCY, Y
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
@@ -3363,7 +3363,7 @@
movhps (Y), %xmm10
addq INCY, Y
- pshufd $0xb1, %xmm10, %xmm12
+ pshufd $0xb1, %xmm10, %xmm12
mulps %xmm6, %xmm10
addps %xmm10, %xmm0
mulps %xmm6, %xmm12
@@ -3378,7 +3378,7 @@
movhps (Y), %xmm11
addq INCY, Y
- pshufd $0xb1, %xmm11, %xmm12
+ pshufd $0xb1, %xmm11, %xmm12
mulps %xmm7, %xmm11
addps %xmm11, %xmm2
mulps %xmm7, %xmm12
@@ -3398,7 +3398,7 @@
movhps (Y), %xmm8
addq INCY, Y
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -3413,7 +3413,7 @@
movhps (Y), %xmm9
addq INCY, Y
- pshufd $0xb1, %xmm9, %xmm12
+ pshufd $0xb1, %xmm9, %xmm12
mulps %xmm5, %xmm9
addps %xmm9, %xmm2
mulps %xmm5, %xmm12
@@ -3433,7 +3433,7 @@
movhps (Y), %xmm8
addq INCY, Y
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -3453,7 +3453,7 @@
#endif
movsd (Y), %xmm8
- pshufd $0xb1, %xmm8, %xmm12
+ pshufd $0xb1, %xmm8, %xmm12
mulps %xmm4, %xmm8
addps %xmm8, %xmm0
mulps %xmm4, %xmm12
@@ -3483,7 +3483,7 @@
subss %xmm3, %xmm1
#endif
unpcklps %xmm1, %xmm0
-
+
#ifdef WINDOWS_ABI
movq %xmm0, %rax
#endif
diff --git a/kernel/x86_64/zdot_sse2.S b/kernel/x86_64/zdot_sse2.S
index 63acecc..3ab6f45 100644
--- a/kernel/x86_64/zdot_sse2.S
+++ b/kernel/x86_64/zdot_sse2.S
@@ -50,7 +50,7 @@
#define N ARG2 /* rdx */
#define X ARG3 /* r8 */
#define INCX ARG4 /* r9*/
-#define Y %r10
+#define Y %r10
#define INCY %r11
#endif
@@ -64,7 +64,7 @@
#define MOVLPS movlps
#endif
-
+
PROLOGUE
PROFCODE
@@ -122,7 +122,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps -8 * SIZE(Y), %xmm8
@@ -130,7 +130,7 @@
movaps -8 * SIZE(X), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps -6 * SIZE(Y), %xmm9
@@ -142,7 +142,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps -4 * SIZE(Y), %xmm10
@@ -150,7 +150,7 @@
movaps -4 * SIZE(X), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps -2 * SIZE(Y), %xmm11
@@ -162,7 +162,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps 0 * SIZE(Y), %xmm8
@@ -170,7 +170,7 @@
movaps 0 * SIZE(X), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps 2 * SIZE(Y), %xmm9
@@ -182,7 +182,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps 4 * SIZE(Y), %xmm10
@@ -190,7 +190,7 @@
movaps 4 * SIZE(X), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps 6 * SIZE(Y), %xmm11
@@ -206,7 +206,7 @@
ALIGN_3
.L12:
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps -8 * SIZE(Y), %xmm8
@@ -214,7 +214,7 @@
movaps -8 * SIZE(X), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps -6 * SIZE(Y), %xmm9
@@ -222,7 +222,7 @@
movaps -6 * SIZE(X), %xmm5
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps -4 * SIZE(Y), %xmm10
@@ -230,7 +230,7 @@
movaps -4 * SIZE(X), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps -2 * SIZE(Y), %xmm11
@@ -238,25 +238,25 @@
movaps -2 * SIZE(X), %xmm7
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
mulpd %xmm6, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
mulpd %xmm7, %xmm12
@@ -275,13 +275,13 @@
movaps -14 * SIZE(X), %xmm5
movaps -14 * SIZE(Y), %xmm9
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
@@ -292,13 +292,13 @@
movaps -10 * SIZE(X), %xmm7
movaps -10 * SIZE(Y), %xmm11
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
mulpd %xmm6, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
mulpd %xmm7, %xmm12
@@ -317,13 +317,13 @@
movaps -14 * SIZE(X), %xmm5
movaps -14 * SIZE(Y), %xmm9
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
@@ -340,7 +340,7 @@
movaps -16 * SIZE(X), %xmm4
movaps -16 * SIZE(Y), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -375,7 +375,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps -8 * SIZE(Y), %xmm8
@@ -384,7 +384,7 @@
movhps -7 * SIZE(X), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps -6 * SIZE(Y), %xmm9
@@ -397,7 +397,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps -4 * SIZE(Y), %xmm10
@@ -406,7 +406,7 @@
movhps -3 * SIZE(X), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps -2 * SIZE(Y), %xmm11
@@ -419,7 +419,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps 0 * SIZE(Y), %xmm8
@@ -428,7 +428,7 @@
movhps 1 * SIZE(X), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps 2 * SIZE(Y), %xmm9
@@ -441,7 +441,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps 4 * SIZE(Y), %xmm10
@@ -450,7 +450,7 @@
movhps 5 * SIZE(X), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps 6 * SIZE(Y), %xmm11
@@ -468,7 +468,7 @@
.L22:
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps -8 * SIZE(Y), %xmm8
@@ -477,7 +477,7 @@
movhps -7 * SIZE(X), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps -6 * SIZE(Y), %xmm9
@@ -486,7 +486,7 @@
movhps -5 * SIZE(X), %xmm5
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps -4 * SIZE(Y), %xmm10
@@ -495,7 +495,7 @@
movhps -3 * SIZE(X), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps -2 * SIZE(Y), %xmm11
@@ -504,25 +504,25 @@
movhps -1 * SIZE(X), %xmm7
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
mulpd %xmm6, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
mulpd %xmm7, %xmm12
@@ -540,7 +540,7 @@
movhps -15 * SIZE(X), %xmm4
movaps -16 * SIZE(Y), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -550,7 +550,7 @@
movhps -13 * SIZE(X), %xmm5
movaps -14 * SIZE(Y), %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
@@ -560,7 +560,7 @@
movhps -11 * SIZE(X), %xmm6
movaps -12 * SIZE(Y), %xmm10
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
mulpd %xmm6, %xmm12
@@ -570,7 +570,7 @@
movhps -9 * SIZE(X), %xmm7
movaps -10 * SIZE(Y), %xmm11
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
mulpd %xmm7, %xmm12
@@ -588,7 +588,7 @@
movhps -15 * SIZE(X), %xmm4
movaps -16 * SIZE(Y), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -598,7 +598,7 @@
movhps -13 * SIZE(X), %xmm5
movaps -14 * SIZE(Y), %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
@@ -616,7 +616,7 @@
movhps -15 * SIZE(X), %xmm4
movaps -16 * SIZE(Y), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -654,7 +654,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps -8 * SIZE(X), %xmm8
@@ -663,7 +663,7 @@
movhps -7 * SIZE(Y), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps -6 * SIZE(X), %xmm9
@@ -676,7 +676,7 @@
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps -4 * SIZE(X), %xmm10
@@ -685,7 +685,7 @@
movhps -3 * SIZE(Y), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps -2 * SIZE(X), %xmm11
@@ -698,7 +698,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps 0 * SIZE(X), %xmm8
@@ -707,7 +707,7 @@
movhps 1 * SIZE(Y), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps 2 * SIZE(X), %xmm9
@@ -720,7 +720,7 @@
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps 4 * SIZE(X), %xmm10
@@ -729,7 +729,7 @@
movhps 5 * SIZE(Y), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps 6 * SIZE(X), %xmm11
@@ -747,7 +747,7 @@
.L32:
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
movaps -8 * SIZE(X), %xmm8
@@ -756,7 +756,7 @@
movhps -7 * SIZE(Y), %xmm4
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
movaps -6 * SIZE(X), %xmm9
@@ -765,7 +765,7 @@
movhps -5 * SIZE(Y), %xmm5
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
movaps -4 * SIZE(X), %xmm10
@@ -774,7 +774,7 @@
movhps -3 * SIZE(Y), %xmm6
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
movaps -2 * SIZE(X), %xmm11
@@ -783,25 +783,25 @@
movhps -1 * SIZE(Y), %xmm7
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
mulpd %xmm6, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
mulpd %xmm7, %xmm12
@@ -819,7 +819,7 @@
movhps -15 * SIZE(Y), %xmm4
movaps -16 * SIZE(X), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -829,7 +829,7 @@
movhps -13 * SIZE(Y), %xmm5
movaps -14 * SIZE(X), %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
@@ -839,7 +839,7 @@
movhps -11 * SIZE(Y), %xmm6
movaps -12 * SIZE(X), %xmm10
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
mulpd %xmm6, %xmm12
@@ -849,7 +849,7 @@
movhps -9 * SIZE(Y), %xmm7
movaps -10 * SIZE(X), %xmm11
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
mulpd %xmm7, %xmm12
@@ -867,7 +867,7 @@
movhps -15 * SIZE(Y), %xmm4
movaps -16 * SIZE(X), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -877,7 +877,7 @@
movhps -13 * SIZE(Y), %xmm5
movaps -14 * SIZE(X), %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
@@ -898,7 +898,7 @@
movhps -15 * SIZE(Y), %xmm4
movaps -16 * SIZE(X), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -933,7 +933,7 @@
#endif
movsd %xmm9, %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
@@ -943,7 +943,7 @@
addpd %xmm12, %xmm1
movsd %xmm10, %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
movsd %xmm6, %xmm5
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm0
@@ -957,7 +957,7 @@
#endif
movsd %xmm11, %xmm10
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
movsd %xmm7, %xmm6
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
@@ -967,7 +967,7 @@
addpd %xmm12, %xmm1
movsd %xmm8, %xmm11
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
movsd %xmm4, %xmm7
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm0
@@ -981,7 +981,7 @@
#endif
movsd %xmm9, %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
@@ -991,7 +991,7 @@
addpd %xmm12, %xmm1
movsd %xmm10, %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
movsd %xmm6, %xmm5
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm0
@@ -1005,7 +1005,7 @@
#endif
movsd %xmm11, %xmm10
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
movsd %xmm7, %xmm6
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
@@ -1015,7 +1015,7 @@
addpd %xmm12, %xmm1
movsd %xmm8, %xmm11
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
movsd %xmm4, %xmm7
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm0
@@ -1033,7 +1033,7 @@
.L42:
movsd %xmm9, %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
@@ -1043,7 +1043,7 @@
addpd %xmm12, %xmm1
movsd %xmm10, %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
movsd %xmm6, %xmm5
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm0
@@ -1053,7 +1053,7 @@
addpd %xmm12, %xmm1
movsd %xmm11, %xmm10
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
movsd %xmm7, %xmm6
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
@@ -1063,7 +1063,7 @@
addpd %xmm12, %xmm1
movsd %xmm8, %xmm11
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
movsd %xmm4, %xmm7
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm0
@@ -1073,7 +1073,7 @@
addpd %xmm12, %xmm1
movsd %xmm9, %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
@@ -1083,7 +1083,7 @@
addpd %xmm12, %xmm1
movsd %xmm10, %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
movsd %xmm6, %xmm5
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm0
@@ -1091,7 +1091,7 @@
addpd %xmm12, %xmm1
movsd %xmm11, %xmm10
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
movsd %xmm7, %xmm6
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
@@ -1099,7 +1099,7 @@
addpd %xmm12, %xmm1
movsd %xmm8, %xmm11
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
movsd %xmm4, %xmm7
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm0
@@ -1120,7 +1120,7 @@
movaps -14 * SIZE(Y), %xmm10
movsd %xmm9, %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
@@ -1131,7 +1131,7 @@
movaps -12 * SIZE(Y), %xmm11
movsd %xmm10, %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
movsd %xmm6, %xmm5
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm0
@@ -1142,7 +1142,7 @@
movaps -10 * SIZE(Y), %xmm8
movsd %xmm11, %xmm10
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
movsd %xmm7, %xmm6
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
@@ -1150,7 +1150,7 @@
addpd %xmm12, %xmm1
movsd %xmm8, %xmm11
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
movsd %xmm4, %xmm7
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm0
@@ -1169,7 +1169,7 @@
movaps -16 * SIZE(Y), %xmm9
movsd %xmm9, %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
movsd %xmm5, %xmm4
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
@@ -1180,7 +1180,7 @@
movaps -14 * SIZE(Y), %xmm10
movsd %xmm10, %xmm9
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
movsd %xmm6, %xmm5
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm0
@@ -1201,7 +1201,7 @@
movlps -16 * SIZE(X), %xmm4
movlps -16 * SIZE(Y), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -1213,7 +1213,7 @@
SHUFPD_1 %xmm1, %xmm1
SHUFPD_1 %xmm2, %xmm2
SHUFPD_1 %xmm3, %xmm3
- jmp .L98
+ jmp .L98
ALIGN_3
.L50:
@@ -1254,7 +1254,7 @@
ALIGN_3
.L53:
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
MOVLPS 0 * SIZE(Y), %xmm8
@@ -1266,7 +1266,7 @@
addq INCX, X
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
MOVLPS 0 * SIZE(Y), %xmm9
@@ -1278,7 +1278,7 @@
addq INCX, X
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
MOVLPS 0 * SIZE(Y), %xmm10
@@ -1290,7 +1290,7 @@
addq INCX, X
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
MOVLPS 0 * SIZE(Y), %xmm11
@@ -1302,7 +1302,7 @@
addq INCX, X
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
MOVLPS 0 * SIZE(Y), %xmm8
@@ -1315,7 +1315,7 @@
addq INCX, X
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
MOVLPS 0 * SIZE(Y), %xmm9
@@ -1328,7 +1328,7 @@
addq INCX, X
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
MOVLPS 0 * SIZE(Y), %xmm10
@@ -1340,7 +1340,7 @@
addq INCX, X
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
MOVLPS 0 * SIZE(Y), %xmm11
@@ -1357,7 +1357,7 @@
ALIGN_3
.L54:
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
MOVLPS 0 * SIZE(Y), %xmm8
@@ -1369,7 +1369,7 @@
addq INCX, X
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
MOVLPS 0 * SIZE(Y), %xmm9
@@ -1381,7 +1381,7 @@
addq INCX, X
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
MOVLPS 0 * SIZE(Y), %xmm10
@@ -1393,7 +1393,7 @@
addq INCX, X
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
MOVLPS 0 * SIZE(Y), %xmm11
@@ -1405,25 +1405,25 @@
addq INCX, X
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
addpd %xmm12, %xmm3
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
mulpd %xmm6, %xmm12
addpd %xmm12, %xmm1
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
mulpd %xmm7, %xmm12
@@ -1441,7 +1441,7 @@
movhps 1 * SIZE(Y), %xmm8
addq INCY, Y
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -1454,7 +1454,7 @@
movhps 1 * SIZE(Y), %xmm9
addq INCY, Y
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
@@ -1467,7 +1467,7 @@
movhps 1 * SIZE(Y), %xmm10
addq INCY, Y
- pshufd $0x4e, %xmm10, %xmm12
+ pshufd $0x4e, %xmm10, %xmm12
mulpd %xmm6, %xmm10
addpd %xmm10, %xmm0
mulpd %xmm6, %xmm12
@@ -1480,7 +1480,7 @@
movhps 1 * SIZE(Y), %xmm11
addq INCY, Y
- pshufd $0x4e, %xmm11, %xmm12
+ pshufd $0x4e, %xmm11, %xmm12
mulpd %xmm7, %xmm11
addpd %xmm11, %xmm2
mulpd %xmm7, %xmm12
@@ -1498,7 +1498,7 @@
movhps 1 * SIZE(Y), %xmm8
addq INCY, Y
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
@@ -1511,7 +1511,7 @@
movhps 1 * SIZE(Y), %xmm9
addq INCY, Y
- pshufd $0x4e, %xmm9, %xmm12
+ pshufd $0x4e, %xmm9, %xmm12
mulpd %xmm5, %xmm9
addpd %xmm9, %xmm2
mulpd %xmm5, %xmm12
@@ -1527,7 +1527,7 @@
MOVLPS 0 * SIZE(Y), %xmm8
movhps 1 * SIZE(Y), %xmm8
- pshufd $0x4e, %xmm8, %xmm12
+ pshufd $0x4e, %xmm8, %xmm12
mulpd %xmm4, %xmm8
addpd %xmm8, %xmm0
mulpd %xmm4, %xmm12
diff --git a/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S b/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S
index 97eb1ec..0069066 100644
--- a/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S
+++ b/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -103,7 +103,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -148,7 +148,7 @@
movlps %xmm0, ALPHA_R
movlps %xmm1, ALPHA_I
-
+
subq $-16 * SIZE, A
subq $-16 * SIZE, B
@@ -162,7 +162,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
@@ -205,7 +205,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
prefetcht0 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -241,7 +241,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -256,7 +256,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm12
@@ -606,7 +606,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -626,7 +626,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -640,7 +640,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -655,7 +655,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -830,7 +830,7 @@
movhps %xmm2, 1 * SIZE(CO2, LDC, 2)
movlps %xmm3, 0 * SIZE(CO2, %rax)
movhps %xmm3, 1 * SIZE(CO2, %rax)
- ALIGN_4
+ ALIGN_4
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -879,7 +879,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -901,7 +901,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -916,7 +916,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -1120,7 +1120,7 @@
decq I
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1140,7 +1140,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1154,7 +1154,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1169,7 +1169,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
mulpd %xmm0, %xmm1
addpd %xmm1, %xmm8
movaps -14 * SIZE(BO), %xmm1
@@ -1279,7 +1279,7 @@
movhps %xmm2, 1 * SIZE(CO2)
movlps %xmm3, 0 * SIZE(CO2, LDC)
movhps %xmm3, 1 * SIZE(CO2, LDC)
- ALIGN_4
+ ALIGN_4
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1324,7 +1324,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -1342,7 +1342,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1357,7 +1357,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -1482,7 +1482,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1502,7 +1502,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1514,7 +1514,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1529,7 +1529,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
mulpd %xmm0, %xmm1
movddup -15 * SIZE(AO), %xmm0
addpd %xmm1, %xmm8
@@ -1605,7 +1605,7 @@
movhps %xmm0, 1 * SIZE(CO1)
movlps %xmm1, 0 * SIZE(CO2)
movhps %xmm1, 1 * SIZE(CO2)
- ALIGN_4
+ ALIGN_4
.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1649,7 +1649,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -1666,7 +1666,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1681,7 +1681,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm1, %xmm8
@@ -1766,7 +1766,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L80:
testq $1, M
@@ -1786,7 +1786,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
#ifndef TRMMKERNEL
movaps -16 * SIZE(AO), %xmm0
@@ -1822,7 +1822,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
mulpd %xmm0, %xmm1
#ifndef TRMMKERNEL
movapd -14 * SIZE(AO), %xmm0
@@ -1902,7 +1902,7 @@
movlps %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 1 * SIZE(CO1)
- ALIGN_4
+ ALIGN_4
.L999:
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/zgemm3m_kernel_4x2_atom.S b/kernel/x86_64/zgemm3m_kernel_4x2_atom.S
index 189505d..1049c01 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x2_atom.S
+++ b/kernel/x86_64/zgemm3m_kernel_4x2_atom.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -96,7 +96,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -136,14 +136,14 @@
movsd %xmm0, ALPHA_R
movsd %xmm1, ALPHA_I
-
+
salq $ZBASE_SHIFT, LDC
movq N, J
sarq $1, J
jle .L40
ALIGN_4
-
+
.L10:
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -154,7 +154,7 @@
movq K, %rax
salq $BASE_SHIFT + 1, %rax
leaq (B, %rax), BB
-
+
movq M, I
sarq $2, I
jle .L20
@@ -202,7 +202,7 @@
addsd %xmm7, %xmm14
movsd 3 * SIZE(AO), %xmm7
mulsd %xmm3, %xmm2
-
+
addsd %xmm6, %xmm15
PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
movaps %xmm4, %xmm6
@@ -771,7 +771,7 @@
movsd %xmm9, 0 * SIZE(CO2)
movsd %xmm11, 1 * SIZE(CO2)
ALIGN_4
-
+
.L39:
movq BO, B
decq J # j --
@@ -1185,7 +1185,7 @@
movsd %xmm8, 0 * SIZE(CO1)
movsd %xmm9, 1 * SIZE(CO1)
ALIGN_4
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S
index 4199bd9..76ed76d 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define BUFFERED
#define OLD_M %rdi
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -305,7 +305,7 @@
movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\
movapd %xmm0, %xmm2 ;\
addq $8 * SIZE, %rax
-
+
#define KERNEL_SUB1(xx) \
mulpd %xmm1, %xmm0 ;\
mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\
@@ -416,7 +416,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
@@ -475,7 +475,7 @@
movsd %xmm0, 0 + ALPHA
movsd %xmm1, 8 + ALPHA
-
+
salq $ZBASE_SHIFT, LDC
#ifdef TRMMKERNEL
@@ -483,7 +483,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq N, J
sarq $2, J # j = (n >> 2)
@@ -496,18 +496,18 @@
#endif
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
#ifdef BUFFERED
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_3
-
+
.L02:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -567,7 +567,7 @@
subq $1, %rax
jne .L04
ALIGN_4
-
+
.L10:
#endif
movq A, AO # aoffset = a
@@ -594,7 +594,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
prefetch (RPREFETCHSIZE + 0) * SIZE(BB)
prefetch (RPREFETCHSIZE + 8) * SIZE(BB)
@@ -626,7 +626,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -831,7 +831,7 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
movsd 0 * SIZE(CO2), %xmm0
movhpd 1 * SIZE(CO2), %xmm0
movsd 2 * SIZE(CO2), %xmm1
@@ -866,7 +866,7 @@
movhpd %xmm2, 5 * SIZE(CO2)
movsd %xmm3, 6 * SIZE(CO2)
movhpd %xmm3, 7 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhpd 1 * SIZE(CO1, LDC, 2), %xmm0
movsd 2 * SIZE(CO1, LDC, 2), %xmm1
@@ -901,7 +901,7 @@
movhpd %xmm2, 5 * SIZE(CO1, LDC, 2)
movsd %xmm3, 6 * SIZE(CO1, LDC, 2)
movhpd %xmm3, 7 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
movsd 2 * SIZE(CO2, LDC, 2), %xmm1
@@ -943,7 +943,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $3, M
@@ -969,7 +969,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -986,7 +986,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1155,7 +1155,7 @@
movhpd %xmm0, 1 * SIZE(CO1, LDC, 2)
movsd %xmm1, 2 * SIZE(CO1, LDC, 2)
movhpd %xmm1, 3 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
movsd 2 * SIZE(CO2, LDC, 2), %xmm1
@@ -1176,7 +1176,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1197,7 +1197,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1214,7 +1214,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1328,7 +1328,7 @@
movsd %xmm0, 0 * SIZE(CO1, LDC, 2)
movhpd %xmm0, 1 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
@@ -1339,8 +1339,8 @@
movsd %xmm0, 0 * SIZE(CO2, LDC, 2)
movhpd %xmm0, 1 * SIZE(CO2, LDC, 2)
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1367,18 +1367,18 @@
#ifdef BUFFERED
leaq 16 * SIZE + BUFFER, BO
#endif
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
#ifdef BUFFERED
movq K, %rax
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
prefetchnta (RPREFETCHSIZE + 0) * SIZE(B)
@@ -1419,7 +1419,7 @@
subq $1, %rax
jne .L44
ALIGN_4
-
+
.L50:
#endif
movq C, CO1 # coffset1 = c
@@ -1447,7 +1447,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm1
movddup -15 * SIZE(BO), %xmm5
@@ -1467,7 +1467,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1612,7 +1612,7 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
movsd 0 * SIZE(CO2), %xmm0
movhpd 1 * SIZE(CO2), %xmm0
movsd 2 * SIZE(CO2), %xmm1
@@ -1653,7 +1653,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1676,7 +1676,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1692,7 +1692,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1780,7 +1780,7 @@
.L69:
addpd %xmm10, %xmm8
addpd %xmm11, %xmm9
-
+
movsd 0 * SIZE(CO1), %xmm0
movhpd 1 * SIZE(CO1), %xmm0
movsd 2 * SIZE(CO1), %xmm1
@@ -1819,7 +1819,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1842,7 +1842,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1858,7 +1858,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1950,8 +1950,8 @@
movsd %xmm0, 0 * SIZE(CO2)
movhpd %xmm0, 1 * SIZE(CO2)
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -1973,18 +1973,18 @@
#ifdef BUFFERED
leaq 16 * SIZE + BUFFER, BO
#endif
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
#ifdef BUFFERED
movq K, %rax
sarq $3, %rax
jle .L83
ALIGN_4
-
+
.L82:
prefetchnta (RPREFETCHSIZE + 0) * SIZE(B)
@@ -2025,7 +2025,7 @@
decq %rax
jne .L84
ALIGN_4
-
+
.L90:
#endif
movq C, CO1 # coffset1 = c
@@ -2052,7 +2052,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movapd -8 * SIZE(AO), %xmm2
pxor %xmm8, %xmm8
@@ -2071,7 +2071,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2189,12 +2189,12 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -2217,7 +2217,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm0
pxor %xmm8, %xmm8
@@ -2233,7 +2233,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2325,7 +2325,7 @@
movhpd %xmm1, 3 * SIZE(CO1)
addq $4 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -2348,7 +2348,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2360,7 +2360,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_core2.S b/kernel/x86_64/zgemm3m_kernel_4x4_core2.S
index 1b466fb..a78890d 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x4_core2.S
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -95,7 +95,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -145,7 +145,7 @@
movsd %xmm0, 0 + ALPHA
movsd %xmm1, 8 + ALPHA
-
+
subq $-16 * SIZE, A
subq $-16 * SIZE, B
@@ -159,7 +159,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq N, J
@@ -171,18 +171,18 @@
.L01:
/* Copying to Sub Buffer */
leaq 16 * SIZE + BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $2, %rax
NOBRANCH
jle .L05
ALIGN_4
-
+
.L02:
movapd -16 * SIZE(B), %xmm0
prefetchnta (PREFETCH_R + 0) * SIZE(B)
@@ -213,7 +213,7 @@
unpckhpd %xmm6, %xmm6
movddup %xmm7, %xmm15
unpckhpd %xmm7, %xmm7
-
+
prefetcht0 (PREFETCH_W + 8) * SIZE(BO)
movapd %xmm8, -16 * SIZE(BO)
movapd %xmm0, -14 * SIZE(BO)
@@ -271,7 +271,7 @@
BRANCH
jne .L06
ALIGN_4
-
+
.L10:
leaq (PREFETCH_R + 0) * SIZE(B), BB
@@ -297,7 +297,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -332,7 +332,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -347,7 +347,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PADDING;
addpd %xmm2, %xmm10
movaps -16 * SIZE(BO), %xmm2
@@ -572,7 +572,7 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
movsd 0 * SIZE(CO2), %xmm0
movhpd 1 * SIZE(CO2), %xmm0
movsd 2 * SIZE(CO2), %xmm1
@@ -607,7 +607,7 @@
movhpd %xmm2, 5 * SIZE(CO2)
movsd %xmm3, 6 * SIZE(CO2)
movhpd %xmm3, 7 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhpd 1 * SIZE(CO1, LDC, 2), %xmm0
movsd 2 * SIZE(CO1, LDC, 2), %xmm1
@@ -642,7 +642,7 @@
movhpd %xmm2, 5 * SIZE(CO1, LDC, 2)
movsd %xmm3, 6 * SIZE(CO1, LDC, 2)
movhpd %xmm3, 7 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
movsd 2 * SIZE(CO2, LDC, 2), %xmm1
@@ -683,7 +683,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -701,7 +701,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movapd -16 * SIZE(AO), %xmm0
@@ -719,7 +719,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -734,7 +734,7 @@
jle .L25
ALIGN_4
-.L21:
+.L21:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm8
movapd -16 * SIZE(BO), %xmm2
@@ -893,7 +893,7 @@
movhpd %xmm0, 1 * SIZE(CO1, LDC, 2)
movsd %xmm1, 2 * SIZE(CO1, LDC, 2)
movhpd %xmm1, 3 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
movsd 2 * SIZE(CO2, LDC, 2), %xmm1
@@ -915,7 +915,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
ALIGN_4
-
+
.L30:
testq $1, M
BRANCH
@@ -932,7 +932,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movsd -16 * SIZE(AO), %xmm0
@@ -951,7 +951,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -966,7 +966,7 @@
jle .L35
ALIGN_4
-.L31:
+.L31:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addsd %xmm2, %xmm8
movsd -16 * SIZE(BO), %xmm2
@@ -1101,7 +1101,7 @@
movsd %xmm0, 0 * SIZE(CO1, LDC, 2)
movhpd %xmm0, 1 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
@@ -1133,11 +1133,11 @@
.L41:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $3, %rax
@@ -1145,7 +1145,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L42:
movddup -16 * SIZE(B), %xmm8
movddup -15 * SIZE(B), %xmm9
@@ -1191,7 +1191,7 @@
subq $1, %rax
jne .L44
ALIGN_4
-
+
.L45:
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -1214,7 +1214,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1238,7 +1238,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1252,7 +1252,7 @@
jle .L55
ALIGN_4
-.L51:
+.L51:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm8
@@ -1398,7 +1398,7 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
movsd 0 * SIZE(CO2), %xmm0
movhpd 1 * SIZE(CO2), %xmm0
movsd 2 * SIZE(CO2), %xmm1
@@ -1438,7 +1438,7 @@
addq $8 * SIZE, CO2
subq $1, I
jg .L50
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1455,7 +1455,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movapd -16 * SIZE(AO), %xmm0
@@ -1475,7 +1475,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1489,7 +1489,7 @@
jle .L65
ALIGN_4
-.L61:
+.L61:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm2, %xmm8
@@ -1604,7 +1604,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
ALIGN_4
-
+
.L70:
testq $1, M
jle .L79
@@ -1620,7 +1620,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movsd -15 * SIZE(AO), %xmm1
@@ -1639,7 +1639,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1653,7 +1653,7 @@
jle .L75
ALIGN_4
-.L71:
+.L71:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addsd %xmm2, %xmm8
@@ -1766,11 +1766,11 @@
.L81:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $4, %rax
@@ -1778,7 +1778,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L82:
movddup -16 * SIZE(B), %xmm8
movddup -15 * SIZE(B), %xmm9
@@ -1821,7 +1821,7 @@
subq $1, %rax
jne .L84
ALIGN_4
-
+
.L85:
movq C, CO1
movq A, AO
@@ -1843,7 +1843,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movapd -16 * SIZE(BO), %xmm4
@@ -1865,7 +1865,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1879,7 +1879,7 @@
jle .L95
ALIGN_4
-.L91:
+.L91:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm0, %xmm8
@@ -1989,12 +1989,12 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
addq $8 * SIZE, CO1 # coffset += 4
subq $1, I
jg .L90
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -2011,7 +2011,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movapd -16 * SIZE(BO), %xmm4
@@ -2032,7 +2032,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2046,7 +2046,7 @@
jle .L105
ALIGN_4
-.L101:
+.L101:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm0, %xmm8
@@ -2128,7 +2128,7 @@
addq $4 * SIZE, CO1
ALIGN_4
-
+
.L110:
testq $1, M
jle .L999
@@ -2144,7 +2144,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movsd -16 * SIZE(BO), %xmm4
@@ -2165,7 +2165,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2179,7 +2179,7 @@
jle .L115
ALIGN_4
-.L111:
+.L111:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm0, %xmm8
diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S b/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S
index 7dd2c91..b000dc5 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -122,7 +122,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -168,7 +168,7 @@
movlps %xmm0, ALPHA_R
movlps %xmm1, ALPHA_I
-
+
subq $-16 * SIZE, A
subq $-17 * SIZE, B
@@ -217,7 +217,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
xorpd %xmm3, %xmm3
@@ -251,7 +251,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -266,7 +266,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addpd %xmm3, %xmm11
movaps -15 * SIZE(BO), %xmm3
@@ -643,7 +643,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -17 * SIZE(BO), %xmm2
@@ -664,7 +664,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -679,7 +679,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
addpd %xmm3, %xmm11
movaps -15 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -871,7 +871,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -891,7 +891,7 @@
leaq (, %rax, SIZE), %rax
addq %rax, AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -17 * SIZE(BO), %xmm2
@@ -907,7 +907,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -922,7 +922,7 @@
jle .L35
ALIGN_4
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -1043,7 +1043,7 @@
movhps %xmm0, 1 * SIZE(CO1, LDC, 2)
movlps %xmm1, 0 * SIZE(CO2, LDC, 2)
movhps %xmm1, 1 * SIZE(CO2, LDC, 2)
- ALIGN_4
+ ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1072,7 +1072,7 @@
movq OFFSET, %rax
movq %rax, KK
#endif
-
+
movq K, %rax
salq $BASE_SHIFT + 1, %rax
leaq (B, %rax), BB
@@ -1096,7 +1096,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
PREFETCHB -16 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -1117,7 +1117,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1132,7 +1132,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
@@ -1336,7 +1336,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1356,7 +1356,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1370,7 +1370,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1385,7 +1385,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -1508,7 +1508,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -1528,7 +1528,7 @@
leaq (, %rax, SIZE), %rax
addq %rax, AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movaps -17 * SIZE(BO), %xmm2
@@ -1541,7 +1541,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1556,7 +1556,7 @@
jle .L75
ALIGN_4
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
shufps $0x44, %xmm0, %xmm0
@@ -1638,7 +1638,7 @@
movhps %xmm0, 1 * SIZE(CO1)
movlps %xmm1, 0 * SIZE(CO2)
movhps %xmm1, 1 * SIZE(CO2)
- ALIGN_4
+ ALIGN_4
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1681,7 +1681,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
addq %rax, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -1698,7 +1698,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1713,7 +1713,7 @@
jle .L95
ALIGN_4
-.L92:
+.L92:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -1848,7 +1848,7 @@
decq I
BRANCH
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -1868,7 +1868,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
addq %rax, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1880,7 +1880,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1895,7 +1895,7 @@
jle .L105
ALIGN_4
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm2, %xmm3
@@ -1984,13 +1984,13 @@
movhps %xmm1, 3 * SIZE(CO1)
addq $4 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
BRANCH
jle .L999
-
+
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
@@ -2003,7 +2003,7 @@
leaq (, %rax, SIZE), %rax
addq %rax, AO
addq %rax, BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
movsd -17 * SIZE(BO), %xmm2
@@ -2016,7 +2016,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2031,7 +2031,7 @@
jle .L115
ALIGN_4
-.L112:
+.L112:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
mulsd %xmm0, %xmm2
@@ -2100,7 +2100,7 @@
movlps %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 1 * SIZE(CO1)
- ALIGN_4
+ ALIGN_4
.L999:
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S b/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S
index 3b313b3..cb90b4c 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -329,14 +329,14 @@
movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addpd %xmm6, %xmm15 ;\
movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
-#endif
-
+#endif
+
#if defined(OS_LINUX) && defined(CORE_BARCELONA)
.align 32768
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -395,7 +395,7 @@
movsd %xmm0, 0 + ALPHA
movsd %xmm1, 8 + ALPHA
-
+
salq $ZBASE_SHIFT, LDC
#ifdef TRMMKERNEL
@@ -403,7 +403,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq N, J
sarq $2, J # j = (n >> 2)
@@ -415,17 +415,17 @@
leaq 16 * SIZE + BUFFER, BO
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_3
-
+
#define RPREFETCHSIZE (8 * 7 + 4)
#define WPREFETCHSIZE (8 * 8 + 4)
@@ -534,7 +534,7 @@
subq $1, %rax
jne .L04
ALIGN_3
-
+
.L10:
movq A, AO # aoffset = a
leaq (RPREFETCHSIZE + 0) * SIZE(B), BB
@@ -560,7 +560,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
@@ -589,7 +589,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -916,7 +916,7 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
movsd 0 * SIZE(CO2), %xmm0
movhpd 1 * SIZE(CO2), %xmm0
movsd 2 * SIZE(CO2), %xmm1
@@ -951,7 +951,7 @@
movhpd %xmm2, 5 * SIZE(CO2)
movsd %xmm3, 6 * SIZE(CO2)
movhpd %xmm3, 7 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhpd 1 * SIZE(CO1, LDC, 2), %xmm0
movsd 2 * SIZE(CO1, LDC, 2), %xmm1
@@ -986,7 +986,7 @@
movhpd %xmm2, 5 * SIZE(CO1, LDC, 2)
movsd %xmm3, 6 * SIZE(CO1, LDC, 2)
movhpd %xmm3, 7 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
movsd 2 * SIZE(CO2, LDC, 2), %xmm1
@@ -1027,7 +1027,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_3
+ ALIGN_3
.L20:
testq $3, M
@@ -1049,7 +1049,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1068,7 +1068,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1281,7 +1281,7 @@
movhpd %xmm0, 1 * SIZE(CO1, LDC, 2)
movsd %xmm1, 2 * SIZE(CO1, LDC, 2)
movhpd %xmm1, 3 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
movsd 2 * SIZE(CO2, LDC, 2), %xmm1
@@ -1302,7 +1302,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_3
+ ALIGN_3
.L30:
testq $1, M
@@ -1321,7 +1321,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1340,7 +1340,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1528,7 +1528,7 @@
movsd %xmm0, 0 * SIZE(CO1, LDC, 2)
movhpd %xmm0, 1 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm0
movhpd 1 * SIZE(CO2, LDC, 2), %xmm0
@@ -1538,8 +1538,8 @@
movsd %xmm0, 0 * SIZE(CO2, LDC, 2)
movhpd %xmm0, 1 * SIZE(CO2, LDC, 2)
- ALIGN_3
-
+ ALIGN_3
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1561,17 +1561,17 @@
.L41:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $2, %rax
jle .L43
ALIGN_3
-
+
.L42:
PREFETCH 56 * SIZE(B)
@@ -1629,7 +1629,7 @@
decq %rax
jne .L44
ALIGN_3
-
+
.L50:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1652,7 +1652,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1676,7 +1676,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1872,7 +1872,7 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
movsd 0 * SIZE(CO2), %xmm0
movhpd 1 * SIZE(CO2), %xmm0
movsd 2 * SIZE(CO2), %xmm1
@@ -1912,7 +1912,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L51
- ALIGN_3
+ ALIGN_3
.L60:
testq $2, M
@@ -1931,7 +1931,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1950,7 +1950,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2097,7 +2097,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_3
+ ALIGN_3
.L70:
testq $1, M
@@ -2116,7 +2116,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2135,7 +2135,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2262,8 +2262,8 @@
movsd %xmm0, 0 * SIZE(CO2)
movhpd %xmm0, 1 * SIZE(CO2)
- ALIGN_3
-
+ ALIGN_3
+
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2279,17 +2279,17 @@
.L81:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $3, %rax
jle .L83
ALIGN_3
-
+
.L82:
PREFETCH 56 * SIZE(B)
@@ -2344,7 +2344,7 @@
decq %rax
jne .L84
ALIGN_3
-
+
.L90:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2366,7 +2366,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2387,7 +2387,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2525,11 +2525,11 @@
movhpd %xmm2, 5 * SIZE(CO1)
movsd %xmm3, 6 * SIZE(CO1)
movhpd %xmm3, 7 * SIZE(CO1)
-
+
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L91
- ALIGN_3
+ ALIGN_3
.L100:
testq $2, M
@@ -2548,7 +2548,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2564,7 +2564,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2662,7 +2662,7 @@
movhpd %xmm1, 3 * SIZE(CO1)
addq $4 * SIZE, CO1
- ALIGN_3
+ ALIGN_3
.L110:
testq $1, M
@@ -2681,7 +2681,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2697,7 +2697,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2786,7 +2786,7 @@
movsd %xmm0, 0 * SIZE(CO1)
movhpd %xmm0, 1 * SIZE(CO1)
ALIGN_3
-
+
.L999:
movq %rbx, %rsp
diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S b/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S
index 73f5fce..ce46dbd 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -338,7 +338,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -377,19 +377,19 @@
movsd %xmm0, ALPHA_R
movsd %xmm1, ALPHA_I
-
+
salq $ZBASE_SHIFT, LDC
movq N, J
sarq $2, J # j = (n >> 2)
jle .L40
ALIGN_4
-
+
.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -418,7 +418,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -448,7 +448,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -465,7 +465,7 @@
NOBRANCH
je .L15
-.L1X:
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -946,7 +946,7 @@
movhpd %xmm10, 5 * SIZE(CO1)
movsd %xmm11, 6 * SIZE(CO1)
movhpd %xmm11, 7 * SIZE(CO1)
-
+
movsd 0 * SIZE(CO2), %xmm8
movhpd 1 * SIZE(CO2), %xmm8
movsd 2 * SIZE(CO2), %xmm9
@@ -1016,7 +1016,7 @@
movhpd %xmm10, 5 * SIZE(CO1, LDC, 2)
movsd %xmm11, 6 * SIZE(CO1, LDC, 2)
movhpd %xmm11, 7 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm8
movhpd 1 * SIZE(CO2, LDC, 2), %xmm8
movsd 2 * SIZE(CO2, LDC, 2), %xmm9
@@ -1077,7 +1077,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1093,7 +1093,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1308,7 +1308,7 @@
movhpd %xmm8, 1 * SIZE(CO1, LDC, 2)
movsd %xmm9, 2 * SIZE(CO1, LDC, 2)
movhpd %xmm9, 3 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm8
movhpd 1 * SIZE(CO2, LDC, 2), %xmm8
movsd 2 * SIZE(CO2, LDC, 2), %xmm9
@@ -1329,7 +1329,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1347,7 +1347,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1363,7 +1363,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1503,7 +1503,7 @@
movsd %xmm8, 0 * SIZE(CO1, LDC, 2)
movhpd %xmm8, 1 * SIZE(CO1, LDC, 2)
-
+
movsd 0 * SIZE(CO2, LDC, 2), %xmm8
movhpd 1 * SIZE(CO2, LDC, 2), %xmm8
@@ -1514,8 +1514,8 @@
movsd %xmm8, 0 * SIZE(CO2, LDC, 2)
movhpd %xmm8, 1 * SIZE(CO2, LDC, 2)
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1535,7 +1535,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1557,7 +1557,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1581,7 +1581,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1787,7 +1787,7 @@
movhpd %xmm10, 5 * SIZE(CO1)
movsd %xmm11, 6 * SIZE(CO1)
movhpd %xmm11, 7 * SIZE(CO1)
-
+
movsd 0 * SIZE(CO2), %xmm8
movhpd 1 * SIZE(CO2), %xmm8
movsd 2 * SIZE(CO2), %xmm9
@@ -1828,7 +1828,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1846,7 +1846,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1862,7 +1862,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2011,7 +2011,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L70:
testq $1, M
@@ -2029,7 +2029,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2045,7 +2045,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2144,8 +2144,8 @@
movsd %xmm8, 0 * SIZE(CO2)
movhpd %xmm8, 1 * SIZE(CO2)
- ALIGN_4
-
+ ALIGN_4
+
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2162,7 +2162,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1
movq A, AO
@@ -2183,7 +2183,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2205,7 +2205,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2342,11 +2342,11 @@
movhpd %xmm10, 5 * SIZE(CO1)
movsd %xmm11, 6 * SIZE(CO1)
movhpd %xmm11, 7 * SIZE(CO1)
-
+
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $2, M
@@ -2364,7 +2364,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2380,7 +2380,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2478,7 +2478,7 @@
movhpd %xmm9, 3 * SIZE(CO1)
addq $4 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L110:
testq $1, M
@@ -2496,7 +2496,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (B, %rax, 1), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2517,7 +2517,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2591,8 +2591,8 @@
movsd %xmm8, 0 * SIZE(CO1)
movhpd %xmm8, 1 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S b/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S
index 92be8fc..8da31d2 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S
+++ b/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -103,7 +103,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -151,7 +151,7 @@
movlps %xmm0, ALPHA_R
movlps %xmm0, ALPHA_I
-
+
subq $-32 * SIZE, A
subq $-32 * SIZE, B
@@ -200,7 +200,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
prefetcht0 -32 * SIZE(BB)
subq $-16 * SIZE, BB
@@ -236,7 +236,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -251,7 +251,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm12
@@ -608,7 +608,7 @@
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -627,7 +627,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -646,7 +646,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -661,7 +661,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -862,7 +862,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -881,7 +881,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -894,7 +894,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -909,7 +909,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -1023,8 +1023,8 @@
movhps %xmm2, (CO2, LDC)
movlps %xmm3, (CO2, LDC, 2)
movhps %xmm3, (CO2, %rax)
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $8, KK
@@ -1071,7 +1071,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -1093,7 +1093,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1108,7 +1108,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -1309,7 +1309,7 @@
decq I
BRANCH
jg .L41
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -1328,7 +1328,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1342,7 +1342,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1357,7 +1357,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
addps %xmm1, %xmm8
pshufd $0x50, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -1474,7 +1474,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1493,7 +1493,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -1505,7 +1505,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1520,7 +1520,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movaps -32 * SIZE(BO), %xmm2
@@ -1601,8 +1601,8 @@
movhps %xmm0, (CO1, LDC)
movlps %xmm1, (CO2)
movhps %xmm1, (CO2, LDC)
- ALIGN_4
-
+ ALIGN_4
+
.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $4, KK
@@ -1645,7 +1645,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -1662,7 +1662,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1677,7 +1677,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -1798,7 +1798,7 @@
decq I
BRANCH
jg .L71
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -1817,7 +1817,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1831,7 +1831,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1846,7 +1846,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
addps %xmm1, %xmm8
movsd -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -1931,7 +1931,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -1950,7 +1950,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm2, %xmm2
movsd -32 * SIZE(AO), %xmm0
@@ -1962,7 +1962,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1977,7 +1977,7 @@
jle .L95
ALIGN_3
-.L92:
+.L92:
pshufd $0x00, %xmm0, %xmm1
addps %xmm2, %xmm8
movsd -32 * SIZE(BO), %xmm2
@@ -2050,8 +2050,8 @@
movlps %xmm0, (CO1)
movhps %xmm0, (CO2)
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
@@ -2093,7 +2093,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -2106,7 +2106,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2121,7 +2121,7 @@
jle .L105
ALIGN_3
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm1, %xmm8
@@ -2208,7 +2208,7 @@
decq I
BRANCH
jg .L101
- ALIGN_4
+ ALIGN_4
.L110:
testq $2, M
@@ -2227,7 +2227,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2238,7 +2238,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2253,7 +2253,7 @@
jle .L115
ALIGN_3
-.L112:
+.L112:
addps %xmm1, %xmm8
movss -32 * SIZE(BO), %xmm1
unpcklps %xmm1, %xmm1
@@ -2329,7 +2329,7 @@
movhps %xmm0, 2 * SIZE(CO1)
addq $4 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L120:
testq $1, M
@@ -2348,7 +2348,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm2, %xmm2
movss -32 * SIZE(AO), %xmm0
@@ -2359,7 +2359,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2374,7 +2374,7 @@
jle .L125
ALIGN_3
-.L122:
+.L122:
addss %xmm2, %xmm8
movss -32 * SIZE(BO), %xmm2
mulss %xmm0, %xmm2
@@ -2441,8 +2441,8 @@
addps %xmm4, %xmm0
movlps %xmm0, (CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S b/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S
index 80c8524..3dbc0dd 100644
--- a/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S
+++ b/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,14 +49,14 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define CO2 %r12
#define BB %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -299,7 +299,7 @@
movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\
movaps %xmm0, %xmm2 ;\
addq $16 * SIZE, %rax
-
+
#define KERNEL_SUB1(xx) \
mulps %xmm1, %xmm0 ;\
mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\
@@ -407,7 +407,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -464,13 +464,13 @@
movss %xmm1, 4 + ALPHA
movss %xmm0, 8 + ALPHA
movss %xmm1, 12 + ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
subq $-32 * SIZE, A
@@ -485,16 +485,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -581,7 +581,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -606,7 +606,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
prefetch 0 * SIZE(BB)
prefetch 16 * SIZE(BB)
@@ -638,7 +638,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -866,7 +866,7 @@
movhps %xmm5, 10 * SIZE(CO2)
movlps %xmm13, 12 * SIZE(CO2)
movhps %xmm13, 14 * SIZE(CO2)
-
+
movups 0 * SIZE(CO1, LDC, 2), %xmm0
movups 4 * SIZE(CO1, LDC, 2), %xmm1
movups 8 * SIZE(CO1, LDC, 2), %xmm2
@@ -929,7 +929,7 @@
addq $16 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -946,7 +946,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -966,7 +966,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1196,7 +1196,7 @@
addq $8 * SIZE, CO1 # coffset += 4
addq $8 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -1213,7 +1213,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -1233,7 +1233,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1439,7 +1439,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1457,7 +1457,7 @@
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -1477,7 +1477,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1676,8 +1676,8 @@
addps %xmm8, %xmm4
movlps %xmm4, 0 * SIZE(CO2, LDC, 2)
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1694,16 +1694,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L53
ALIGN_4
-
+
.L52:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -1766,7 +1766,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1789,7 +1789,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -1814,7 +1814,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2039,7 +2039,7 @@
addq $16 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -2057,7 +2057,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2077,7 +2077,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2219,7 +2219,7 @@
addq $8 * SIZE, CO1 # coffset += 4
addq $8 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -2236,7 +2236,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -2256,7 +2256,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2395,7 +2395,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -2412,7 +2412,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -2432,7 +2432,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2566,8 +2566,8 @@
addps %xmm8, %xmm2
movlps %xmm2, 0 * SIZE(CO2)
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2584,16 +2584,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L103
ALIGN_4
-
+
.L102:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -2650,7 +2650,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2672,7 +2672,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2696,7 +2696,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2836,7 +2836,7 @@
addq $16 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -2853,7 +2853,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2871,7 +2871,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2968,7 +2968,7 @@
movhps %xmm0, 6 * SIZE(CO1)
addq $8 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -2985,7 +2985,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -3003,7 +3003,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3101,7 +3101,7 @@
movhps %xmm2, 2 * SIZE(CO1)
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -3118,7 +3118,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -3136,7 +3136,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3221,8 +3221,8 @@
addps %xmm8, %xmm2
movlps %xmm2, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq %rbx, %rsp
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_core2.S b/kernel/x86_64/zgemm3m_kernel_8x4_core2.S
index 2ddbb5c..0b97d85 100644
--- a/kernel/x86_64/zgemm3m_kernel_8x4_core2.S
+++ b/kernel/x86_64/zgemm3m_kernel_8x4_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -95,7 +95,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -156,7 +156,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq OLD_M, M
@@ -173,16 +173,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq 32 * SIZE + BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L05
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
movaps -32 * SIZE(B), %xmm3
@@ -261,7 +261,7 @@
subq $1, %rax
jne .L06
ALIGN_4
-
+
.L10:
movq B, BB
@@ -286,7 +286,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
movaps -32 * SIZE(AO), %xmm0
@@ -320,7 +320,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -334,7 +334,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
addps %xmm2, %xmm10
movaps -32 * SIZE(BO), %xmm2
addps %xmm3, %xmm14
@@ -588,7 +588,7 @@
movhps %xmm5, 10 * SIZE(CO2)
movlps %xmm13, 12 * SIZE(CO2)
movhps %xmm13, 14 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhps 2 * SIZE(CO1, LDC, 2), %xmm0
movsd 4 * SIZE(CO1, LDC, 2), %xmm1
@@ -654,12 +654,12 @@
movhps %xmm5, 10 * SIZE(CO2, LDC, 2)
movlps %xmm15, 12 * SIZE(CO2, LDC, 2)
movhps %xmm15, 14 * SIZE(CO2, LDC, 2)
-
+
addq $16 * SIZE, CO1
addq $16 * SIZE, CO2
subq $1, I
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -676,7 +676,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -688,7 +688,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -702,7 +702,7 @@
jle .L25
ALIGN_4
-.L21:
+.L21:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -846,7 +846,7 @@
movhps %xmm4, 2 * SIZE(CO2)
movlps %xmm9, 4 * SIZE(CO2)
movhps %xmm9, 6 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhps 2 * SIZE(CO1, LDC, 2), %xmm0
movsd 4 * SIZE(CO1, LDC, 2), %xmm1
@@ -886,7 +886,7 @@
addq $8 * SIZE, CO1
addq $8 * SIZE, CO2
ALIGN_4
-
+
.L30:
testq $2, M
jle .L40
@@ -902,7 +902,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -919,7 +919,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -933,7 +933,7 @@
jle .L35
ALIGN_4
-.L31:
+.L31:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movsd -32 * SIZE(AO), %xmm0
@@ -1061,7 +1061,7 @@
movlps %xmm4, 0 * SIZE(CO2)
movhps %xmm4, 2 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhps 2 * SIZE(CO1, LDC, 2), %xmm0
@@ -1102,7 +1102,7 @@
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1119,7 +1119,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1133,7 +1133,7 @@
jle .L45
ALIGN_4
-.L41:
+.L41:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movss -32 * SIZE(AO), %xmm0
@@ -1257,7 +1257,7 @@
addps %xmm0, %xmm4
movlps %xmm4, 0 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
pshufd $0x50, %xmm10, %xmm4
@@ -1293,11 +1293,11 @@
.L51:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $3, %rax
@@ -1305,7 +1305,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L52:
movaps -32 * SIZE(B), %xmm3
movaps -28 * SIZE(B), %xmm7
@@ -1361,7 +1361,7 @@
subq $1, %rax
jne .L54
ALIGN_4
-
+
.L55:
movq C, CO1
leaq (C, LDC, 1), CO2
@@ -1384,7 +1384,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1399,7 +1399,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1413,7 +1413,7 @@
jle .L65
ALIGN_4
-.L61:
+.L61:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -1597,12 +1597,12 @@
movhps %xmm5, 10 * SIZE(CO2)
movlps %xmm13, 12 * SIZE(CO2)
movhps %xmm13, 14 * SIZE(CO2)
-
+
addq $16 * SIZE, CO1
addq $16 * SIZE, CO2
subq $1, I
jg .L60
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -1619,7 +1619,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1631,7 +1631,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1645,7 +1645,7 @@
jle .L75
ALIGN_4
-.L71:
+.L71:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -1760,7 +1760,7 @@
addq $8 * SIZE, CO1
addq $8 * SIZE, CO2
ALIGN_4
-
+
.L80:
testq $2, M
jle .L90
@@ -1776,7 +1776,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1788,7 +1788,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1802,7 +1802,7 @@
jle .L85
ALIGN_4
-.L81:
+.L81:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movsd -32 * SIZE(AO), %xmm0
@@ -1919,7 +1919,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1931,7 +1931,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1945,7 +1945,7 @@
jle .L95
ALIGN_4
-.L91:
+.L91:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
@@ -2059,11 +2059,11 @@
.L101:
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq K, %rax
sarq $4, %rax
@@ -2071,7 +2071,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L102:
movss -32 * SIZE(B), %xmm0
movss -31 * SIZE(B), %xmm1
@@ -2125,7 +2125,7 @@
subq $1, %rax
jne .L104
ALIGN_4
-
+
.L105:
movq C, CO1
movq A, AO
@@ -2147,7 +2147,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2161,7 +2161,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2175,7 +2175,7 @@
jle .L115
ALIGN_4
-.L111:
+.L111:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -2294,7 +2294,7 @@
addq $16 * SIZE, CO1
subq $1, I
jg .L110
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -2311,7 +2311,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2323,7 +2323,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2337,7 +2337,7 @@
jle .L125
ALIGN_4
-.L121:
+.L121:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movaps -32 * SIZE(AO), %xmm0
@@ -2419,7 +2419,7 @@
addq $8 * SIZE, CO1
ALIGN_4
-
+
.L130:
testq $2, M
jle .L140
@@ -2435,7 +2435,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2447,7 +2447,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2461,7 +2461,7 @@
jle .L135
ALIGN_4
-.L131:
+.L131:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movsd -32 * SIZE(AO), %xmm0
@@ -2549,7 +2549,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2561,7 +2561,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2575,7 +2575,7 @@
jle .L145
ALIGN_4
-.L141:
+.L141:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movss -32 * SIZE(AO), %xmm0
diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S b/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S
index bf2d96e..1255c2c 100644
--- a/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S
+++ b/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -103,7 +103,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -150,7 +150,7 @@
movlps %xmm0, ALPHA_R
movlps %xmm0, ALPHA_I
-
+
subq $-32 * SIZE, A
subq $-32 * SIZE, B
@@ -199,7 +199,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 8), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorpd %xmm3, %xmm3
@@ -234,7 +234,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -249,7 +249,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH -32 * SIZE(PREA)
addps %xmm6, %xmm10
addps %xmm3, %xmm14
@@ -654,7 +654,7 @@
movhps %xmm5, 10 * SIZE(CO2)
movlps %xmm13, 12 * SIZE(CO2)
movhps %xmm13, 14 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhps 2 * SIZE(CO1, LDC, 2), %xmm0
movsd 4 * SIZE(CO1, LDC, 2), %xmm1
@@ -720,13 +720,13 @@
movhps %xmm5, 10 * SIZE(CO2, LDC, 2)
movlps %xmm15, 12 * SIZE(CO2, LDC, 2)
movhps %xmm15, 14 * SIZE(CO2, LDC, 2)
-
+
addq $16 * SIZE, CO1
addq $16 * SIZE, CO2
decq I
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -745,7 +745,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -765,7 +765,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -780,7 +780,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
addps %xmm6, %xmm10
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x39, %xmm2, %xmm7
@@ -939,7 +939,7 @@
movhps %xmm4, 2 * SIZE(CO2)
movlps %xmm9, 4 * SIZE(CO2)
movhps %xmm9, 6 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhps 2 * SIZE(CO1, LDC, 2), %xmm0
movsd 4 * SIZE(CO1, LDC, 2), %xmm1
@@ -978,7 +978,7 @@
addq $8 * SIZE, CO1
addq $8 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -997,7 +997,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -1016,7 +1016,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1031,7 +1031,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x44, %xmm0, %xmm1
@@ -1157,7 +1157,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1176,7 +1176,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1188,7 +1188,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1203,7 +1203,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm0, %xmm1
@@ -1338,7 +1338,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 8), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
prefetcht2 -32 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -1365,7 +1365,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1380,7 +1380,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1569,13 +1569,13 @@
movhps %xmm5, 10 * SIZE(CO2)
movlps %xmm11, 12 * SIZE(CO2)
movhps %xmm11, 14 * SIZE(CO2)
-
+
addq $16 * SIZE, CO1
addq $16 * SIZE, CO2
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $4, M
@@ -1594,7 +1594,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -1611,7 +1611,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1626,7 +1626,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1748,7 +1748,7 @@
addq $8 * SIZE, CO1
addq $8 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L70:
testq $2, M
@@ -1767,7 +1767,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -1780,7 +1780,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1795,7 +1795,7 @@
jle .L75
ALIGN_3
-.L72:
+.L72:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1883,7 +1883,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L80:
testq $1, M
@@ -1902,7 +1902,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1914,7 +1914,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1929,7 +1929,7 @@
jle .L85
ALIGN_3
-.L82:
+.L82:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm0, %xmm1
@@ -2048,7 +2048,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 8), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -2064,7 +2064,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2079,7 +2079,7 @@
jle .L95
ALIGN_3
-.L92:
+.L92:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm2, %xmm3
@@ -2198,7 +2198,7 @@
decq I
BRANCH
jg .L91
- ALIGN_4
+ ALIGN_4
.L100:
testq $4, M
@@ -2217,7 +2217,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -2229,7 +2229,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2244,7 +2244,7 @@
jle .L105
ALIGN_3
-.L102:
+.L102:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm2, %xmm3
@@ -2328,7 +2328,7 @@
movhps %xmm8, 6 * SIZE(CO1)
addq $8 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L110:
testq $2, M
@@ -2347,7 +2347,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
xorps %xmm3, %xmm3
@@ -2360,7 +2360,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2375,7 +2375,7 @@
jle .L115
ALIGN_3
-.L112:
+.L112:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x00, %xmm2, %xmm3
@@ -2449,7 +2449,7 @@
movhps %xmm4, 2 * SIZE(CO1)
addq $4 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L120:
testq $1, M
@@ -2468,7 +2468,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -2480,7 +2480,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2495,7 +2495,7 @@
jle .L125
ALIGN_3
-.L122:
+.L122:
mulss %xmm0, %xmm2
movss -31 * SIZE(AO), %xmm0
addss %xmm2, %xmm8
diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_sse.S b/kernel/x86_64/zgemm3m_kernel_8x4_sse.S
index 6bd9148..a3c6916 100644
--- a/kernel/x86_64/zgemm3m_kernel_8x4_sse.S
+++ b/kernel/x86_64/zgemm3m_kernel_8x4_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
@@ -56,7 +56,7 @@
#define CO1 %r15
#define CO2 %rbp
#define BB %r12
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -276,7 +276,7 @@
addps %xmm5, %xmm14 ;\
movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addps %xmm6, %xmm15 ;\
- movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
+ movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#define KERNEL5(xx) \
mulps %xmm0, %xmm1 ;\
@@ -342,7 +342,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -388,7 +388,7 @@
#endif
EMMS
-
+
movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
andq $-1024, %rsp # align stack
@@ -402,13 +402,13 @@
movss %xmm1, 4 + ALPHA
movss %xmm0, 8 + ALPHA
movss %xmm1, 12 + ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
subq $-32 * SIZE, A
@@ -423,18 +423,18 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L03
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
@@ -457,7 +457,7 @@
punpckldq %mm5, %mm5
punpckldq %mm6, %mm6
punpckldq %mm7, %mm7
-
+
movq %mm0, 0 * SIZE(BO)
movq %mm0, 2 * SIZE(BO)
movq %mm1, 4 * SIZE(BO)
@@ -517,7 +517,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -545,7 +545,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -32 * SIZE(BO), %xmm1
@@ -574,7 +574,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -925,7 +925,7 @@
movhps %xmm5, 10 * SIZE(CO2)
movlps %xmm13, 12 * SIZE(CO2)
movhps %xmm13, 14 * SIZE(CO2)
-
+
movsd 0 * SIZE(CO1, LDC, 2), %xmm0
movhps 2 * SIZE(CO1, LDC, 2), %xmm0
movsd 4 * SIZE(CO1, LDC, 2), %xmm1
@@ -996,7 +996,7 @@
addq $16 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -1013,7 +1013,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -1033,7 +1033,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1275,7 +1275,7 @@
addq $8 * SIZE, CO1 # coffset += 4
addq $8 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -1292,7 +1292,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -1312,7 +1312,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1524,7 +1524,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1542,7 +1542,7 @@
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -1562,7 +1562,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1763,8 +1763,8 @@
addps %xmm8, %xmm4
movlps %xmm4, 0 * SIZE(CO2, LDC, 2)
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1781,16 +1781,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L53
ALIGN_4
-
+
.L52:
#if defined(PENTIUM4) || defined(GENERIC)
movss 0 * SIZE(B), %xmm0
@@ -1846,7 +1846,7 @@
punpckldq %mm5, %mm5
punpckldq %mm6, %mm6
punpckldq %mm7, %mm7
-
+
movq %mm0, 0 * SIZE(BO)
movq %mm0, 2 * SIZE(BO)
movq %mm1, 4 * SIZE(BO)
@@ -1909,7 +1909,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1932,7 +1932,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -1957,7 +1957,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2197,7 +2197,7 @@
addq $16 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -2215,7 +2215,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2235,7 +2235,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2384,7 +2384,7 @@
addq $8 * SIZE, CO1 # coffset += 4
addq $8 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -2401,7 +2401,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -2421,7 +2421,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2562,7 +2562,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -2579,7 +2579,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -2599,7 +2599,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2733,8 +2733,8 @@
addps %xmm8, %xmm2
movlps %xmm2, 0 * SIZE(CO2)
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2751,16 +2751,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L103
ALIGN_4
-
+
.L102:
#if defined(PENTIUM4) || defined(GENERIC)
@@ -2817,7 +2817,7 @@
punpckldq %mm5, %mm5
punpckldq %mm6, %mm6
punpckldq %mm7, %mm7
-
+
movq %mm0, 0 * SIZE(BO)
movq %mm0, 2 * SIZE(BO)
movq %mm1, 4 * SIZE(BO)
@@ -2869,7 +2869,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2891,7 +2891,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -2915,7 +2915,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3068,7 +3068,7 @@
addq $16 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -3085,7 +3085,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -16 * SIZE(AO), %xmm10
@@ -3103,7 +3103,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3207,7 +3207,7 @@
movhps %xmm0, 6 * SIZE(CO1)
addq $8 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -3224,7 +3224,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm8
movaps -24 * SIZE(AO), %xmm10
@@ -3242,7 +3242,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3343,7 +3343,7 @@
movhps %xmm2, 2 * SIZE(CO1)
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -3360,7 +3360,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss -32 * SIZE(AO), %xmm8
movss -28 * SIZE(AO), %xmm10
@@ -3378,7 +3378,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3465,8 +3465,8 @@
addps %xmm8, %xmm2
movlps %xmm2, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq %rbx, %rsp
EMMS
diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S b/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S
index 67537a7..adf0a53 100644
--- a/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S
+++ b/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -54,7 +54,7 @@
#define CO1 %r14
#define CO2 %r15
#define BB %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -332,7 +332,7 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -382,18 +382,18 @@
andq $-1024, %rsp # align stack
STACK_TOUCHING
-
+
movss %xmm0, 0 + ALPHA
movss %xmm1, 4 + ALPHA
movss %xmm0, 8 + ALPHA
movss %xmm1, 12 + ALPHA
-
+
#ifdef TRMMKERNEL
movsd %xmm4, OFFSET
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
salq $ZBASE_SHIFT, LDC
@@ -406,16 +406,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movddup 0 * SIZE(B), %xmm0
movddup 2 * SIZE(B), %xmm1
@@ -464,7 +464,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -493,7 +493,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -524,7 +524,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -540,7 +540,7 @@
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1 (64 * 0)
KERNEL2 (64 * 0)
KERNEL3 (64 * 0)
@@ -927,7 +927,7 @@
addq $16 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $4, M
@@ -944,7 +944,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -964,7 +964,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1203,7 +1203,7 @@
addq $8 * SIZE, CO1 # coffset += 4
addq $8 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L30:
testq $2, M
@@ -1220,7 +1220,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
movddup 8 * SIZE(AO), %xmm10
@@ -1237,7 +1237,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1405,7 +1405,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1422,7 +1422,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -1437,7 +1437,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1555,8 +1555,8 @@
movhps %xmm12, 0 * SIZE(CO2)
movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
movhps %xmm0, 0 * SIZE(CO2, LDC, 2)
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
@@ -1573,16 +1573,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L53
ALIGN_4
-
+
.L52:
movddup 0 * SIZE(B), %xmm0
movddup 2 * SIZE(B), %xmm1
@@ -1628,7 +1628,7 @@
decq %rax
jne .L54
ALIGN_4
-
+
.L60:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -1651,7 +1651,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1673,7 +1673,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1914,7 +1914,7 @@
addq $16 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L61
- ALIGN_4
+ ALIGN_4
.L70:
testq $4, M
@@ -1931,7 +1931,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movsldup 0 * SIZE(BO), %xmm9
@@ -1948,7 +1948,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2098,7 +2098,7 @@
addq $8 * SIZE, CO1 # coffset += 4
addq $8 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L80:
testq $2, M
@@ -2115,7 +2115,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
movddup 8 * SIZE(AO), %xmm10
@@ -2130,7 +2130,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2242,7 +2242,7 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L90:
testq $1, M
@@ -2259,7 +2259,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -2274,7 +2274,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2376,8 +2376,8 @@
movlps %xmm12, 0 * SIZE(CO1)
movhps %xmm12, 0 * SIZE(CO2)
- ALIGN_4
-
+ ALIGN_4
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -2393,16 +2393,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L103
ALIGN_4
-
+
.L102:
movss 0 * SIZE(B), %xmm0
@@ -2455,7 +2455,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L110:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -2477,7 +2477,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -2498,7 +2498,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2654,7 +2654,7 @@
addq $16 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L111
- ALIGN_4
+ ALIGN_4
.L120:
testq $4, M
@@ -2672,7 +2672,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -2693,7 +2693,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2795,7 +2795,7 @@
movhps %xmm0, 6 * SIZE(CO1)
addq $8 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L130:
testq $2, M
@@ -2812,7 +2812,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 0 * SIZE(BO), %xmm9
@@ -2829,7 +2829,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2923,7 +2923,7 @@
movhps %xmm12, 2 * SIZE(CO1)
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L140:
testq $1, M
@@ -2940,7 +2940,7 @@
leaq (, %rax, 4), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movss 0 * SIZE(AO), %xmm8
movss 4 * SIZE(AO), %xmm10
@@ -2957,7 +2957,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -3043,8 +3043,8 @@
addps %xmm8, %xmm12
movlps %xmm12, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq %rbx, %rsp
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/zgemm_beta.S b/kernel/x86_64/zgemm_beta.S
index ffc775b..1612d92 100644
--- a/kernel/x86_64/zgemm_beta.S
+++ b/kernel/x86_64/zgemm_beta.S
@@ -71,7 +71,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
subq $STACKSIZE, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S
index 4ddfc48..0d6acf3 100644
--- a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S
+++ b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %rbp
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -114,7 +114,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -160,7 +160,7 @@
movlps %xmm0, ALPHA_R
movlps %xmm1, ALPHA_I
-
+
subq $-16 * SIZE, A
subq $-16 * SIZE, B
@@ -174,7 +174,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
testq M, M
@@ -219,7 +219,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
PADDING
xorps %xmm1, %xmm1
@@ -248,7 +248,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -263,7 +263,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm12
@@ -635,7 +635,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -655,7 +655,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -670,7 +670,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -896,7 +896,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -913,7 +913,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -928,7 +928,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -1062,7 +1062,7 @@
decq I
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L999:
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/zgemm_kernel_2x1_atom.S b/kernel/x86_64/zgemm_kernel_2x1_atom.S
index be42e03..d9f320a 100644
--- a/kernel/x86_64/zgemm_kernel_2x1_atom.S
+++ b/kernel/x86_64/zgemm_kernel_2x1_atom.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -54,7 +54,7 @@
#define BO %r14
#define CO1 %r15
#define BB %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 128
@@ -113,9 +113,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -165,7 +165,7 @@
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
salq $ZBASE_SHIFT, LDC
@@ -179,7 +179,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1
addq LDC, C
@@ -206,7 +206,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -237,7 +237,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -261,7 +261,7 @@
ADDSD3 %xmm7, %xmm14
movsd 3 * SIZE(AO), %xmm7
mulsd %xmm3, %xmm2
-
+
ADDSD4 %xmm6, %xmm15
PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
movaps %xmm4, %xmm6
@@ -514,7 +514,7 @@
addq $4 * SIZE, CO1
decq I
jg .L10
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -530,7 +530,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
@@ -553,7 +553,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -729,7 +729,7 @@
addq $1, KK
#endif
ALIGN_4
-
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $1, KK
@@ -739,7 +739,7 @@
decq J # j --
jg .L01
ALIGN_4
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/zgemm_kernel_2x2_barcelona.S b/kernel/x86_64/zgemm_kernel_2x2_barcelona.S
index 31fad2b..70e8f60 100644
--- a/kernel/x86_64/zgemm_kernel_2x2_barcelona.S
+++ b/kernel/x86_64/zgemm_kernel_2x2_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -56,7 +56,7 @@
#define CO2 %rbx
#define BB %r12
#define J %r15
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 96
@@ -89,7 +89,7 @@
#define movlpd movsd
#define movapd movups
#define movupd movups
-
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1 addpd
#define ADD2 addpd
@@ -420,9 +420,9 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -475,14 +475,14 @@
movlpd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
subq $-16 * SIZE, A
subq $-16 * SIZE, B
salq $ZBASE_SHIFT, LDC
-
+
movq N, J
sarq $1, J
jle .L100
@@ -492,7 +492,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -519,7 +519,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
movddup -16 * SIZE(BO), %xmm1
@@ -546,7 +546,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -808,7 +808,7 @@
addq $4 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -824,7 +824,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -841,7 +841,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1013,7 +1013,7 @@
addq $1, KK
#endif
ALIGN_4
-
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -1033,7 +1033,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -1054,7 +1054,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup -16 * SIZE(BO), %xmm1
movddup -15 * SIZE(BO), %xmm5
@@ -1073,7 +1073,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1244,7 +1244,7 @@
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -1260,7 +1260,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1276,7 +1276,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1393,7 +1393,7 @@
movlpd %xmm8, (CO1)
movhpd %xmm8, 1 * SIZE(CO1)
ALIGN_4
-
+
.L999:
movq (%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
index e154d30..94e2f61 100644
--- a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
+++ b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
@@ -79,8 +79,7 @@
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -91,7 +90,6 @@
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
-#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
@@ -414,16 +412,16 @@
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -433,14 +431,15 @@
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
- movsd OLD_OFFSET, %xmm12
+ vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
- movsd STACKSIZE + 16(%rsp), %xmm12
+ vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
@@ -1374,6 +1373,8 @@
.L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -1385,16 +1386,16 @@
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_2x2_core2.S b/kernel/x86_64/zgemm_kernel_2x2_core2.S
index 799c151..b74e2fe 100644
--- a/kernel/x86_64/zgemm_kernel_2x2_core2.S
+++ b/kernel/x86_64/zgemm_kernel_2x2_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -109,7 +109,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
@@ -175,7 +175,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
salq $ZBASE_SHIFT, LDC
@@ -190,11 +190,11 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq 16 * SIZE + BUFFER, BO
-
+
movapd -16 * SIZE(B), %xmm0
movapd -8 * SIZE(B), %xmm4
@@ -202,7 +202,7 @@
sarq $2, %rax
jle .L03
ALIGN_3
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
prefetcht0 (PREFETCH_R + 8) * SIZE(B)
@@ -292,7 +292,7 @@
decq %rax
jne .L04
ALIGN_3
-
+
.L05:
leaq (PREFETCH_R + 0) * SIZE(B), BB
@@ -317,7 +317,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -351,7 +351,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -366,7 +366,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PADDING;
ADD1 %xmm2, %xmm10
movaps -15 * SIZE(BO), %xmm2
@@ -619,7 +619,7 @@
addsubpd %xmm11, %xmm10
addsubpd %xmm13, %xmm12
addsubpd %xmm15, %xmm14
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm0, %xmm8
addpd %xmm1, %xmm10
@@ -654,7 +654,7 @@
addq $4 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -672,7 +672,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -684,7 +684,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -844,7 +844,7 @@
addsubpd %xmm9, %xmm8
addsubpd %xmm11, %xmm10
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm0, %xmm8
addpd %xmm1, %xmm10
@@ -868,7 +868,7 @@
addq $1, KK
#endif
ALIGN_4
-
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -886,7 +886,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
@@ -895,7 +895,7 @@
sarq $2, %rax
jle .L103
ALIGN_4
-
+
.L102:
movddup -16 * SIZE(B), %xmm8
movddup -15 * SIZE(B), %xmm9
@@ -940,7 +940,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L105:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -962,7 +962,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -975,7 +975,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1145,7 +1145,7 @@
addsubpd %xmm9, %xmm8
addsubpd %xmm13, %xmm12
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm0, %xmm8
addpd %xmm2, %xmm12
@@ -1172,7 +1172,7 @@
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -1191,7 +1191,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1203,7 +1203,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1321,10 +1321,10 @@
movsd %xmm8, 0 * SIZE(CO1)
movhpd %xmm8, 1 * SIZE(CO1)
ALIGN_4
-
+
.L999:
movq %r15, %rsp
-
+
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
diff --git a/kernel/x86_64/zgemm_kernel_2x2_penryn.S b/kernel/x86_64/zgemm_kernel_2x2_penryn.S
index 751110f..24e1afd 100644
--- a/kernel/x86_64/zgemm_kernel_2x2_penryn.S
+++ b/kernel/x86_64/zgemm_kernel_2x2_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -137,7 +137,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -183,7 +183,7 @@
movlps %xmm0, ALPHA_R
movlps %xmm1, ALPHA_I
-
+
subq $-16 * SIZE, A
subq $-17 * SIZE, B
@@ -197,7 +197,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
@@ -240,7 +240,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
xorpd %xmm3, %xmm3
@@ -270,7 +270,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -285,7 +285,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm3, %xmm12
movaps -15 * SIZE(BO), %xmm3
@@ -580,7 +580,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -600,7 +600,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -17 * SIZE(BO), %xmm2
@@ -619,7 +619,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -634,7 +634,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
ADD1 %xmm3, %xmm12
movaps -15 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -809,7 +809,7 @@
addq $2 * SIZE, CO1 # coffset += 4
addq $2 * SIZE, CO2 # coffset += 4
- ALIGN_4
+ ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -837,7 +837,7 @@
movq OFFSET, %rax
movq %rax, KK
#endif
-
+
movq M, I
sarq $1, I # i = (m >> 2)
NOBRANCH
@@ -857,7 +857,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -874,7 +874,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -889,7 +889,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
mulpd %xmm0, %xmm2
@@ -1075,7 +1075,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1095,7 +1095,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
xorps %xmm8, %xmm8
@@ -1109,7 +1109,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1124,7 +1124,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -1257,7 +1257,7 @@
addq $2 * SIZE, CO1
addq $2 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S
index 9f1392d..848b6f2 100644
--- a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S
+++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************
*
-* 2013/10/30 Saar
+* 2014/06/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
@@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
-#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
@@ -439,16 +437,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -458,14 +456,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
- movsd OLD_OFFSET, %xmm12
+ vmovsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
- movsd STACKSIZE + 16(%rsp), %xmm12
+ vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
@@ -1399,6 +1398,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -1410,16 +1411,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_2x2_sse2.S b/kernel/x86_64/zgemm_kernel_2x2_sse2.S
index 4b83eee..ae7bb8f 100644
--- a/kernel/x86_64/zgemm_kernel_2x2_sse2.S
+++ b/kernel/x86_64/zgemm_kernel_2x2_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -334,9 +334,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -385,7 +385,7 @@
andq $-4096, %rsp # align stack
STACK_TOUCHING
-
+
movq OLD_M, M
movq OLD_N, N
@@ -395,7 +395,7 @@
movlpd %xmm0, 0 + ALPHA_R
movlpd %xmm0, 8 + ALPHA_R
-
+
movlpd %xmm1, 8 + ALPHA_I
xorpd %xmm7, %xmm1
movlpd %xmm1, 0 + ALPHA_I
@@ -408,13 +408,13 @@
movlpd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
subq $-16 * SIZE, A
salq $ZBASE_SHIFT, LDC
-
+
movq N, J
sarq $1, J # j = (n >> 2)
jle .L100
@@ -427,7 +427,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
leaq 16 * SIZE + BUFFER, BO
@@ -435,7 +435,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
@@ -539,7 +539,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L05:
movq A, AO # aoffset = a
@@ -562,7 +562,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
@@ -590,7 +590,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -928,7 +928,7 @@
pshufd $0x4e, %xmm10, %xmm11
pshufd $0x4e, %xmm12, %xmm13
pshufd $0x4e, %xmm14, %xmm15
-
+
mulpd %xmm6, %xmm8
mulpd %xmm7, %xmm9
mulpd %xmm6, %xmm10
@@ -978,7 +978,7 @@
addq $4 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -995,7 +995,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1011,7 +1011,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1023,7 +1023,7 @@
#endif
sarq $3, %rax
je .L44
- ALIGN_4
+ ALIGN_4
.L41:
mulpd %xmm0, %xmm1
@@ -1129,7 +1129,7 @@
addq $64 * SIZE, BO
decq %rax
jne .L41
- ALIGN_4
+ ALIGN_4
.L44:
#ifndef TRMMKERNEL
@@ -1192,7 +1192,7 @@
addq $ 8 * SIZE, AO
addq $32 * SIZE, BO
- ALIGN_4
+ ALIGN_4
.L45:
#ifndef TRMMKERNEL
@@ -1206,7 +1206,7 @@
andq $3, %rax # if (k & 1)
BRANCH
jle .L47
- ALIGN_4
+ ALIGN_4
.L46:
mulpd %xmm0, %xmm1
@@ -1260,7 +1260,7 @@
pshufd $0x4e, %xmm8, %xmm9
pshufd $0x4e, %xmm10, %xmm11
-
+
mulpd %xmm6, %xmm8
mulpd %xmm7, %xmm9
mulpd %xmm6, %xmm10
@@ -1292,7 +1292,7 @@
addq $1, KK
#endif
ALIGN_4
-
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $2, KK
@@ -1310,7 +1310,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
@@ -1319,7 +1319,7 @@
sarq $2, %rax
jle .L103
ALIGN_4
-
+
.L102:
movlpd 0 * SIZE(B), %xmm8
movlpd 1 * SIZE(B), %xmm9
@@ -1374,7 +1374,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L105:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -1396,7 +1396,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1413,7 +1413,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1548,7 +1548,7 @@
pshufd $0x4e, %xmm8, %xmm9
pshufd $0x4e, %xmm12, %xmm13
-
+
mulpd %xmm6, %xmm8
mulpd %xmm7, %xmm9
mulpd %xmm6, %xmm12
@@ -1583,7 +1583,7 @@
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -1600,7 +1600,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movapd -16 * SIZE(AO), %xmm0
movapd -16 * SIZE(BO), %xmm1
@@ -1617,7 +1617,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1783,7 +1783,7 @@
#endif
pshufd $0x4e, %xmm8, %xmm9
-
+
mulpd %xmm6, %xmm8
mulpd %xmm7, %xmm9
@@ -1796,11 +1796,11 @@
movlpd %xmm8, 0 * SIZE(CO1)
movhpd %xmm8, 1 * SIZE(CO1)
ALIGN_4
-
+
.L999:
movq %rbx, %rsp
EMMS
-
+
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
diff --git a/kernel/x86_64/zgemm_kernel_2x2_sse3.S b/kernel/x86_64/zgemm_kernel_2x2_sse3.S
index afb0924..b78f1a9 100644
--- a/kernel/x86_64/zgemm_kernel_2x2_sse3.S
+++ b/kernel/x86_64/zgemm_kernel_2x2_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -55,7 +55,7 @@
#define CO1 %r15
#define CO2 %rbx
#define BB %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 128
@@ -351,9 +351,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -403,7 +403,7 @@
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
salq $ZBASE_SHIFT, LDC
@@ -416,7 +416,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -442,7 +442,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -473,7 +473,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -487,8 +487,8 @@
andq $-8, %rax
salq $4, %rax
je .L12
-
-.L1X:
+
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -757,7 +757,7 @@
addsubpd %xmm3, %xmm2
addsubpd %xmm5, %xmm4
addsubpd %xmm7, %xmm6
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm8, %xmm0
addpd %xmm9, %xmm2
@@ -792,7 +792,7 @@
addq $4 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -809,7 +809,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 4), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -825,7 +825,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1022,7 +1022,7 @@
addsubpd %xmm1, %xmm0
addsubpd %xmm3, %xmm2
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm8, %xmm0
addpd %xmm9, %xmm2
@@ -1046,7 +1046,7 @@
addq $1, KK
#endif
ALIGN_4
-
+
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
@@ -1065,7 +1065,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -1086,7 +1086,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 4), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1104,7 +1104,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1311,7 +1311,7 @@
addsubpd %xmm1, %xmm0
addsubpd %xmm5, %xmm4
-
+
#if! defined(TRMMKERNEL) && !defined(BETAZERO)
addpd %xmm8, %xmm0
addpd %xmm10, %xmm4
@@ -1338,7 +1338,7 @@
addq $4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -1356,7 +1356,7 @@
leaq (, %rax, SIZE), %rax
leaq (AO, %rax, 2), AO
leaq (B, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1372,7 +1372,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1509,7 +1509,7 @@
movsd %xmm0, 0 * SIZE(CO1)
movhpd %xmm0, 1 * SIZE(CO1)
ALIGN_4
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/zgemm_kernel_2x4_nehalem.S b/kernel/x86_64/zgemm_kernel_2x4_nehalem.S
index 6a16b7e..ce60123 100644
--- a/kernel/x86_64/zgemm_kernel_2x4_nehalem.S
+++ b/kernel/x86_64/zgemm_kernel_2x4_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %rbp
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -114,7 +114,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -163,7 +163,7 @@
movlps %xmm0, ALPHA_R
movlps %xmm1, ALPHA_I
-
+
subq $-32 * SIZE, A
subq $-32 * SIZE, B
@@ -177,7 +177,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
@@ -210,7 +210,7 @@
.L11:
prefetcht2 -32 * SIZE(BB)
subq $-16 * SIZE, BB
-
+
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
@@ -223,7 +223,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm1, %xmm1
xorps %xmm2, %xmm2
@@ -251,7 +251,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -266,7 +266,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm12
@@ -547,7 +547,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -566,7 +566,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -585,7 +585,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -600,7 +600,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -783,8 +783,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $4, KK
@@ -831,7 +831,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -852,7 +852,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -867,7 +867,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -1048,7 +1048,7 @@
decq I # i --
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1067,7 +1067,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1086,7 +1086,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1101,7 +1101,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -1230,8 +1230,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
@@ -1273,7 +1273,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -1293,7 +1293,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1308,7 +1308,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -1436,7 +1436,7 @@
decq I # i --
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1455,7 +1455,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1469,7 +1469,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1484,7 +1484,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -1597,8 +1597,8 @@
#endif
movsd %xmm8, (CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/zgemm_kernel_4x2_barcelona.S b/kernel/x86_64/zgemm_kernel_4x2_barcelona.S
index c59a50d..06d0bbe 100644
--- a/kernel/x86_64/zgemm_kernel_4x2_barcelona.S
+++ b/kernel/x86_64/zgemm_kernel_4x2_barcelona.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -55,7 +55,7 @@
#define CO1 %r15
#define CO2 %rbp
#define BB %r12
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -301,7 +301,7 @@
movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\
movaps %xmm0, %xmm2 ;\
addq $16 * SIZE, %rax
-
+
#define KERNEL_SUB1(xx) \
mulps %xmm1, %xmm0 ;\
mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\
@@ -409,9 +409,9 @@
#endif
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -469,7 +469,7 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, 0 + ALPHA_R
-
+
movss %xmm1, 4 + ALPHA_I
movss %xmm1, 12 + ALPHA_I
xorps %xmm7, %xmm1
@@ -496,7 +496,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
salq $ZBASE_SHIFT, LDC
@@ -509,19 +509,19 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
movaps POSINV, %xmm15
-
+
movq K, %rax
sarq $2, %rax
jle .L03
addq %rax, %rax
ALIGN_4
-
+
.L02:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -543,7 +543,7 @@
pshufd $0xff, %xmm7, %xmm7
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm15, %xmm1
xorps %xmm15, %xmm3
xorps %xmm15, %xmm5
@@ -587,7 +587,7 @@
pshufd $0xff, %xmm3, %xmm3
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm15, %xmm1
xorps %xmm15, %xmm3
#else
@@ -605,7 +605,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -630,7 +630,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
prefetch -20 * SIZE(BB)
prefetch 28 * SIZE(BB)
@@ -660,7 +660,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -921,7 +921,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -938,7 +938,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -16 * SIZE(AO), %xmm2
@@ -960,7 +960,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1182,8 +1182,8 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
-
+ ALIGN_4
+
.L30:
testq $1, M
je .L39
@@ -1199,7 +1199,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -24 * SIZE(AO), %xmm2
@@ -1219,7 +1219,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1440,7 +1440,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1461,17 +1461,17 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
movaps POSINV, %xmm15
-
+
movq K, %rax
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
prefetch (RPREFETCHSIZE + 0) * SIZE(B)
@@ -1493,7 +1493,7 @@
pshufd $0xff, %xmm7, %xmm7
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm15, %xmm1
xorps %xmm15, %xmm3
xorps %xmm15, %xmm5
@@ -1535,7 +1535,7 @@
pshufd $0x55, %xmm3, %xmm1
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm15, %xmm1
#else
xorps %xmm15, %xmm0
@@ -1549,7 +1549,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -1572,7 +1572,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1599,7 +1599,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1825,7 +1825,7 @@
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1842,7 +1842,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1861,7 +1861,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2014,8 +2014,8 @@
#endif
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
-
+ ALIGN_4
+
.L70:
testq $1, M
je .L999
@@ -2031,7 +2031,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2050,7 +2050,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2194,7 +2194,7 @@
addps %xmm0, %xmm8
#endif
movsd %xmm8, 0 * SIZE(CO1)
- ALIGN_4
+ ALIGN_4
.L999:
movq %rbx, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_4x2_core2.S b/kernel/x86_64/zgemm_kernel_4x2_core2.S
index 1b5d9a0..bb4584a 100644
--- a/kernel/x86_64/zgemm_kernel_4x2_core2.S
+++ b/kernel/x86_64/zgemm_kernel_4x2_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -100,9 +100,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -156,7 +156,7 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, 0 + ALPHA_R
-
+
movss %xmm1, 4 + ALPHA_I
movss %xmm1, 12 + ALPHA_I
xorps %xmm7, %xmm1
@@ -171,7 +171,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
movq OLD_M, M
@@ -187,18 +187,18 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq 32 * SIZE + BUFFER, BO
-
+
movaps -32 * SIZE(B), %xmm3
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
movaps -28 * SIZE(B), %xmm7
@@ -279,7 +279,7 @@
subq $1, %rax
jne .L04
ALIGN_4
-
+
.L10:
leaq (PREFETCH_R + 0) * SIZE(B), BB
@@ -304,7 +304,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -28 * SIZE(AO), %xmm1
@@ -337,7 +337,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -351,7 +351,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
addps %xmm2, %xmm10
movaps -32 * SIZE(BO), %xmm2
addps %xmm3, %xmm14
@@ -417,7 +417,7 @@
addps %xmm2, %xmm10
movaps 0 * SIZE(BO), %xmm2
addps %xmm3, %xmm14
- PADDING;
+ PADDING;
movaps %xmm6, %xmm3
PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
mulps %xmm0, %xmm6
@@ -647,7 +647,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -664,7 +664,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -676,7 +676,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -867,8 +867,8 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
-
+ ALIGN_4
+
.L30:
testq $1, M
je .L39
@@ -884,7 +884,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -896,7 +896,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1080,7 +1080,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1101,16 +1101,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
movss -32 * SIZE(B), %xmm8
movss -31 * SIZE(B), %xmm9
@@ -1168,7 +1168,7 @@
subq $1, %rax
jne .L44
ALIGN_4
-
+
.L50:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -1190,7 +1190,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
prefetcht0 3 * SIZE(CO1)
@@ -1203,7 +1203,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1405,7 +1405,7 @@
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1422,7 +1422,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1434,7 +1434,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1566,8 +1566,8 @@
addq $2, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
-
+ ALIGN_4
+
.L70:
testq $1, M
je .L999
@@ -1583,7 +1583,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1595,7 +1595,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1711,7 +1711,7 @@
#endif
movsd %xmm8, 0 * SIZE(CO1)
- ALIGN_4
+ ALIGN_4
.L999:
movq %r15, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S
index 515939d..a71fff7 100644
--- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S
+++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/********************************************************************************
-* 2013/11/13 Saar
+* 2014/06/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
@@ -92,8 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#define L_BUFFER_SIZE 512*8*4
-#define LB2_OFFSET 512*8*2
+#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -104,7 +103,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
-#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
@@ -695,16 +693,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
- movups %xmm6, 64(%rsp)
- movups %xmm7, 80(%rsp)
- movups %xmm8, 96(%rsp)
- movups %xmm9, 112(%rsp)
- movups %xmm10, 128(%rsp)
- movups %xmm11, 144(%rsp)
- movups %xmm12, 160(%rsp)
- movups %xmm13, 176(%rsp)
- movups %xmm14, 192(%rsp)
- movups %xmm15, 208(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
@@ -717,6 +715,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
+ vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
@@ -1783,6 +1782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999:
+ vzeroupper
+
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
@@ -1794,16 +1795,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
- movups 64(%rsp), %xmm6
- movups 80(%rsp), %xmm7
- movups 96(%rsp), %xmm8
- movups 112(%rsp), %xmm9
- movups 128(%rsp), %xmm10
- movups 144(%rsp), %xmm11
- movups 160(%rsp), %xmm12
- movups 176(%rsp), %xmm13
- movups 192(%rsp), %xmm14
- movups 208(%rsp), %xmm15
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_4x2_penryn.S b/kernel/x86_64/zgemm_kernel_4x2_penryn.S
index 241148d..9aa852a 100644
--- a/kernel/x86_64/zgemm_kernel_4x2_penryn.S
+++ b/kernel/x86_64/zgemm_kernel_4x2_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -114,7 +114,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -164,7 +164,7 @@
movlps %xmm0, ALPHA_R
movlps %xmm1, ALPHA_I
-
+
subq $-32 * SIZE, A
subq $-32 * SIZE, B
@@ -178,7 +178,7 @@
movq %r11, OFFSET
#ifndef LEFT
negq %r11
-#endif
+#endif
movq %r11, KK
#endif
@@ -221,7 +221,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm3, %xmm3
@@ -248,13 +248,13 @@
subq $-24 * SIZE, BB
leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA
-
+
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -269,7 +269,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH -32 * SIZE(PREA)
ADD1 %xmm6, %xmm10
ADD1 %xmm3, %xmm14
@@ -687,7 +687,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -706,7 +706,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -722,7 +722,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -737,7 +737,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
ADD1 %xmm6, %xmm10
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0xb1, %xmm2, %xmm7
@@ -917,7 +917,7 @@
addq $4 * SIZE, CO1
addq $4 * SIZE, CO2
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -936,7 +936,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movsd -32 * SIZE(AO), %xmm0
pxor %xmm4, %xmm4
@@ -952,7 +952,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -967,7 +967,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
ADD1 %xmm6, %xmm10
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0xb1, %xmm2, %xmm7
@@ -1140,8 +1140,8 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
@@ -1187,7 +1187,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm3, %xmm3
@@ -1208,7 +1208,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1223,7 +1223,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1422,7 +1422,7 @@
decq I # i --
BRANCH
jg .L41
- ALIGN_4
+ ALIGN_4
.L50:
testq $2, M
@@ -1441,7 +1441,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm3, %xmm3
@@ -1455,7 +1455,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1470,7 +1470,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1603,7 +1603,7 @@
#endif
addq $4 * SIZE, CO1
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1622,7 +1622,7 @@
salq $ZBASE_SHIFT, %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 1), BO
-#endif
+#endif
movsd -32 * SIZE(AO), %xmm0
pxor %xmm3, %xmm3
@@ -1636,7 +1636,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1651,7 +1651,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
addps %xmm3, %xmm8
@@ -1763,8 +1763,8 @@
addps %xmm0, %xmm8
#endif
movsd %xmm8, 0 * SIZE(CO1)
- ALIGN_4
-
+ ALIGN_4
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse.S b/kernel/x86_64/zgemm_kernel_4x2_sse.S
index 04dbf1a..7d606aa 100644
--- a/kernel/x86_64/zgemm_kernel_4x2_sse.S
+++ b/kernel/x86_64/zgemm_kernel_4x2_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
@@ -56,7 +56,7 @@
#define CO1 %r15
#define CO2 %rbp
#define BB %r12
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -92,7 +92,7 @@
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5 + 8)
#endif
-
+
#if defined(PENTIUM4) || defined(GENERIC)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
@@ -274,7 +274,7 @@
addps %xmm5, %xmm14 ;\
movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
addps %xmm6, %xmm15 ;\
- movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
+ movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#define KERNEL5(xx) \
mulps %xmm0, %xmm1 ;\
@@ -336,9 +336,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -396,7 +396,7 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, 0 + ALPHA_R
-
+
movss %xmm1, 4 + ALPHA_I
movss %xmm1, 12 + ALPHA_I
xorps %xmm7, %xmm1
@@ -423,7 +423,7 @@
movsd %xmm12, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
salq $ZBASE_SHIFT, LDC
@@ -436,19 +436,19 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
movaps POSINV, %xmm7
-
+
movq K, %rax
sarq $2, %rax
jle .L03
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
@@ -475,7 +475,7 @@
PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO)
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm9
xorps %xmm7, %xmm11
xorps %xmm7, %xmm13
@@ -521,7 +521,7 @@
shufps $0, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm9
xorps %xmm7, %xmm11
#else
@@ -539,7 +539,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -564,7 +564,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -32 * SIZE(BO), %xmm1
@@ -594,7 +594,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -971,7 +971,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -988,7 +988,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -16 * SIZE(AO), %xmm2
@@ -1010,7 +1010,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1232,8 +1232,8 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
-
+ ALIGN_4
+
.L30:
testq $1, M
je .L39
@@ -1249,7 +1249,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 8), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
movaps -24 * SIZE(AO), %xmm2
@@ -1269,7 +1269,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1496,7 +1496,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1517,17 +1517,17 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
movaps POSINV, %xmm7
-
+
movq K, %rax
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
movss 0 * SIZE(B), %xmm8
movss 1 * SIZE(B), %xmm9
@@ -1548,7 +1548,7 @@
shufps $0, %xmm15, %xmm15
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm9
xorps %xmm7, %xmm11
xorps %xmm7, %xmm13
@@ -1599,7 +1599,7 @@
shufps $0, %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
- defined(TN) || defined(TT) || defined(TR) || defined(TC)
+ defined(TN) || defined(TT) || defined(TR) || defined(TC)
xorps %xmm7, %xmm9
#else
xorps %xmm7, %xmm8
@@ -1613,7 +1613,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -1636,7 +1636,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1663,7 +1663,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1889,7 +1889,7 @@
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1906,7 +1906,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1925,7 +1925,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2078,8 +2078,8 @@
#endif
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
-
+ ALIGN_4
+
.L70:
testq $1, M
je .L999
@@ -2095,7 +2095,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps -32 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -2114,7 +2114,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2261,7 +2261,7 @@
addps %xmm0, %xmm8
#endif
movlps %xmm8, 0 * SIZE(CO1)
- ALIGN_4
+ ALIGN_4
.L999:
movq %rbx, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse3.S b/kernel/x86_64/zgemm_kernel_4x2_sse3.S
index ecc3a6f..4e5504c 100644
--- a/kernel/x86_64/zgemm_kernel_4x2_sse3.S
+++ b/kernel/x86_64/zgemm_kernel_4x2_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -54,7 +54,7 @@
#define CO1 %r14
#define CO2 %r15
#define BB %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -79,7 +79,7 @@
#define KK 48(%rsp)
#define KKK 56(%rsp)
#define BUFFER 128(%rsp)
-
+
#define PREFETCH prefetcht0
#define PREFETCHSIZE 320
@@ -334,9 +334,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -391,7 +391,7 @@
shufps $0, %xmm0, %xmm0
movaps %xmm0, 0 + ALPHA_R
-
+
movss %xmm1, 4 + ALPHA_I
movss %xmm1, 12 + ALPHA_I
xorps %xmm15, %xmm1
@@ -403,7 +403,7 @@
movsd %xmm4, KK
#ifndef LEFT
negq KK
-#endif
+#endif
#endif
salq $ZBASE_SHIFT, LDC
@@ -416,16 +416,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movddup 0 * SIZE(B), %xmm0
movddup 2 * SIZE(B), %xmm1
@@ -473,7 +473,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
movq C, CO1 # coffset1 = c
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
@@ -501,7 +501,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -529,7 +529,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -543,7 +543,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-
+
.L1X:
KERNEL1 (32 * 0)
KERNEL2 (32 * 0)
@@ -882,7 +882,7 @@
addq $8 * SIZE, CO2 # coffset += 4
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -899,7 +899,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -918,7 +918,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1150,8 +1150,8 @@
addq $4 * SIZE, CO1 # coffset += 4
addq $4 * SIZE, CO2 # coffset += 4
- ALIGN_4
-
+ ALIGN_4
+
.L30:
testq $1, M
je .L39
@@ -1167,7 +1167,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 4), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1185,7 +1185,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1317,12 +1317,12 @@
movhlps %xmm6, %xmm1
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
- defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
cmpeqps %xmm7, %xmm7
pslld $31, %xmm7
xorps %xmm7, %xmm1
-#endif
-
+#endif
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
shufps $0xb1, %xmm1, %xmm1
@@ -1369,7 +1369,7 @@
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
@@ -1390,16 +1390,16 @@
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
movq K, %rax
sarq $3, %rax
jle .L43
ALIGN_4
-
+
.L42:
movddup 0 * SIZE(B), %xmm0
movddup 2 * SIZE(B), %xmm1
@@ -1445,7 +1445,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
movq C, CO1 # coffset1 = c
movq A, AO # aoffset = a
@@ -1467,7 +1467,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 4), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1488,7 +1488,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1727,7 +1727,7 @@
addq $8 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1744,7 +1744,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 2), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1758,7 +1758,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -1907,8 +1907,8 @@
addq $2, KK
#endif
addq $4 * SIZE, CO1 # coffset += 4
- ALIGN_4
-
+ ALIGN_4
+
.L70:
testq $1, M
je .L999
@@ -1924,7 +1924,7 @@
leaq (, %rax, 8), %rax
leaq (AO, %rax, 1), AO
leaq (BO, %rax, 2), BO
-#endif
+#endif
movddup 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1938,7 +1938,7 @@
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
- movq %rax, KKK
+ movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
@@ -2033,12 +2033,12 @@
movhlps %xmm0, %xmm1
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
- defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
cmpeqps %xmm7, %xmm7
pslld $31, %xmm7
xorps %xmm7, %xmm1
-#endif
-
+#endif
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
shufps $0xb1, %xmm1, %xmm1
@@ -2068,7 +2068,7 @@
addps %xmm8, %xmm0
#endif
movsd %xmm0, 0 * SIZE(CO1)
- ALIGN_4
+ ALIGN_4
.L999:
movq %rbx, %rsp
diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S
index 2cafb1f..dbde1f0 100644
--- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S
+++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S
@@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer.
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
-3. Neither the name of the ISCAS nor the names of its contributors may
-be used to endorse or promote products derived from this software
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -59,7 +59,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef WINDOWS_ABI
-#define STACKSIZE 128
+#define STACKSIZE 128
#define old_ldc 8+STACKSIZE(%rsp)
#define old_offset 16+STACKSIZE(%rsp)
@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OR orq
#define JNE jne
#define JMP jmp
-#define NOP
+#define NOP
#define XOR xorpd
#undef MOVQ
#define MOVQ movq
@@ -207,7 +207,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ADD1_DX SUB_DX
#define ADD1_DY SUB_DY
#define ADD2_DY ADDSUB_DY
-#else
+#else
#define ADD1_DX ADD_DX
#define ADD1_DY ADD_DY
#define ADD2_DY ADDSUB_DY
@@ -289,7 +289,7 @@ SALQ $6, k;
LEAQ (bb, k, 1), prebb; # Rn=4 SIZE=8 COMPLEX=2
MOVQ ba,ptrba;
MOVQ bm,i;
-SARQ $2,i; # Rm = 4
+SARQ $2,i; # Rm = 4
JLE .L1_loopE;
ALIGN_5;
.L1_bodyB:;
@@ -333,7 +333,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -764,7 +764,7 @@ SUB_DY yvec11, yvec7, yvec11;
SUB_DY yvec10, yvec7, yvec10;
SUB_DY yvec9, yvec7, yvec9;
SUB_DY yvec8, yvec7, yvec8;
-#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
VPERMILP_DY $0x05, yvec15, yvec15;
VPERMILP_DY $0x05, yvec14, yvec14;
VPERMILP_DY $0x05, yvec13, yvec13;
@@ -882,7 +882,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
ADDQ $8*SIZE,C0;
@@ -996,7 +996,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
ADDQ $8*SIZE, C0;
@@ -1032,7 +1032,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -1304,7 +1304,7 @@ SUB_DY yvec15, yvec7, yvec15;
SUB_DY yvec14, yvec7, yvec14;
SUB_DY yvec13, yvec7, yvec13;
SUB_DY yvec12, yvec7, yvec12;
-#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
VPERMILP_DY $0x05, yvec15, yvec15;
VPERMILP_DY $0x05, yvec14, yvec14;
VPERMILP_DY $0x05, yvec13, yvec13;
@@ -1374,7 +1374,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 2), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $2, kk;
#endif
ADDQ $4*SIZE, C0;
@@ -1437,7 +1437,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 2), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $2, kk;
#endif
ADDQ $4*SIZE, C0;
@@ -1468,7 +1468,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $4, %rax;
#endif
MOVQ %rax, kkk;
@@ -1634,7 +1634,7 @@ ADDSUB_DY yvec14, yvec7, yvec14;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
SUB_DY yvec15, yvec7, yvec15;
SUB_DY yvec14, yvec7, yvec14;
-#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
VPERMILP_DY $0x05, yvec15, yvec15;
VPERMILP_DY $0x05, yvec14, yvec14;
ADDSUB_DY yvec15, yvec7, yvec15;
@@ -1669,7 +1669,7 @@ ADD_DX xvec0, xvec15, xvec15;
ADD_DX xvec1, xvec7, xvec7;
ADD_DX xvec2, xvec14, xvec14;
ADD_DX xvec3, xvec6, xvec6;
-#endif
+#endif
STL_DX xvec15, 0*SIZE(C0);
STH_DX xvec15, 1*SIZE(C0);
STL_DX xvec7, 0*SIZE(C0, ldc, 1);
@@ -1685,7 +1685,7 @@ SALQ $ZBASE_SHIFT, %rax;
ADDQ %rax, ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $1, kk;
#endif
ADDQ $2*SIZE, C0;
@@ -1742,7 +1742,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -1994,7 +1994,7 @@ SUB_DY yvec15, yvec7, yvec15;
SUB_DY yvec14, yvec7, yvec14;
SUB_DY yvec13, yvec7, yvec13;
SUB_DY yvec12, yvec7, yvec12;
-#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
VPERMILP_DY $0x05, yvec15, yvec15;
VPERMILP_DY $0x05, yvec14, yvec14;
VPERMILP_DY $0x05, yvec13, yvec13;
@@ -2032,7 +2032,7 @@ EXTRA_DY $1, yvec15, xvec7;
EXTRA_DY $1, yvec14, xvec6;
EXTRA_DY $1, yvec13, xvec5;
EXTRA_DY $1, yvec12, xvec4;
-#### Testing Alignment ####
+#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
@@ -2064,7 +2064,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
ADDQ $8*SIZE, C0;
@@ -2125,7 +2125,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
ADDQ $8*SIZE, C0;
@@ -2159,7 +2159,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -2336,7 +2336,7 @@ ADDSUB_DY yvec13, yvec7, yvec13;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
SUB_DY yvec15, yvec7, yvec15;
SUB_DY yvec13, yvec7, yvec13;
-#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
VPERMILP_DY $0x05, yvec15, yvec15;
VPERMILP_DY $0x05, yvec13, yvec13;
ADDSUB_DY yvec15, yvec7, yvec15;
@@ -2389,7 +2389,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 2), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $2, kk;
#endif
ADDQ $4*SIZE, C0;
@@ -2420,7 +2420,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $2, %rax;
#endif
MOVQ %rax, kkk;
@@ -2542,7 +2542,7 @@ XOR_DY yvec7, yvec7, yvec7;
ADDSUB_DY yvec15, yvec7, yvec15;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
SUB_DY yvec15, yvec7, yvec15;
-#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
VPERMILP_DY $0x05, yvec15, yvec15;
ADDSUB_DY yvec15, yvec7, yvec15;
VPERMILP_DY $0x05, yvec15, yvec15;
@@ -2577,7 +2577,7 @@ SALQ $ZBASE_SHIFT, %rax;
ADDQ %rax, ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $1, kk;
#endif
ADDQ $2*SIZE, C0;
@@ -2627,7 +2627,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -2797,7 +2797,7 @@ ADDSUB_DY yvec14, yvec7, yvec14;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
SUB_DY yvec15, yvec7, yvec15;
SUB_DY yvec14, yvec7, yvec14;
-#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
VPERMILP_DY $0x05, yvec15, yvec15;
VPERMILP_DY $0x05, yvec14, yvec14;
ADDSUB_DY yvec15, yvec7, yvec15;
@@ -2850,7 +2850,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
ADDQ $8*SIZE, C0;
@@ -2882,7 +2882,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -2986,7 +2986,7 @@ XOR_DY yvec7, yvec7, yvec7;
ADDSUB_DY yvec15, yvec7, yvec15;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
SUB_DY yvec15, yvec7, yvec15;
-#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
VPERMILP_DY $0x05, yvec15, yvec15;
ADDSUB_DY yvec15, yvec7, yvec15;
VPERMILP_DY $0x05, yvec15, yvec15;
@@ -3021,7 +3021,7 @@ SALQ $ZBASE_SHIFT, %rax;
LEAQ (ptrba, %rax, 2), ptrba;
ADDQ %rax, ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $2, kk;
#endif
ADDQ $4*SIZE, C0;
@@ -3050,7 +3050,7 @@ MOVQ %rax, kkk;
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;
-#else
+#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
@@ -3194,7 +3194,7 @@ SALQ $ZBASE_SHIFT, %rax;
ADDQ %rax, ptrba;
ADDQ %rax, ptrbb;
#endif
-#if defined(TRMMKERNEL) && defined(LEFT)
+#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $1, kk;
#endif
ADDQ $2*SIZE, C0;
diff --git a/kernel/x86_64/zgemm_ncopy_1.S b/kernel/x86_64/zgemm_ncopy_1.S
index 9f9ae73..60b51f5 100644
--- a/kernel/x86_64/zgemm_ncopy_1.S
+++ b/kernel/x86_64/zgemm_ncopy_1.S
@@ -74,7 +74,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r14
pushq %r13
diff --git a/kernel/x86_64/zgemm_ncopy_2.S b/kernel/x86_64/zgemm_ncopy_2.S
index 8876b61..8451982 100644
--- a/kernel/x86_64/zgemm_ncopy_2.S
+++ b/kernel/x86_64/zgemm_ncopy_2.S
@@ -92,7 +92,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r14
pushq %r13
diff --git a/kernel/x86_64/zgemm_tcopy_1.S b/kernel/x86_64/zgemm_tcopy_1.S
index b4348e6..02c0614 100644
--- a/kernel/x86_64/zgemm_tcopy_1.S
+++ b/kernel/x86_64/zgemm_tcopy_1.S
@@ -74,7 +74,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %r14
pushq %r13
@@ -118,7 +118,7 @@
#ifndef DOUBLE
movsd 0 * SIZE(AO1), %xmm0
movhps 0 * SIZE(AO1, LDA, 1), %xmm0
-
+
movaps %xmm0, 0 * SIZE(B)
#else
prefetcht0 RPREFETCHSIZE * SIZE(AO1)
diff --git a/kernel/x86_64/zgemm_tcopy_2.S b/kernel/x86_64/zgemm_tcopy_2.S
index f83022d..121bbc4 100644
--- a/kernel/x86_64/zgemm_tcopy_2.S
+++ b/kernel/x86_64/zgemm_tcopy_2.S
@@ -85,7 +85,7 @@
PROLOGUE
PROFCODE
-
+
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
diff --git a/kernel/x86_64/zgemv_n.S b/kernel/x86_64/zgemv_n.S
index 71b76ac..b903cfb 100644
--- a/kernel/x86_64/zgemv_n.S
+++ b/kernel/x86_64/zgemv_n.S
@@ -43,7 +43,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
@@ -57,7 +57,7 @@
#define XX 88(%rsp)
#define LDAX 96(%rsp)
#define ALPHAR 104(%rsp)
-#define ALPHAI 112(%rsp)
+#define ALPHAI 112(%rsp)
#define M %rdi
#define N %rsi
@@ -71,7 +71,7 @@
#else
#define STACKSIZE 304
-
+
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
@@ -194,7 +194,7 @@
movlpd %xmm1, ALPHA_I
subq $-16 * SIZE, A
-
+
testq M, M
jle .L999
testq N, N
@@ -230,7 +230,7 @@
testq $SIZE, A
jne .L100
#endif
-
+
#if GEMV_UNROLL >= 4
cmpq $4, N
@@ -260,7 +260,7 @@
pcmpeqb %xmm5, %xmm5
psllq $63, %xmm5
- shufps $0xc0, %xmm5, %xmm5
+ shufps $0xc0, %xmm5, %xmm5
pshufd $0x4e, %xmm8, %xmm9
pshufd $0x4e, %xmm10, %xmm11
@@ -764,7 +764,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
- shufps $0xc0, %xmm11, %xmm11
+ shufps $0xc0, %xmm11, %xmm11
pshufd $0x4e, %xmm12, %xmm13
pshufd $0x4e, %xmm14, %xmm15
@@ -1083,7 +1083,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
- shufps $0xc0, %xmm11, %xmm11
+ shufps $0xc0, %xmm11, %xmm11
pshufd $0x4e, %xmm12, %xmm13
@@ -1316,7 +1316,7 @@
pcmpeqb %xmm5, %xmm5
psllq $63, %xmm5
- shufps $0xc0, %xmm5, %xmm5
+ shufps $0xc0, %xmm5, %xmm5
pshufd $0x4e, %xmm8, %xmm9
pshufd $0x4e, %xmm10, %xmm11
@@ -1864,7 +1864,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
- shufps $0xc0, %xmm11, %xmm11
+ shufps $0xc0, %xmm11, %xmm11
pshufd $0x4e, %xmm12, %xmm13
pshufd $0x4e, %xmm14, %xmm15
@@ -2206,7 +2206,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
- shufps $0xc0, %xmm11, %xmm11
+ shufps $0xc0, %xmm11, %xmm11
pshufd $0x4e, %xmm12, %xmm13
diff --git a/kernel/x86_64/zgemv_n_atom.S b/kernel/x86_64/zgemv_n_atom.S
index 289c076..4fa7092 100644
--- a/kernel/x86_64/zgemv_n_atom.S
+++ b/kernel/x86_64/zgemv_n_atom.S
@@ -49,12 +49,12 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
-
+
#define M %rdi
#define N %rsi
#define A %rcx
@@ -67,7 +67,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
@@ -173,7 +173,7 @@
movaps %xmm1, ALPHA_I
subq $-16 * SIZE, A
-
+
testq M, M
jle .L999
testq N, N
diff --git a/kernel/x86_64/zgemv_n_dup.S b/kernel/x86_64/zgemv_n_dup.S
index 8a49fc9..42c1963 100644
--- a/kernel/x86_64/zgemv_n_dup.S
+++ b/kernel/x86_64/zgemv_n_dup.S
@@ -43,14 +43,14 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA_R 48 (%rsp)
#define ALPHA_I 56 (%rsp)
-
+
#define M %rdi
#define N %rsi
#define A %rcx
@@ -63,7 +63,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
@@ -149,7 +149,7 @@
movlps %xmm1, ALPHA_I
subq $-16 * SIZE, A
-
+
testq M, M
jle .L999
testq N, N
diff --git a/kernel/x86_64/zgemv_t.S b/kernel/x86_64/zgemv_t.S
index 30f76dc..c789279 100644
--- a/kernel/x86_64/zgemv_t.S
+++ b/kernel/x86_64/zgemv_t.S
@@ -43,12 +43,12 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 128
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
-
+
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
@@ -68,7 +68,7 @@
#else
#define STACKSIZE 288
-
+
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
@@ -193,7 +193,7 @@
#endif
subq $-16 * SIZE, A
-
+
testq M, M
jle .L999
testq N, N
@@ -201,7 +201,7 @@
ALIGN_3
movq BUFFER, X1
-
+
movq Y, Y1
movq M, I
@@ -669,7 +669,7 @@
pcmpeqb %xmm13, %xmm13
psllq $63, %xmm13
shufps $0xc0, %xmm13, %xmm13
-
+
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
xorpd %xmm13, %xmm0
xorpd %xmm13, %xmm2
@@ -1034,7 +1034,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
shufps $0xc0, %xmm11, %xmm11
-
+
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
xorpd %xmm11, %xmm0
xorpd %xmm11, %xmm2
@@ -1264,7 +1264,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
shufps $0xc0, %xmm11, %xmm11
-
+
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
xorpd %xmm11, %xmm0
#else
@@ -1769,7 +1769,7 @@
pcmpeqb %xmm13, %xmm13
psllq $63, %xmm13
shufps $0xc0, %xmm13, %xmm13
-
+
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
xorpd %xmm13, %xmm0
xorpd %xmm13, %xmm2
@@ -2157,7 +2157,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
shufps $0xc0, %xmm11, %xmm11
-
+
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
xorpd %xmm11, %xmm0
xorpd %xmm11, %xmm2
@@ -2398,7 +2398,7 @@
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
shufps $0xc0, %xmm11, %xmm11
-
+
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
xorpd %xmm11, %xmm0
#else
diff --git a/kernel/x86_64/zgemv_t_atom.S b/kernel/x86_64/zgemv_t_atom.S
index 5d3ecdd..73a013a 100644
--- a/kernel/x86_64/zgemv_t_atom.S
+++ b/kernel/x86_64/zgemv_t_atom.S
@@ -49,12 +49,12 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
-
+
#define M %rdi
#define N %rsi
#define A %rcx
@@ -67,7 +67,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
@@ -174,7 +174,7 @@
movaps %xmm1, ALPHA_I
subq $-16 * SIZE, A
-
+
testq M, M
jle .L999
testq N, N
@@ -182,7 +182,7 @@
ALIGN_3
movq BUFFER, X1
-
+
movq Y, Y1
movq M, I
diff --git a/kernel/x86_64/zgemv_t_dup.S b/kernel/x86_64/zgemv_t_dup.S
index 2db99b6..d509f0e 100644
--- a/kernel/x86_64/zgemv_t_dup.S
+++ b/kernel/x86_64/zgemv_t_dup.S
@@ -43,12 +43,12 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
-
+
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
-
+
#define M %rdi
#define N %rsi
#define A %rcx
@@ -61,7 +61,7 @@
#else
#define STACKSIZE 256
-
+
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
@@ -156,7 +156,7 @@
xorps %xmm5, ALPHA_I
subq $-16 * SIZE, A
-
+
testq M, M
jle .L999
testq N, N
@@ -164,7 +164,7 @@
ALIGN_3
movq BUFFER, X1
-
+
movq Y, Y1
movq M, I
@@ -606,7 +606,7 @@
pcmpeqb %xmm13, %xmm13
psllq $63, %xmm13
shufps $0x40, %xmm13, %xmm13
-
+
#ifndef XCONJ
xorps %xmm13, %xmm1
xorps %xmm13, %xmm3
@@ -935,7 +935,7 @@
pcmpeqb %xmm13, %xmm13
psllq $63, %xmm13
shufps $0x40, %xmm13, %xmm13
-
+
#ifndef XCONJ
xorps %xmm13, %xmm1
xorps %xmm13, %xmm3
@@ -1154,7 +1154,7 @@
pcmpeqb %xmm13, %xmm13
psllq $63, %xmm13
shufps $0x40, %xmm13, %xmm13
-
+
#ifndef XCONJ
xorps %xmm13, %xmm1
#else
diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S
index 9502626..4115eab 100644
--- a/kernel/x86_64/znrm2.S
+++ b/kernel/x86_64/znrm2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -49,7 +49,7 @@
PROLOGUE
PROFCODE
-
+
fldz
testq M, M
jle .L999
@@ -68,7 +68,7 @@
sarq $2, I
jle .L20
ALIGN_4
-
+
.L10:
#if defined(PREFETCH)
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
@@ -132,7 +132,7 @@
sarq $2, I
jle .L60
ALIGN_4
-
+
.L50:
FLD 0 * SIZE(X)
fmul %st(0), %st
diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S
index 005536a..f78b83f 100644
--- a/kernel/x86_64/znrm2_sse.S
+++ b/kernel/x86_64/znrm2_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
@@ -50,7 +50,7 @@
PROLOGUE
PROFCODE
-
+
SAVEREGISTERS
pxor %xmm0, %xmm0
@@ -70,7 +70,7 @@
testq $SIZE, X
je .L05
-
+
movss (X), %xmm4
cvtss2sd %xmm4, %xmm6
mulsd %xmm6, %xmm6
@@ -85,7 +85,7 @@
movq M, I
sarq $3, I
jle .L14
-
+
movsd 0 * SIZE(X), %xmm4
movsd 2 * SIZE(X), %xmm5
movsd 4 * SIZE(X), %xmm6
@@ -252,7 +252,7 @@
sarq $3, I
jle .L44
ALIGN_4
-
+
.L41:
movsd (X), %xmm4
addq INCX, X
diff --git a/kernel/x86_64/zrot.S b/kernel/x86_64/zrot.S
index d645d6f..22d031c 100644
--- a/kernel/x86_64/zrot.S
+++ b/kernel/x86_64/zrot.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define N ARG1
#define X ARG2
#define INCX ARG3
@@ -80,7 +80,7 @@
sarq $1, I
jle .L15
ALIGN_4
-
+
.L10:
#if defined(PREFETCHW)
PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
@@ -225,7 +225,7 @@
sarq $1, I
jle .L55
ALIGN_4
-
+
.L51:
FLD 0 * SIZE(X)
FLD 0 * SIZE(Y)
diff --git a/kernel/x86_64/zrot_sse.S b/kernel/x86_64/zrot_sse.S
index da79b4a..6391311 100644
--- a/kernel/x86_64/zrot_sse.S
+++ b/kernel/x86_64/zrot_sse.S
@@ -124,7 +124,7 @@
movaps 4 * SIZE(X), %xmm2
movaps 8 * SIZE(X), %xmm8
movaps 12 * SIZE(X), %xmm10
-
+
decq %rax
jle .L12
ALIGN_3
@@ -552,7 +552,7 @@
movaps 4 * SIZE(X), %xmm2
movaps 8 * SIZE(X), %xmm8
movaps 12 * SIZE(X), %xmm10
-
+
decq %rax
jle .L22
ALIGN_3
@@ -1026,7 +1026,7 @@
movhps 10 * SIZE(X), %xmm8
movsd 12 * SIZE(X), %xmm10
movhps 14 * SIZE(X), %xmm10
-
+
decq %rax
jle .L32
ALIGN_3
diff --git a/kernel/x86_64/zrot_sse2.S b/kernel/x86_64/zrot_sse2.S
index 3681018..e6288c3 100644
--- a/kernel/x86_64/zrot_sse2.S
+++ b/kernel/x86_64/zrot_sse2.S
@@ -99,7 +99,7 @@
movapd 2 * SIZE(X), %xmm2
movapd 4 * SIZE(X), %xmm8
movapd 6 * SIZE(X), %xmm10
-
+
decq %rax
jle .L12
ALIGN_3
@@ -1169,7 +1169,7 @@
movapd 2 * SIZE(X), %xmm2
movapd 4 * SIZE(X), %xmm8
movapd 6 * SIZE(X), %xmm10
-
+
decq %rax
jle .L42
ALIGN_3
diff --git a/kernel/x86_64/zscal_atom.S b/kernel/x86_64/zscal_atom.S
index c01d5c1..1649b85 100644
--- a/kernel/x86_64/zscal_atom.S
+++ b/kernel/x86_64/zscal_atom.S
@@ -65,12 +65,12 @@
#endif
SAVEREGISTERS
-
+
salq $ZBASE_SHIFT, INCX
testq M, M
jle .L999
-
+
pxor %xmm15, %xmm15
comisd %xmm0, %xmm15
jne .L30 # Alpha_r != ZERO
@@ -387,7 +387,7 @@
xorq %rax, %rax
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S
index 393988e..8505c67 100644
--- a/kernel/x86_64/zscal_sse.S
+++ b/kernel/x86_64/zscal_sse.S
@@ -66,7 +66,7 @@
#endif
SAVEREGISTERS
-
+
salq $ZBASE_SHIFT, INCX
xor FLAG, FLAG
@@ -1040,7 +1040,7 @@
#else
-
+
pshufd $0, %xmm0, %xmm14
pshufd $0, %xmm1, %xmm1
subps %xmm1, %xmm15
@@ -1353,7 +1353,7 @@
xorq %rax, %rax
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S
index a553bbd..223b1e4 100644
--- a/kernel/x86_64/zscal_sse2.S
+++ b/kernel/x86_64/zscal_sse2.S
@@ -72,13 +72,13 @@
#endif
SAVEREGISTERS
-
+
salq $ZBASE_SHIFT, INCX
xor FLAG, FLAG
testq M, M
jle .L999
-
+
pxor %xmm15, %xmm15
comisd %xmm0, %xmm15
jne .L100
@@ -177,7 +177,7 @@
jle .L22
ALIGN_4
-.L21:
+.L21:
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
@@ -280,7 +280,7 @@
#endif
pxor %xmm15, %xmm15
subsd %xmm1, %xmm15
- movlhps %xmm1, %xmm15
+ movlhps %xmm1, %xmm15
cmpq $2 * SIZE, INCX
jne .L120
@@ -804,11 +804,11 @@
jne .L220
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
-
+
movddup %xmm0, %xmm14
pxor %xmm15, %xmm15
subsd %xmm1, %xmm15
- movlhps %xmm1, %xmm15
+ movlhps %xmm1, %xmm15
shufpd $1, %xmm15, %xmm15
movhps 0 * SIZE(X), %xmm0
@@ -1085,7 +1085,7 @@
movddup %xmm0, %xmm14
pxor %xmm15, %xmm15
subsd %xmm1, %xmm15
- movlhps %xmm1, %xmm15
+ movlhps %xmm1, %xmm15
subq $-16 * SIZE, X
@@ -1399,7 +1399,7 @@
movddup %xmm0, %xmm14
pxor %xmm15, %xmm15
subsd %xmm1, %xmm15
- movlhps %xmm1, %xmm15
+ movlhps %xmm1, %xmm15
movq X, XX
@@ -1717,7 +1717,7 @@
xorq %rax, %rax
RESTOREREGISTERS
-
+
ret
EPILOGUE
diff --git a/kernel/x86_64/zswap.S b/kernel/x86_64/zswap.S
index 8f96875..68568f7 100644
--- a/kernel/x86_64/zswap.S
+++ b/kernel/x86_64/zswap.S
@@ -60,7 +60,7 @@
PROLOGUE
PROFCODE
-
+
#ifndef WINDOWS_ABI
#ifndef XDOUBLE
movq 8(%rsp), INCY
diff --git a/kernel/x86_64/zswap_sse.S b/kernel/x86_64/zswap_sse.S
index 2f21759..12f9875 100644
--- a/kernel/x86_64/zswap_sse.S
+++ b/kernel/x86_64/zswap_sse.S
@@ -86,7 +86,7 @@
subq $-32 * SIZE, X
subq $-32 * SIZE, Y
-
+
cmpq $3, M
jle .L16
@@ -312,7 +312,7 @@
.L20:
movaps -33 * SIZE(X), %xmm0
movaps -32 * SIZE(Y), %xmm1
-
+
movss %xmm1, -32 * SIZE(X)
pshufd $0x39, %xmm1, %xmm3
movlps %xmm3, -31 * SIZE(X)
@@ -796,7 +796,7 @@
.L40:
movaps -35 * SIZE(X), %xmm0
movaps -32 * SIZE(Y), %xmm1
-
+
movss %xmm1, -32 * SIZE(X)
subq $3, M
diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S
index 204e5e6..3a5243b 100644
--- a/kernel/x86_64/zsymv_L_sse.S
+++ b/kernel/x86_64/zsymv_L_sse.S
@@ -91,7 +91,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 80
-
+
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
@@ -99,14 +99,14 @@
#define M ARG1
#define N ARG2
#define A ARG3
-#define LDA ARG4
+#define LDA ARG4
#define X ARG5
-#define INCX ARG6
+#define INCX ARG6
#else
#define STACKSIZE 256
-
+
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_LDA 48 + STACKSIZE(%rsp)
#define OLD_X 56 + STACKSIZE(%rsp)
diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S
index 5769d24..295ab1a 100644
--- a/kernel/x86_64/zsymv_L_sse2.S
+++ b/kernel/x86_64/zsymv_L_sse2.S
@@ -97,7 +97,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 80
-
+
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
@@ -105,14 +105,14 @@
#define M ARG1
#define N ARG2
#define A ARG3
-#define LDA ARG4
+#define LDA ARG4
#define X ARG5
-#define INCX ARG6
+#define INCX ARG6
#else
#define STACKSIZE 256
-
+
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_LDA 48 + STACKSIZE(%rsp)
#define OLD_X 56 + STACKSIZE(%rsp)
diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S
index 6f782b1..cf302e4 100644
--- a/kernel/x86_64/zsymv_U_sse.S
+++ b/kernel/x86_64/zsymv_U_sse.S
@@ -93,11 +93,11 @@
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 14)
#endif
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 80
-
+
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
@@ -105,14 +105,14 @@
#define M ARG1
#define N ARG2
#define A ARG3
-#define LDA ARG4
+#define LDA ARG4
#define X ARG5
-#define INCX ARG6
+#define INCX ARG6
#else
#define STACKSIZE 256
-
+
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_LDA 48 + STACKSIZE(%rsp)
#define OLD_X 56 + STACKSIZE(%rsp)
diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S
index f92779e..7c29013 100644
--- a/kernel/x86_64/zsymv_U_sse2.S
+++ b/kernel/x86_64/zsymv_U_sse2.S
@@ -97,7 +97,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 80
-
+
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
@@ -105,14 +105,14 @@
#define M ARG1
#define IS ARG2
#define A ARG3
-#define LDA ARG4
+#define LDA ARG4
#define X ARG5
-#define INCX ARG6
+#define INCX ARG6
#else
#define STACKSIZE 256
-
+
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_LDA 48 + STACKSIZE(%rsp)
#define OLD_X 56 + STACKSIZE(%rsp)
diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S b/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S
index 31bd57b..d3bedff 100644
--- a/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S
+++ b/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -55,7 +55,7 @@
#define CO1 %r15
#define BB %rbx
#define KK %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 128
@@ -107,9 +107,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -144,7 +144,7 @@
movq OLD_OFFSET, KK
movq KK, OFFSET
-
+
salq $ZBASE_SHIFT, LDC
#ifdef LN
@@ -168,7 +168,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, KK
@@ -203,7 +203,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
@@ -229,7 +229,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -502,7 +502,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -547,7 +547,7 @@
ADDSD3 %xmm7, %xmm14
movsd 3 * SIZE(AO), %xmm7
mulsd %xmm3, %xmm2
-
+
ADDSD4 %xmm6, %xmm15
PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
movaps %xmm4, %xmm6
@@ -943,7 +943,7 @@
#endif
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
@@ -965,7 +965,7 @@
decq J # j --
jg .L01
ALIGN_4
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S
index 065abe0..80485c0 100644
--- a/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S
+++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -84,7 +84,7 @@
#define AORIG 48(%rsp)
#define BORIG 56(%rsp)
#define BUFFER 128(%rsp)
-
+
#define PREFETCH_R (8 * 4 + 0)
#define PREFETCH_W (PREFETCH_R)
@@ -106,9 +106,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -186,7 +186,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -204,10 +204,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq 16 * SIZE + BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -220,7 +220,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -238,7 +238,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
@@ -300,7 +300,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movq A, AO
@@ -342,7 +342,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -709,7 +709,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
prefetcht2 0 * SIZE(BB)
@@ -752,7 +752,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movapd -16 * SIZE(AO), %xmm0
@@ -1310,7 +1310,7 @@
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
@@ -1345,7 +1345,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -1361,7 +1361,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1377,7 +1377,7 @@
sarq $2, %rax
jle .L103
ALIGN_4
-
+
.L102:
movddup -16 * SIZE(B), %xmm8
movddup -15 * SIZE(B), %xmm9
@@ -1427,7 +1427,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1468,7 +1468,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1740,7 +1740,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2106,7 +2106,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L199:
#ifdef LN
@@ -2130,7 +2130,7 @@
#endif
ALIGN_4
-
+
.L999:
movq %r15, %rsp
diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S
index 093a580..3a691ca 100644
--- a/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S
+++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -98,7 +98,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -185,7 +185,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -199,7 +199,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 1, %rax
@@ -228,7 +228,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -253,7 +253,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
ADD1 %xmm3, %xmm12
movaps -14 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -554,7 +554,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
movq M, I
@@ -578,7 +578,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -622,7 +622,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
ADD1 %xmm3, %xmm12
movaps -14 * SIZE(BO), %xmm3
ADD1 %xmm4, %xmm14
@@ -1267,7 +1267,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
@@ -1307,7 +1307,7 @@
movq K, %rax
salq $ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -1320,7 +1320,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT, %rax
@@ -1329,7 +1329,7 @@
#ifdef LT
movq OFFSET, KK
#endif
-
+
testq $1, M
BRANCH
jle .L60
@@ -1349,7 +1349,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1369,7 +1369,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -1586,7 +1586,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
movq M, I
@@ -1610,7 +1610,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -1636,7 +1636,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
mulpd %xmm0, %xmm2
@@ -1966,7 +1966,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S
index 79f20b6..542bd59 100644
--- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S
+++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -55,7 +55,7 @@
#define BO %rsi
#define CO1 %r15
#define CO2 %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -226,9 +226,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -309,7 +309,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -327,11 +327,11 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -344,7 +344,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -362,7 +362,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCHNTA 56 * SIZE(B)
@@ -431,7 +431,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movq A, AO
@@ -473,7 +473,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -859,7 +859,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -888,7 +888,7 @@
PREFETCHW 4 * SIZE(CO2)
pxor %xmm7, %xmm7
#endif
-
+
#if defined(LT) || defined(RN)
movq KK, %rax
#else
@@ -898,7 +898,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -1422,9 +1422,9 @@
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
+
-
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1458,7 +1458,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
@@ -1475,7 +1475,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1491,7 +1491,7 @@
sarq $2, %rax
jle .L103
ALIGN_4
-
+
.L102:
movlpd 0 * SIZE(B), %xmm0
movlpd 1 * SIZE(B), %xmm1
@@ -1551,7 +1551,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1592,7 +1592,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -1833,7 +1833,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L130:
movq M, I
sarq $1, I # i = (m >> 2)
@@ -1860,7 +1860,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -1872,7 +1872,7 @@
PREFETCHW 4 * SIZE(CO1)
#endif
-
+
#if defined(LT) || defined(RN)
movq KK, %rax
#else
@@ -2224,7 +2224,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L199:
#ifdef LN
diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S
index 74a799a..7547421 100644
--- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S
+++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -55,7 +55,7 @@
#define CO1 %r15
#define CO2 %rbx
#define KK %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 128
@@ -338,9 +338,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -375,7 +375,7 @@
movq OLD_OFFSET, KK
movq KK, OFFSET
-
+
salq $ZBASE_SHIFT, LDC
#ifdef LN
@@ -399,7 +399,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, KK
@@ -422,7 +422,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -437,7 +437,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
@@ -459,7 +459,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -884,7 +884,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -921,8 +921,8 @@
andq $-8, %rax
salq $4, %rax
je .L12
-
-.L1X:
+
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -1403,7 +1403,7 @@
#endif
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L99:
#ifdef LN
@@ -1440,7 +1440,7 @@
movq K, %rax
salq $0 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
movq C, CO1 # coffset1 = c
@@ -1451,7 +1451,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
@@ -1474,7 +1474,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1756,7 +1756,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2153,7 +2153,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L149:
#ifdef LN
@@ -2173,7 +2173,7 @@
subq $1, KK
#endif
ALIGN_3
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S
index fc5a4a3..5d931ce 100644
--- a/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S
+++ b/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -98,7 +98,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -185,7 +185,7 @@
movq K, %rax
salq $2 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -199,7 +199,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 2, %rax
@@ -229,7 +229,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -254,7 +254,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -780,7 +780,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
movq M, I
@@ -806,11 +806,11 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetchnta -32 * SIZE(BB)
subq $-16 * SIZE, BB
-
+
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -842,7 +842,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm12
@@ -1585,7 +1585,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L29:
#ifdef LN
@@ -1626,7 +1626,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -1640,7 +1640,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -1666,7 +1666,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1691,7 +1691,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -1978,7 +1978,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L40:
movq M, I
@@ -2004,7 +2004,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -2030,7 +2030,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -2444,7 +2444,7 @@
decq I # i --
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L49:
#ifdef LN
@@ -2481,7 +2481,7 @@
movq K, %rax
salq $ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -2493,7 +2493,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -2519,7 +2519,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2539,7 +2539,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -2728,7 +2728,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
movq M, I
@@ -2754,7 +2754,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -2779,7 +2779,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -3065,7 +3065,7 @@
decq I # i --
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L69:
#ifdef LN
diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S
index f5c100e..cd86db2 100644
--- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S
+++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -225,9 +225,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -314,7 +314,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -332,10 +332,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -348,7 +348,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -364,7 +364,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -448,7 +448,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -490,7 +490,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movlps 0 * SIZE(AO), %xmm8
movhps 2 * SIZE(AO), %xmm8
@@ -946,7 +946,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -971,7 +971,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1506,7 +1506,7 @@
salq $1 + ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L30:
movq M, I
@@ -1534,7 +1534,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(BO), %xmm9
movaps 4 * SIZE(BO), %xmm11
@@ -1566,7 +1566,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -2358,7 +2358,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
@@ -2394,7 +2394,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -2410,7 +2410,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -2426,7 +2426,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -2483,7 +2483,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2523,7 +2523,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
movhps 2 * SIZE(AO), %xmm8
@@ -2796,7 +2796,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -2821,7 +2821,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -3186,7 +3186,7 @@
salq $1 + ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L70:
movq M, I
@@ -3214,7 +3214,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -3950,7 +3950,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
diff --git a/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S
index e53e297..874d34d 100644
--- a/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S
+++ b/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -97,7 +97,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -187,7 +187,7 @@
movq K, %rax
salq $2 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -201,7 +201,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 2, %rax
@@ -231,7 +231,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetchnta -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -267,7 +267,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm12
@@ -859,7 +859,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -873,7 +873,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -899,7 +899,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -925,7 +925,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -1275,7 +1275,7 @@
movq K, %rax
salq $ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
movq C, CO1
@@ -1286,7 +1286,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -1312,7 +1312,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -1335,7 +1335,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -1555,7 +1555,7 @@
#ifdef RT
subq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L999:
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S b/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S
index a1760ad..92dc636 100644
--- a/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S
+++ b/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -55,7 +55,7 @@
#define CO1 %r15
#define BB %rbx
#define KK %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 128
@@ -107,9 +107,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -144,7 +144,7 @@
movq OLD_OFFSET, KK
movq KK, OFFSET
-
+
salq $ZBASE_SHIFT, LDC
#ifdef LN
@@ -168,7 +168,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, KK
@@ -203,7 +203,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
@@ -232,7 +232,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht0 0 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -277,7 +277,7 @@
ADDSD3 %xmm7, %xmm14
movsd 3 * SIZE(AO), %xmm7
mulsd %xmm3, %xmm2
-
+
ADDSD4 %xmm6, %xmm15
PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
movaps %xmm4, %xmm6
@@ -673,7 +673,7 @@
#endif
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -693,7 +693,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -944,7 +944,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -965,7 +965,7 @@
decq J # j --
jg .L01
ALIGN_4
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S
index 93cbcad..6cf8506 100644
--- a/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S
+++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -84,7 +84,7 @@
#define AORIG 48(%rsp)
#define BORIG 56(%rsp)
#define BUFFER 128(%rsp)
-
+
#define PREFETCH_R (8 * 4 + 0)
#define PREFETCH_W (PREFETCH_R)
@@ -106,9 +106,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -186,7 +186,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -204,10 +204,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq 16 * SIZE + BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -220,7 +220,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -238,7 +238,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
@@ -300,7 +300,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movq A, AO
@@ -347,7 +347,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
prefetcht2 0 * SIZE(BB)
@@ -390,7 +390,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movapd -16 * SIZE(AO), %xmm0
@@ -948,7 +948,7 @@
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -973,7 +973,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1311,7 +1311,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1345,7 +1345,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -1361,7 +1361,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1377,7 +1377,7 @@
sarq $2, %rax
jle .L103
ALIGN_4
-
+
.L102:
movddup -16 * SIZE(B), %xmm8
movddup -15 * SIZE(B), %xmm9
@@ -1427,7 +1427,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1469,7 +1469,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -1835,7 +1835,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -1862,7 +1862,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2107,7 +2107,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L199:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2130,7 +2130,7 @@
#endif
ALIGN_4
-
+
.L999:
movq %r15, %rsp
diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S
index e38e87e..0078117 100644
--- a/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S
+++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -98,7 +98,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -185,7 +185,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -199,7 +199,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 1, %rax
@@ -230,7 +230,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -274,7 +274,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
ADD1 %xmm3, %xmm12
movaps -14 * SIZE(BO), %xmm3
ADD1 %xmm4, %xmm14
@@ -919,7 +919,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -941,7 +941,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -966,7 +966,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
ADD1 %xmm3, %xmm12
movaps -14 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -1267,7 +1267,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
@@ -1307,7 +1307,7 @@
movq K, %rax
salq $ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -1320,7 +1320,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 1, %rax
@@ -1329,7 +1329,7 @@
#ifdef LT
movq OFFSET, KK
#endif
-
+
movq M, I
sarq $1, I # i = (m >> 2)
NOBRANCH
@@ -1351,7 +1351,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-4 * SIZE, BB
@@ -1377,7 +1377,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
mulpd %xmm0, %xmm2
@@ -1707,7 +1707,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -1729,7 +1729,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -1749,7 +1749,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -1966,7 +1966,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S
index 18edeed..d90bfd8 100644
--- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S
+++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -55,7 +55,7 @@
#define BO %rsi
#define CO1 %r15
#define CO2 %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -213,7 +213,7 @@
movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
addpd %xmm14, %xmm7 ;\
movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
-
+
#ifndef CONJ
#define NN
@@ -227,9 +227,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -310,7 +310,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -328,10 +328,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -344,7 +344,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -362,7 +362,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCHNTA 56 * SIZE(B)
@@ -431,7 +431,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movq A, AO
@@ -476,7 +476,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -508,7 +508,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -1032,7 +1032,7 @@
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1057,7 +1057,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -1416,7 +1416,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1450,7 +1450,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
@@ -1467,7 +1467,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1483,7 +1483,7 @@
sarq $2, %rax
jle .L103
ALIGN_4
-
+
.L102:
movlpd 0 * SIZE(B), %xmm0
movlpd 1 * SIZE(B), %xmm1
@@ -1543,7 +1543,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1585,7 +1585,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -1944,7 +1944,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -1971,7 +1971,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -2212,7 +2212,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L199:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S
index 708a984..c52b058 100644
--- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S
+++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -55,7 +55,7 @@
#define CO1 %r15
#define CO2 %rbx
#define KK %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 128
@@ -338,9 +338,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -375,7 +375,7 @@
movq OLD_OFFSET, KK
movq KK, OFFSET
-
+
salq $ZBASE_SHIFT, LDC
#ifdef LN
@@ -399,7 +399,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, KK
@@ -422,7 +422,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -437,7 +437,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
@@ -462,7 +462,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -494,8 +494,8 @@
andq $-8, %rax
salq $4, %rax
je .L12
-
-.L1X:
+
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -976,7 +976,7 @@
#endif
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -996,7 +996,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1399,7 +1399,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1435,7 +1435,7 @@
movq K, %rax
salq $0 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
movq C, CO1 # coffset1 = c
@@ -1446,7 +1446,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
@@ -1471,7 +1471,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1864,7 +1864,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -1885,7 +1885,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2164,7 +2164,7 @@
subq $1, KK
#endif
ALIGN_3
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S
index d07930d..0d6531a 100644
--- a/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S
+++ b/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -98,7 +98,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -185,7 +185,7 @@
movq K, %rax
salq $2 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -199,7 +199,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 2, %rax
@@ -232,11 +232,11 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetchnta -32 * SIZE(BB)
subq $-16 * SIZE, BB
-
+
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -268,7 +268,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm12
@@ -1011,7 +1011,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -1034,7 +1034,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1059,7 +1059,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -1585,8 +1585,8 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L29:
#ifdef LN
movq K, %rax
@@ -1626,7 +1626,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -1640,7 +1640,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -1669,7 +1669,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -1695,7 +1695,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -2109,7 +2109,7 @@
decq I # i --
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -2132,7 +2132,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2157,7 +2157,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -2444,8 +2444,8 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#ifdef LN
movq K, %rax
@@ -2481,7 +2481,7 @@
movq K, %rax
salq $ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -2493,7 +2493,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -2522,7 +2522,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -2547,7 +2547,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -2833,7 +2833,7 @@
decq I # i --
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -2856,7 +2856,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2876,7 +2876,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -3065,8 +3065,8 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L69:
#ifdef LN
movq K, %rax
diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S
index f58cecd..53e5bb7 100644
--- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S
+++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -225,9 +225,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -314,7 +314,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -332,10 +332,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -348,7 +348,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -364,7 +364,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -448,7 +448,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -493,7 +493,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(BO), %xmm9
movaps 4 * SIZE(BO), %xmm11
@@ -525,7 +525,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -1317,7 +1317,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -1342,7 +1342,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1877,8 +1877,8 @@
salq $1 + ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L30:
testq $1, M
je .L39
@@ -1902,7 +1902,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movlps 0 * SIZE(AO), %xmm8
movhps 2 * SIZE(AO), %xmm8
@@ -2358,7 +2358,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
@@ -2394,7 +2394,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -2410,7 +2410,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -2426,7 +2426,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -2483,7 +2483,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2526,7 +2526,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -3262,7 +3262,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -3287,7 +3287,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -3652,8 +3652,8 @@
salq $1 + ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L70:
testq $1, M
je .L79
@@ -3677,7 +3677,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
movhps 2 * SIZE(AO), %xmm8
@@ -3950,7 +3950,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
diff --git a/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S
index 451aafa..a65c271 100644
--- a/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S
+++ b/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -97,7 +97,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -184,7 +184,7 @@
movq K, %rax
salq $ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
movq C, CO1
@@ -195,7 +195,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -221,7 +221,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -244,7 +244,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -464,7 +464,7 @@
#ifdef RT
subq $1, KK
#endif
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, N
@@ -481,7 +481,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -495,7 +495,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -521,7 +521,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -16 * SIZE(AO), %xmm0
@@ -547,7 +547,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -900,7 +900,7 @@
movq K, %rax
salq $2 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -914,7 +914,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 2, %rax
@@ -944,7 +944,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetchnta -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -980,7 +980,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm12
diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S
index 005b65e..0702b00 100644
--- a/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S
+++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -49,7 +49,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -84,7 +84,7 @@
#define AORIG 48(%rsp)
#define BORIG 56(%rsp)
#define BUFFER 128(%rsp)
-
+
#define PREFETCH_R (8 * 4 + 0)
#define PREFETCH_W (PREFETCH_R)
@@ -106,9 +106,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -186,7 +186,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -202,7 +202,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -218,7 +218,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -234,7 +234,7 @@
sarq $2, %rax
jle .L103
ALIGN_4
-
+
.L102:
movddup -16 * SIZE(B), %xmm8
movddup -15 * SIZE(B), %xmm9
@@ -284,7 +284,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movq A, AO
@@ -326,7 +326,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -692,7 +692,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -719,7 +719,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -964,7 +964,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L199:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -998,10 +998,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq 16 * SIZE + BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -1014,7 +1014,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1032,7 +1032,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
prefetcht0 (PREFETCH_R + 0) * SIZE(B)
@@ -1094,7 +1094,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1141,7 +1141,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
prefetcht2 0 * SIZE(BB)
@@ -1184,7 +1184,7 @@
jle .L15
ALIGN_4
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
movapd -16 * SIZE(AO), %xmm0
@@ -1742,7 +1742,7 @@
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1767,7 +1767,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm8, %xmm8
pxor %xmm9, %xmm9
@@ -2105,7 +2105,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S
index 4ed789a..7770f5d 100644
--- a/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S
+++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -98,7 +98,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -182,7 +182,7 @@
movq K, %rax
salq $ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -195,12 +195,12 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
-
+
movq M, I
sarq $1, I # i = (m >> 2)
NOBRANCH
@@ -222,7 +222,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -14 * SIZE(AO), %xmm1
@@ -245,7 +245,7 @@
jle .L55
ALIGN_4
-.L52:
+.L52:
movaps %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm7
mulpd %xmm0, %xmm2
@@ -575,7 +575,7 @@
decq I
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -597,7 +597,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
pxor %xmm8, %xmm8
@@ -617,7 +617,7 @@
jle .L65
ALIGN_4
-.L62:
+.L62:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
pshufd $0x4e, %xmm2, %xmm7
@@ -834,7 +834,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
@@ -873,7 +873,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -887,7 +887,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 1, %rax
@@ -919,7 +919,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetcht2 -16 * SIZE(BB)
subq $-8 * SIZE, BB
@@ -963,7 +963,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
ADD1 %xmm3, %xmm12
movaps -14 * SIZE(BO), %xmm3
ADD1 %xmm4, %xmm14
@@ -1608,7 +1608,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -1630,7 +1630,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movaps -16 * SIZE(AO), %xmm0
movaps -16 * SIZE(BO), %xmm2
@@ -1655,7 +1655,7 @@
jle .L25
ALIGN_4
-.L22:
+.L22:
ADD1 %xmm3, %xmm12
movaps -14 * SIZE(BO), %xmm3
pshufd $0x4e, %xmm2, %xmm7
@@ -1956,7 +1956,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S
index 1b589e0..2dffe2d 100644
--- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S
+++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -55,7 +55,7 @@
#define BO %rsi
#define CO1 %r15
#define CO2 %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 64
@@ -226,9 +226,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -309,7 +309,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -325,7 +325,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
@@ -342,7 +342,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 2), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -358,7 +358,7 @@
sarq $2, %rax
jle .L103
ALIGN_4
-
+
.L102:
movlpd 0 * SIZE(B), %xmm0
movlpd 1 * SIZE(B), %xmm1
@@ -418,7 +418,7 @@
decq %rax
jne .L104
ALIGN_4
-
+
.L105:
#if defined(LT) || defined(RN)
movq A, AO
@@ -460,7 +460,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -819,7 +819,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -846,7 +846,7 @@
movq KK, %rax
salq $0 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -1087,7 +1087,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L199:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -1110,7 +1110,7 @@
#endif
ALIGN_4
-.L100:
+.L100:
movq N, J
sarq $1, J # j = (n >> 2)
jle .L999
@@ -1121,11 +1121,11 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
/* Copying to Sub Buffer */
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -1138,7 +1138,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1156,7 +1156,7 @@
addq %rax, %rax
ALIGN_4
-
+
.L02:
PREFETCHNTA 56 * SIZE(B)
@@ -1225,7 +1225,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L05:
#if defined(LT) || defined(RN)
movq A, AO
@@ -1270,7 +1270,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1302,7 +1302,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(16 * 0)
KERNEL2(16 * 0)
KERNEL3(16 * 0)
@@ -1826,7 +1826,7 @@
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1851,7 +1851,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 2), BO
-#endif
+#endif
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -2210,7 +2210,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2235,7 +2235,7 @@
decq J # j --
jg .L01
ALIGN_3
-
+
.L999:
movq %rbx, %rsp
movq 0(%rsp), %rbx
diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S
index ca700eb..a473df4 100644
--- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S
+++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define M %rdi
#define N %rsi
#define K %rdx
@@ -55,7 +55,7 @@
#define CO1 %r15
#define CO2 %rbx
#define KK %rbp
-
+
#ifndef WINDOWS_ABI
#define STACKSIZE 128
@@ -338,9 +338,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -375,7 +375,7 @@
movq OLD_OFFSET, KK
movq KK, OFFSET
-
+
salq $ZBASE_SHIFT, LDC
#ifdef LN
@@ -399,7 +399,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, KK
@@ -420,7 +420,7 @@
movq K, %rax
salq $0 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
movq C, CO1 # coffset1 = c
@@ -431,7 +431,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
@@ -456,7 +456,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -849,7 +849,7 @@
decq I # i --
jg .L110
- ALIGN_4
+ ALIGN_4
.L130:
testq $1, M
@@ -870,7 +870,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1168,7 +1168,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -1183,7 +1183,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
#endif
@@ -1208,7 +1208,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -1240,8 +1240,8 @@
andq $-8, %rax
salq $4, %rax
je .L12
-
-.L1X:
+
+.L1X:
KERNEL1 (16 * 0)
KERNEL2 (16 * 0)
KERNEL3 (16 * 0)
@@ -1722,7 +1722,7 @@
#endif
decq I # i --
jg .L10
- ALIGN_4
+ ALIGN_4
.L30:
testq $1, M
@@ -1742,7 +1742,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
movapd 0 * SIZE(AO), %xmm8
pxor %xmm0, %xmm0
@@ -2145,7 +2145,7 @@
addq %rax, AORIG
#endif
ALIGN_4
-
+
.L99:
#ifdef LN
leaq (, K, SIZE), %rax
@@ -2166,7 +2166,7 @@
decq J # j --
jg .L01
-
+
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S
index a5f0134..ddb5fe0 100644
--- a/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S
+++ b/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx
@@ -51,7 +51,7 @@
#define B %r8
#define C %r9
#define LDC %r10
-
+
#define I %r11
#define AO %rdi
#define BO %rsi
@@ -98,7 +98,7 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
movq %rbx, 0(%rsp)
@@ -182,7 +182,7 @@
movq K, %rax
salq $ZBASE_SHIFT, %rax
subq %rax, B
-
+
subq LDC, C
#endif
@@ -194,7 +194,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -223,7 +223,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -248,7 +248,7 @@
jle .L55
ALIGN_3
-.L52:
+.L52:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -534,7 +534,7 @@
decq I # i --
BRANCH
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $1, M
@@ -557,7 +557,7 @@
leaq (B, %rax, 1), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -577,7 +577,7 @@
jle .L65
ALIGN_3
-.L62:
+.L62:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -766,8 +766,8 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L69:
#ifdef LN
movq K, %rax
@@ -803,7 +803,7 @@
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 2), %rax
subq %rax, C
#endif
@@ -817,7 +817,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
#ifdef LT
movq OFFSET, KK
@@ -846,7 +846,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
@@ -872,7 +872,7 @@
jle .L35
ALIGN_3
-.L32:
+.L32:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm8
@@ -1286,7 +1286,7 @@
decq I # i --
BRANCH
jg .L31
- ALIGN_4
+ ALIGN_4
.L40:
testq $1, M
@@ -1309,7 +1309,7 @@
leaq (B, %rax, 2), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -1334,7 +1334,7 @@
jle .L45
ALIGN_3
-.L42:
+.L42:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -1621,8 +1621,8 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L49:
#ifdef LN
movq K, %rax
@@ -1661,7 +1661,7 @@
movq K, %rax
salq $2 + ZBASE_SHIFT, %rax
subq %rax, B
-
+
leaq (, LDC, 4), %rax
subq %rax, C
#endif
@@ -1675,7 +1675,7 @@
#ifdef LN
movq OFFSET, KK
addq M, KK
-#endif
+#endif
movq K, %rax
salq $ZBASE_SHIFT + 2, %rax
@@ -1708,11 +1708,11 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
prefetchnta -32 * SIZE(BB)
subq $-16 * SIZE, BB
-
+
xorps %xmm1, %xmm1
movaps -32 * SIZE(AO), %xmm0
xorps %xmm2, %xmm2
@@ -1744,7 +1744,7 @@
jle .L15
ALIGN_3
-.L12:
+.L12:
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
ADD1 %xmm1, %xmm12
@@ -2487,7 +2487,7 @@
decq I # i --
BRANCH
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $1, M
@@ -2510,7 +2510,7 @@
leaq (B, %rax, 4), BO
#else
movq B, BO
-#endif
+#endif
xorps %xmm1, %xmm1
movddup -32 * SIZE(AO), %xmm0
@@ -2535,7 +2535,7 @@
jle .L25
ALIGN_3
-.L22:
+.L22:
ADD1 %xmm1, %xmm8
pshufd $0xa0, %xmm5, %xmm1
mulps %xmm0, %xmm1
@@ -3061,8 +3061,8 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L29:
#ifdef LN
movq K, %rax
diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S
index 2c47ce3..20b93e1 100644
--- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S
+++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S
@@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
-
+
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
@@ -225,9 +225,9 @@
PROLOGUE
PROFCODE
-
+
subq $STACKSIZE, %rsp
-
+
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
@@ -315,7 +315,7 @@
#ifdef RN
negq KK
-#endif
+#endif
#ifdef RT
movq N, %rax
@@ -331,7 +331,7 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
@@ -347,7 +347,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 1), B
leaq (BO, %rax, 4), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -363,7 +363,7 @@
sarq $2, %rax
jle .L43
ALIGN_4
-
+
.L42:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -420,7 +420,7 @@
decq %rax
jne .L44
ALIGN_4
-
+
.L50:
#if defined(LT) || defined(RN)
movq A, AO
@@ -463,7 +463,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1199,7 +1199,7 @@
decq I # i --
jg .L51
- ALIGN_4
+ ALIGN_4
.L60:
testq $2, M
@@ -1224,7 +1224,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -1589,8 +1589,8 @@
salq $1 + ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L70:
testq $1, M
je .L79
@@ -1614,7 +1614,7 @@
movq KK, %rax
salq $ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
movhps 2 * SIZE(AO), %xmm8
@@ -1887,7 +1887,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L79:
#ifdef LN
@@ -1922,10 +1922,10 @@
movq OFFSET, %rax
addq M, %rax
movq %rax, KK
-#endif
+#endif
leaq BUFFER, BO
-
+
#ifdef RT
movq K, %rax
salq $1 + ZBASE_SHIFT, %rax
@@ -1938,7 +1938,7 @@
salq $ZBASE_SHIFT, %rax
leaq (B, %rax, 2), B
leaq (BO, %rax, 8), BO
-#endif
+#endif
#if defined(LT)
movq OFFSET, %rax
@@ -1954,7 +1954,7 @@
sarq $2, %rax
jle .L03
ALIGN_4
-
+
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
@@ -2038,7 +2038,7 @@
decq %rax
jne .L04
ALIGN_4
-
+
.L10:
#if defined(LT) || defined(RN)
movq A, AO
@@ -2083,7 +2083,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(BO), %xmm9
movaps 4 * SIZE(BO), %xmm11
@@ -2115,7 +2115,7 @@
andq $-8, %rax
salq $4, %rax
je .L15
-.L1X:
+.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
@@ -2907,7 +2907,7 @@
decq I # i --
jg .L11
- ALIGN_4
+ ALIGN_4
.L20:
testq $2, M
@@ -2932,7 +2932,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movaps 0 * SIZE(AO), %xmm8
movaps 16 * SIZE(AO), %xmm10
@@ -3467,8 +3467,8 @@
salq $1 + ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
-
+ ALIGN_4
+
.L30:
testq $1, M
je .L39
@@ -3492,7 +3492,7 @@
movq KK, %rax
salq $1 + ZBASE_SHIFT, %rax
leaq (BO, %rax, 4), BO
-#endif
+#endif
movsd 0 * SIZE(AO), %xmm8
movhps 2 * SIZE(AO), %xmm8
@@ -3948,7 +3948,7 @@
salq $ZBASE_SHIFT, %rax
addq %rax, AORIG
#endif
- ALIGN_4
+ ALIGN_4
.L39:
#ifdef LN
diff --git a/lapack-devel.log b/lapack-devel.log
index 8243bb8..739e7aa 100644
--- a/lapack-devel.log
+++ b/lapack-devel.log
@@ -6,14 +6,14 @@ Platform: BULLDOZER single thread
--> LAPACK TESTING SUMMARY <--
Processing LAPACK Testing output found in the TESTING directory
-SUMMARY nb test run numerical error other error
-================ =========== ================= ================
-REAL 1079349 0 (0.000%) 0 (0.000%)
-DOUBLE PRECISION 1080161 0 (0.000%) 0 (0.000%)
-COMPLEX 556022 0 (0.000%) 0 (0.000%)
-COMPLEX16 556834 0 (0.000%) 0 (0.000%)
+SUMMARY nb test run numerical error other error
+================ =========== ================= ================
+REAL 1079349 0 (0.000%) 0 (0.000%)
+DOUBLE PRECISION 1080161 0 (0.000%) 0 (0.000%)
+COMPLEX 556022 0 (0.000%) 0 (0.000%)
+COMPLEX16 556834 0 (0.000%) 0 (0.000%)
---> ALL PRECISIONS 3272366 0 (0.000%) 0 (0.000%)
+--> ALL PRECISIONS 3272366 0 (0.000%) 0 (0.000%)
========================================================================================
diff --git a/lapack-netlib/Makefile b/lapack-netlib/Makefile
deleted file mode 100644
index ad6a457..0000000
--- a/lapack-netlib/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-clean:
diff --git a/lapack-netlib/TESTING/Makefile b/lapack-netlib/TESTING/Makefile
deleted file mode 100644
index ad6a457..0000000
--- a/lapack-netlib/TESTING/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-clean:
diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c
index fdc4eae..75c258b 100644
--- a/lapack/getf2/getf2_k.c
+++ b/lapack/getf2/getf2_k.c
@@ -61,7 +61,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
lda = args -> lda;
ipiv = (blasint *)args -> c;
offset = 0;
-
+
if (range_n) {
m -= range_n[0];
n = range_n[1] - range_n[0];
@@ -71,13 +71,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
info = 0;
b = a;
-
+
for (j = 0; j < n; j++) {
len = MIN(j, m);
for (i = 0; i < len; i++) {
- ip = ipiv[i + offset] - 1 - offset;
+ ip = ipiv[i + offset] - 1 - offset;
if (ip != i) {
temp1 = *(b + i);
temp2 = *(b + ip);
@@ -85,7 +85,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
*(b + ip) = temp1;
}
}
-
+
for (i = 1; i < len; i++) {
b[i] -= DOTU_K(i, a + i, lda, b, 1);
}
diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c
index ae8c6fd..9bf47bc 100644
--- a/lapack/getf2/zgetf2_k.c
+++ b/lapack/getf2/zgetf2_k.c
@@ -63,7 +63,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
lda = args -> lda;
ipiv = (blasint *)args -> c;
offset = 0;
-
+
if (range_n) {
m -= range_n[0];
n = range_n[1] - range_n[0];
@@ -73,13 +73,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
info = 0;
b = a;
-
+
for (j = 0; j < n; j++) {
len = MIN(j, m);
for (i = 0; i < len; i++) {
- ip = ipiv[i + offset] - 1 - offset;
+ ip = ipiv[i + offset] - 1 - offset;
if (ip != i) {
temp1 = *(b + i * 2 + 0);
temp2 = *(b + i * 2 + 1);
@@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
*(b + ip * 2 + 1) = temp2;
}
}
-
+
ZTRSV_NLU(len, a, lda, b, 1, sb);
if (j < m) {
@@ -124,7 +124,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
}
if (j + 1 < m) {
- SCAL_K(m - j - 1, 0, 0, temp3, temp4,
+ SCAL_K(m - j - 1, 0, 0, temp3, temp4,
b + (j + 1) * 2, 1, NULL, 0, NULL, 0);
}
} else {
diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c
index 3dbc70e..a76be3b 100644
--- a/lapack/getrf/getrf_parallel.c
+++ b/lapack/getrf/getrf_parallel.c
@@ -44,7 +44,7 @@ static FLOAT dm1 = -1.;
double sqrt(double);
//In this case, the recursive getrf_parallel may overflow the stack.
-//Instead, use malloc to alloc job_t.
+//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > GETRF_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
@@ -123,21 +123,21 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){
min_jj = js + min_j - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
if (0 && GEMM_UNROLL_N <= 8) {
- LASWP_NCOPY(min_jj, off + 1, off + k,
+ LASWP_NCOPY(min_jj, off + 1, off + k,
c + (- off + jjs * lda) * COMPSIZE, lda,
ipiv, sbb + k * (jjs - js) * COMPSIZE);
} else {
- LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
+ LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
#ifdef COMPLEX
ZERO,
#endif
c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1);
-
+
GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sbb + (jjs - js) * k * COMPSIZE);
}
@@ -145,13 +145,13 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
for (is = 0; is < k; is += GEMM_P) {
min_i = k - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
TRSM_KERNEL_LT(min_i, min_jj, k, dm1,
#ifdef COMPLEX
ZERO,
#endif
sb + k * is * COMPSIZE,
- sbb + (jjs - js) * k * COMPSIZE,
+ sbb + (jjs - js) * k * COMPSIZE,
c + (is + jjs * lda) * COMPSIZE, lda, is);
}
}
@@ -161,9 +161,9 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra
for (is = 0; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa);
-
+
GEMM_KERNEL_N(min_i, min_j, k, dm1,
#ifdef COMPLEX
ZERO,
@@ -234,7 +234,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
c += range_m[0] * COMPSIZE;
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
buffer[0] = sbb;
@@ -243,10 +243,10 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
}
for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
-
+
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
-
+
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@@ -254,43 +254,43 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
if (0 && GEMM_UNROLL_N <= 8) {
printf("helllo\n");
- LASWP_NCOPY(min_jj, off + 1, off + k,
+ LASWP_NCOPY(min_jj, off + 1, off + k,
b + (- off + jjs * lda) * COMPSIZE, lda,
ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE);
} else {
- LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
+ LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
#ifdef COMPLEX
ZERO,
#endif
b + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1);
-
- GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda,
+
+ GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda,
buffer[bufferside] + (jjs - xxx) * k * COMPSIZE);
}
for (is = 0; is < k; is += GEMM_P) {
min_i = k - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
TRSM_KERNEL_LT(min_i, min_jj, k, dm1,
#ifdef COMPLEX
ZERO,
#endif
sb + k * is * COMPSIZE,
- buffer[bufferside] + (jjs - xxx) * k * COMPSIZE,
+ buffer[bufferside] + (jjs - xxx) * k * COMPSIZE,
b + (is + jjs * lda) * COMPSIZE, lda, is);
}
}
-
+
for (i = 0; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
}
-
+
flag[mypos * CACHE_LINE_SIZE] = 0;
-
+
if (m == 0) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
@@ -301,21 +301,21 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
min_i = m - is;
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
}
-
+
ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
-
+
current = mypos;
do {
-
+
div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
if ((current != mypos) && (!is)) {
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
}
@@ -323,18 +323,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, lda, is, xxx);
-
+
if (is + min_i >= m) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
}
-
+
current ++;
if (current >= args -> nthreads) current = 0;
-
+
} while (current != mypos);
}
-
+
for (i = 0; i < args -> nthreads; i++) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
@@ -382,7 +382,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -390,7 +390,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
m = args -> m;
@@ -408,7 +408,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
}
if (m <= 0 || n <= 0) return 0;
-
+
newarg.c = ipiv;
newarg.lda = lda;
@@ -428,14 +428,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
bk = mn;
if (bk > next_bk) bk = next_bk;
-
+
range_n_new[0] = offset;
range_n_new[1] = offset + bk;
-
+
iinfo = CNAME(args, NULL, range_n_new, sa, sb, 0);
-
+
if (iinfo && !info) info = iinfo;
-
+
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
@@ -449,24 +449,24 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);
sbb = (FLOAT *)((((BLASULONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
-
+
is = 0;
num_cpu = 0;
while (is < mn) {
-
+
width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (width > mn - is - bk) width = mn - is - bk;
if (width < bk) {
next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);
-
+
if (next_bk > bk) next_bk = bk;
width = next_bk;
if (width > mn - is - bk) width = mn - is - bk;
}
-
+
if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);
mm = m - bk - is;
@@ -479,7 +479,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.n = nn;
newarg.k = bk;
newarg.ldb = is + offset;
-
+
nn -= width;
range_n_mine[0] = 0;
@@ -489,16 +489,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
range_M[0] = 0;
num_cpu = 0;
-
+
while (nn > 0){
-
+
if (mm >= nn) {
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (nn < width) width = nn;
nn -= width;
range_N[num_cpu + 1] = range_N[num_cpu] + width;
-
+
width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
if (mm < width) width = mm;
if (nn <= 0) width = mm;
@@ -517,7 +517,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (mm <= 0) width = nn;
nn -= width;
range_N[num_cpu + 1] = range_N[num_cpu] + width;
-
+
}
queue[num_cpu].mode = mode;
@@ -529,13 +529,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
flag[num_cpu * CACHE_LINE_SIZE] = 1;
-
+
num_cpu ++;
}
-
+
newarg.nthreads = num_cpu;
-
+
if (num_cpu > 0) {
for (j = 0; j < num_cpu; j++) {
for (i = 0; i < num_cpu; i++) {
@@ -550,20 +550,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
bk = mn - is;
if (bk > next_bk) bk = next_bk;
-
+
range_n_new[0] = offset + is;
range_n_new[1] = offset + is + bk;
if (num_cpu > 0) {
queue[num_cpu - 1].next = NULL;
-
+
exec_blas_async(0, &queue[0]);
-
+
inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
-
+
iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
-
+
if (iinfo && !info) info = iinfo + is;
for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
@@ -577,19 +577,19 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
if (iinfo && !info) info = iinfo + is;
-
+
}
-
+
}
-
+
next_bk = init_bk;
is = 0;
-
+
while (is < mn) {
-
+
bk = mn - is;
if (bk > next_bk) bk = next_bk;
-
+
width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (width > mn - is - bk) width = mn - is - bk;
@@ -598,13 +598,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (next_bk > bk) next_bk = bk;
}
- blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha,
+ blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha,
a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
-
+
is += bk;
}
-
+
#ifdef USE_ALLOC_HEAP
free(job);
#endif
@@ -638,7 +638,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -646,7 +646,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
m = args -> m;
@@ -664,7 +664,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
}
if (m <= 0 || n <= 0) return 0;
-
+
newarg.c = ipiv;
newarg.lda = lda;
newarg.common = NULL;
@@ -700,9 +700,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
range_n_new[0] = offset;
range_n_new[1] = offset + bk;
-
+
info = CNAME(args, NULL, range_n_new, sa, sb, 0);
-
+
TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);
is = 0;
@@ -714,7 +714,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
width = FORMULA1(m, n, is, bk, args -> nthreads);
width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
-
+
if (width < bk) {
next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
@@ -729,7 +729,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
width = next_bk;
}
-
+
if (width > mn - is - bk) {
next_bk = mn - is - bk;
width = next_bk;
@@ -742,10 +742,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
range[0] = 0;
range[1] = width;
-
+
num_cpu = 1;
nn -= width;
-
+
newarg.a = sb;
newarg.b = a + (is + is * lda) * COMPSIZE;
newarg.d = (void *)flag;
@@ -753,16 +753,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.n = n - bk - is;
newarg.k = bk;
newarg.ldb = is + offset;
-
+
while (nn > 0){
-
+
width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu);
-
+
nn -= width;
if (nn < 0) width = width + nn;
-
+
range[num_cpu + 1] = range[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
//queue[num_cpu].routine = inner_advanced_thread;
queue[num_cpu].routine = (void *)inner_basic_thread;
@@ -776,21 +776,21 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
num_cpu ++;
}
-
+
queue[num_cpu - 1].next = NULL;
is += bk;
-
+
bk = n - is;
if (bk > next_bk) bk = next_bk;
-
+
range_n_new[0] = offset + is;
range_n_new[1] = offset + is + bk;
-
+
if (num_cpu > 1) {
exec_blas_async(1, &queue[1]);
-
+
#if 0
inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0);
@@ -823,30 +823,30 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
#endif
for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
-
+
TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb);
-
+
} else {
inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);
-
+
iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
}
if (iinfo && !info) info = iinfo + is;
-
+
}
-
+
next_bk = init_bk;
bk = init_bk;
-
+
is = 0;
-
+
while (is < mn) {
-
+
bk = mn - is;
if (bk > next_bk) bk = next_bk;
-
+
width = FORMULA1(m, n, is, bk, args -> nthreads);
width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
@@ -867,13 +867,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
width = next_bk;
}
- blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha,
+ blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha,
a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
-
+
is += bk;
}
-
+
return info;
}
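
The getrf_parallel.c hunks above keep the existing scheme in which the per-thread job_t table is malloc'd rather than placed on the stack when MAX_CPU_NUMBER exceeds GETRF_MEM_ALLOC_THRESHOLD, since the recursive getrf_parallel could otherwise overflow the stack. A minimal standalone sketch of that allocation pattern follows; the job_t layout and the threshold value here are illustrative stand-ins, not the actual OpenBLAS definitions:

  #include <stdio.h>
  #include <stdlib.h>

  /* Illustrative stand-ins; the real definitions live in the common*.h headers. */
  #define MAX_CPU_NUMBER 64
  #define GETRF_MEM_ALLOC_THRESHOLD 32
  #define DIVIDE_RATE 2
  #define CACHE_LINE_SIZE 8

  typedef struct {
    volatile long working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
  } job_t;

  /* Use heap allocation when the job table would be too large for the stack. */
  #if MAX_CPU_NUMBER > GETRF_MEM_ALLOC_THRESHOLD
  #define USE_ALLOC_HEAP
  #endif

  int main(void) {
  #ifdef USE_ALLOC_HEAP
    job_t *job = malloc(MAX_CPU_NUMBER * sizeof(job_t));
    if (job == NULL) {
      fprintf(stderr, "memory allocation failed\n");
      return 1;
    }
  #else
    job_t job[MAX_CPU_NUMBER];    /* small enough to live on the stack */
  #endif

    job[0].working[0][0] = 0;     /* ... factorization work would go here ... */

  #ifdef USE_ALLOC_HEAP
    free(job);
  #endif
    return 0;
  }
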
diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c
index 6eda30a..7e23197 100644
--- a/lapack/getrf/getrf_parallel_omp.c
+++ b/lapack/getrf/getrf_parallel_omp.c
@@ -68,7 +68,7 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
c += range_n[0] * lda * COMPSIZE;
d += range_n[0] * lda * COMPSIZE;
}
-
+
for (js = 0; js < n; js += REAL_GEMM_R) {
min_j = n - js;
if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R;
@@ -76,32 +76,32 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){
min_jj = js + min_j - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
#if 0
- LASWP_NCOPY(min_jj, off + 1, off + k,
+ LASWP_NCOPY(min_jj, off + 1, off + k,
c + (- off + jjs * lda) * COMPSIZE, lda,
ipiv, sb + k * (jjs - js) * COMPSIZE);
#else
- LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
+ LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
#ifdef COMPLEX
ZERO,
#endif
c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1);
-
+
GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE);
#endif
for (is = 0; is < k; is += GEMM_P) {
min_i = k - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
TRSM_KERNEL_LT(min_i, min_jj, k, dm1,
#ifdef COMPLEX
ZERO,
#endif
(FLOAT *)args -> a + k * is * COMPSIZE,
- sb + (jjs - js) * k * COMPSIZE,
+ sb + (jjs - js) * k * COMPSIZE,
c + (is + jjs * lda) * COMPSIZE, lda, is);
}
}
@@ -109,9 +109,9 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (is = 0; is < m; is += GEMM_P){
min_i = m - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa);
-
+
GEMM_KERNEL_N(min_i, min_j, k, dm1,
#ifdef COMPLEX
ZERO,
@@ -141,7 +141,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -149,7 +149,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
m = args -> m;
@@ -167,7 +167,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
}
if (m <= 0 || n <= 0) return 0;
-
+
mn = MIN(m, n);
blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
@@ -177,13 +177,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
info = GETF2(args, NULL, range_n, sa, sb, 0);
return info;
}
-
+
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
info = 0;
for (j = 0; j < mn; j += blocking) {
-
+
jb = mn - j;
if (jb > blocking) jb = blocking;
@@ -198,9 +198,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (iinfo && !info) info = iinfo + j;
if (j + jb < n) {
-
+
TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb);
-
+
newarg.m = m - jb - j;
newarg.n = n - jb - j;
newarg.k = jb;
@@ -215,7 +215,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.nthreads = args -> nthreads;
gemm_thread_n(mode, &newarg, NULL, NULL, (void *)inner_thread, sa, sbb, args -> nthreads);
-
+
}
}
@@ -226,7 +226,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
ZERO,
#endif
a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1);
-
+
}
return info;
diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c
index f1818ea..e60a16c 100644
--- a/lapack/getrf/getrf_single.c
+++ b/lapack/getrf/getrf_single.c
@@ -71,7 +71,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
}
if (m <= 0 || n <= 0) return 0;
-
+
mn = MIN(m, n);
blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
@@ -81,13 +81,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
info = GETF2(args, NULL, range_n, sa, sb, 0);
return info;
}
-
+
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
info = 0;
for (j = 0; j < mn; j += blocking) {
-
+
jb = mn - j;
if (jb > blocking) jb = blocking;
@@ -102,53 +102,53 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (iinfo && !info) info = iinfo + j;
if (j + jb < n) {
-
+
TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb);
-
+
for (js = j + jb; js < n; js += REAL_GEMM_R){
jmin = n - js;
if (jmin > REAL_GEMM_R) jmin = REAL_GEMM_R;
-
+
for (jjs = js; jjs < js + jmin; jjs += GEMM_UNROLL_N){
min_jj = js + jmin - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
-#if 1
- LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO,
+
+#if 1
+ LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO,
#ifdef COMPLEX
ZERO,
#endif
a + (- offset + jjs * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1);
-
+
GEMM_ONCOPY (jb, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sbb + jb * (jjs - js) * COMPSIZE);
#else
- LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset,
+ LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset,
a + (- offset + jjs * lda) * COMPSIZE, lda, ipiv, sbb + jb * (jjs - js) * COMPSIZE);
#endif
-
-
+
+
for (jc = 0; jc < jb; jc += GEMM_P) {
jcmin = jb - jc;
if (jcmin > GEMM_P) jcmin = GEMM_P;
-
+
TRSM_KERNEL_LT(jcmin, min_jj, jb, dm1,
#ifdef COMPLEX
ZERO,
#endif
sb + jb * jc * COMPSIZE,
- sbb + jb * (jjs - js) * COMPSIZE,
+ sbb + jb * (jjs - js) * COMPSIZE,
a + (j + jc + jjs * lda) * COMPSIZE, lda, jc);
}
}
for (is = j + jb; is < m; is += GEMM_P){
-
+
imin = m - is;
if (imin > GEMM_P) imin = GEMM_P;
GEMM_ITCOPY (jb, imin, offsetA + is * COMPSIZE, lda, sa);
-
+
GEMM_KERNEL_N(imin, jmin, jb, dm1,
#ifdef COMPLEX
ZERO,
@@ -158,7 +158,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
}
}
}
-
+
for (j = 0; j < mn; j += jb) {
jb = MIN(mn - j, blocking);
LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO,
@@ -166,7 +166,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
ZERO,
#endif
a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1);
-
+
}
return info;
diff --git a/lapack/getrs/getrs_parallel.c b/lapack/getrs/getrs_parallel.c
index 3a7e426..4b589fe 100644
--- a/lapack/getrs/getrs_parallel.c
+++ b/lapack/getrs/getrs_parallel.c
@@ -51,14 +51,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#ifndef TRANS
- LASWP_PLUS(n, 1, args -> m, ZERO,
+ LASWP_PLUS(n, 1, args -> m, ZERO,
(FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1);
TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
TRSM_LNUN (args, range_m, range_n, sa, sb, 0);
#else
TRSM_LTUN (args, range_m, range_n, sa, sb, 0);
TRSM_LTLU (args, range_m, range_n, sa, sb, 0);
- LASWP_MINUS(n, 1, args -> m, ZERO,
+ LASWP_MINUS(n, 1, args -> m, ZERO,
(FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1);
#endif
@@ -81,7 +81,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads);
}
@@ -97,7 +97,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT);
#else
mode = BLAS_SINGLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT);
-#endif
+#endif
gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads);
}
diff --git a/lapack/getrs/zgetrs_parallel.c b/lapack/getrs/zgetrs_parallel.c
index b0d3fb0..d4abc49 100644
--- a/lapack/getrs/zgetrs_parallel.c
+++ b/lapack/getrs/zgetrs_parallel.c
@@ -104,7 +104,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads);
}
diff --git a/lapack/getrs/zgetrs_single.c b/lapack/getrs/zgetrs_single.c
index 3910d0e..ee4ac81 100644
--- a/lapack/getrs/zgetrs_single.c
+++ b/lapack/getrs/zgetrs_single.c
@@ -45,11 +45,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1);
TRSM_LNLU(args, range_m, range_n, sa, sb, 0);
- TRSM_LNUN(args, range_m, range_n, sa, sb, 0);
+ TRSM_LNUN(args, range_m, range_n, sa, sb, 0);
#elif TRANS == 2
TRSM_LTUN(args, range_m, range_n, sa, sb, 0);
TRSM_LTLU(args, range_m, range_n, sa, sb, 0);
-
+
LASWP_MINUS(args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1);
#elif TRANS == 3
LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1);
diff --git a/lapack/laswp/generic/Makefile b/lapack/laswp/generic/Makefile
index bc9ab80..8675e1a 100644
--- a/lapack/laswp/generic/Makefile
+++ b/lapack/laswp/generic/Makefile
@@ -12,7 +12,7 @@ ZLASWP = ../generic/zlaswp_k.c
endif
LASWP_DEPS = ../generic/laswp_k_1.c ../generic/laswp_k_2.c \
- ../generic/laswp_k_4.c ../generic/laswp_k_8.c
+ ../generic/laswp_k_4.c ../generic/laswp_k_8.c
ZLASWP_DEPS = ../generic/zlaswp_k_1.c ../generic/zlaswp_k_2.c \
../generic/zlaswp_k_4.c
diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c
index 1b0db5f..88648cf 100644
--- a/lapack/laswp/generic/laswp_k_1.c
+++ b/lapack/laswp/generic/laswp_k_1.c
@@ -45,7 +45,7 @@
#define a2 (a1 - 1)
#endif
-int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2, rows;
@@ -53,7 +53,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
FLOAT *a1;
FLOAT *b1, *b2;
FLOAT A1, A2, B1, B2;
-
+
a--;
k1 --;
@@ -64,7 +64,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
if (n <= 0) return 0;
-
+
rows = k2-k1;
if (rows <=0) return 0;
if (rows == 1) {
@@ -72,7 +72,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
-
+
if(a1 == b1) return 0;
for(j=0; j<n; j++){
@@ -102,10 +102,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = (rows >> 1);
i--;
@@ -136,22 +136,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
A2 = *a2;
B1 = *b1;
B2 = *b2;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -168,11 +168,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a1 = A2;
*a2 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -184,10 +184,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b2 = A2;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
#ifndef MINUS
a1 += 2;
#else
@@ -205,12 +205,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -227,11 +227,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a1 = A2;
*a2 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -249,26 +249,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 -= 2;
#endif
-
+
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv;
- b1 = a + ip1;
+ b1 = a + ip1;
A1 = *a1;
B1 = *b1;
*a1 = B1;
*b1 = A1;
}
-
+
a += lda;
-
+
j --;
} while (j > 0);
}
return 0;
-}
+}
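
The laswp_k_1, _2, _4 and _8 kernels touched in the following hunks are unrolled variants of the same row-interchange (LASWP) loop, processing 1, 2, 4 or 8 columns per pass; the whitespace-only changes do not alter that logic. As a rough reference, the operation they implement looks like the sketch below, with simplified 0-based indexing and none of the unrolling or loop pipelining (an illustration, not the OpenBLAS kernel interface):

  #include <stddef.h>

  /* Apply the row interchanges recorded in ipiv (0-based here; LAPACK's are
   * 1-based) to rows k1..k2-1 of the n-column matrix a, stored column-major
   * with leading dimension lda. */
  static void laswp_reference(size_t n, size_t k1, size_t k2,
                              double *a, size_t lda, const size_t *ipiv) {
    for (size_t j = 0; j < n; j++) {        /* one column at a time */
      double *col = a + j * lda;
      for (size_t i = k1; i < k2; i++) {
        size_t ip = ipiv[i];
        if (ip != i) {                      /* swap rows i and ipiv[i] */
          double tmp = col[i];
          col[i] = col[ip];
          col[ip] = tmp;
        }
      }
    }
  }

The unrolled kernels gain their speed by swapping several columns' worth of elements per pivot lookup, which is what the a1/a3/a5/a7 and b1..b8 pointer groups in the surrounding hunks carry between iterations.
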
diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c
index 8a8a89b..93b9a2c 100644
--- a/lapack/laswp/generic/laswp_k_2.c
+++ b/lapack/laswp/generic/laswp_k_2.c
@@ -47,7 +47,7 @@
#define a4 (a3 - 1)
#endif
-int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2, rows;
@@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
FLOAT *a1, *a3;
FLOAT *b1, *b2, *b3, *b4;
FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
-
+
a--;
k1 --;
@@ -66,7 +66,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
if (n <= 0) return 0;
-
+
j = (n >> 1);
rows = k2-k1;
if (rows <=0) return 0;
@@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
-
+
if(a1 == b1) return 0;
for(j=0; j<n; j++){
@@ -93,28 +93,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
if (j > 0) {
do {
piv = ipiv;
-
+
#ifndef MINUS
a1 = a + k1 + 1;
#else
a1 = a + k2;
#endif
-
+
a3 = a1 + 1 * lda;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
i = ((rows) >> 1);
-
+
// Loop pipeline
i--;
@@ -137,31 +137,31 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
B2 = *b2;
B3 = *b3;
B4 = *b4;
-
+
A1 = *a1;
A2 = *a2;
A3 = *a3;
A4 = *a4;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -186,13 +186,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = A4;
*a4 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -211,13 +211,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b4 = A4;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
#ifndef MINUS
a1 += 2;
a3 += 2;
@@ -233,7 +233,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
B2 = *b2;
B3 = *b3;
B4 = *b4;
-
+
A1 = *a1;
A2 = *a2;
A3 = *a3;
@@ -245,14 +245,14 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a2 = A1;
*a3 = A4;
*a4 = A3;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -277,13 +277,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = A4;
*a4 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -310,9 +310,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a3 -= 2;
#endif
- //Remain
+ //Remain
i = ((rows) & 1);
-
+
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
@@ -328,7 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = B3;
*b3 = A3;
}
-
+
a += 2 * lda;
j --;
} while (j > 0);
@@ -342,15 +342,15 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 = a + k2;
#endif
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((rows) >> 1);
i --;
@@ -359,22 +359,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
A2 = *a2;
B1 = *b1;
B2 = *b2;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -391,11 +391,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a1 = A2;
*a2 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -407,10 +407,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b2 = A2;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
#ifndef MINUS
a1 += 2;
#else
@@ -418,7 +418,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
i --;
}
-
+
//Loop Ending (n=1)
A1 = *a1;
A2 = *a2;
@@ -428,12 +428,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -450,11 +450,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a1 = A2;
*a2 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -472,13 +472,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 -= 2;
#endif
-
+
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv;
- b1 = a + ip1;
+ b1 = a + ip1;
A1 = *a1;
B1 = *b1;
@@ -488,5 +488,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
}
return 0;
-}
+}
diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c
index 86ee949..5cb6517 100644
--- a/lapack/laswp/generic/laswp_k_4.c
+++ b/lapack/laswp/generic/laswp_k_4.c
@@ -51,7 +51,7 @@
#define a8 (a7 - 1)
#endif
-int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2, rows;
@@ -61,7 +61,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
FLOAT *b5, *b6, *b7, *b8;
FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
FLOAT A5, A6, B5, B6, A7, A8, B7, B8;
-
+
a--;
k1 --;
@@ -80,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
-
+
if(a1 == b1) return 0;
for(j=0; j<n; j++){
@@ -105,7 +105,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 = a + k2;
#endif
-
+
a3 = a1 + 1 * lda;
a5 = a1 + 2 * lda;
a7 = a1 + 3 * lda;
@@ -114,10 +114,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
@@ -126,7 +126,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b8 = b2 + 3 * lda;
i = ((k2 - k1) >> 1);
-
+
i--; //Loop pipeline
//Main Loop
while (i > 0) {
@@ -147,12 +147,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
B6 = *b6;
B7 = *b7;
B8 = *b8;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
@@ -163,7 +163,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a6 = A5;
*a7 = A8;
*a8 = A7;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
@@ -174,7 +174,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a8 = B8;
*b8 = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -215,7 +215,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a7 = A8;
*a8 = B7;
*b7 = A7;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
@@ -225,7 +225,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b5 = A5;
*a7 = B7;
*b7 = A7;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -258,17 +258,17 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b8 = A8;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
b6 = b2 + 2 * lda;
b7 = b1 + 3 * lda;
b8 = b2 + 3 * lda;
-
+
#ifndef MINUS
a1 += 2;
a3 += 2;
@@ -312,7 +312,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a6 = A5;
*a7 = A8;
*a8 = A7;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
@@ -323,7 +323,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a8 = B8;
*b8 = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -364,7 +364,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a7 = A8;
*a8 = B7;
*b7 = A7;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
@@ -374,7 +374,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b5 = A5;
*a7 = B7;
*b7 = A7;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -420,9 +420,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a7 -= 2;
#endif
- //Remain
+ //Remain
i = ((rows) & 1);
-
+
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
@@ -449,9 +449,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a7 = B7;
*b7 = A7;
}
-
+
a += 4 * lda;
-
+
j --;
} while (j > 0);
}
@@ -464,20 +464,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 = a + k2;
#endif
-
+
a3 = a1 + 1 * lda;
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
i = ((rows) >> 1);
i--;
@@ -486,31 +486,31 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
A2 = *a2;
A3 = *a3;
A4 = *a4;
-
+
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -535,13 +535,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = A4;
*a4 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -560,13 +560,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b4 = A4;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
#ifndef MINUS
a1 += 2;
a3 += 2;
@@ -576,13 +576,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
i --;
}
-
+
//Loop Ending
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
-
+
A1 = *a1;
A2 = *a2;
A3 = *a3;
@@ -594,14 +594,14 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a2 = A1;
*a3 = A4;
*a4 = A3;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -626,13 +626,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = A4;
*a4 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -660,7 +660,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
i = ((rows) & 1);
-
+
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
@@ -675,7 +675,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = B3;
*b3 = A3;
}
-
+
a += 2 * lda;
}
@@ -687,15 +687,15 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 = a + k2;
#endif
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((rows) >> 1);
i --;
@@ -704,22 +704,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
A2 = *a2;
B1 = *b1;
B2 = *b2;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -736,11 +736,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a1 = A2;
*a2 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -752,10 +752,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b2 = A2;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
#ifndef MINUS
a1 += 2;
#else
@@ -763,7 +763,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
i --;
}
-
+
//Loop Ending (n=1)
A1 = *a1;
A2 = *a2;
@@ -773,12 +773,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -795,11 +795,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a1 = A2;
*a2 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -817,13 +817,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 -= 2;
#endif
-
+
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv;
- b1 = a + ip1;
+ b1 = a + ip1;
A1 = *a1;
B1 = *b1;
@@ -833,5 +833,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
}
return 0;
-}
+}
diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c
index e3a05db..9479418 100644
--- a/lapack/laswp/generic/laswp_k_8.c
+++ b/lapack/laswp/generic/laswp_k_8.c
@@ -59,7 +59,7 @@
#define a16 (a15 - 1)
#endif
-int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2, rows;
@@ -74,7 +74,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
FLOAT A5, A6, B5, B6, A7, A8, B7, B8;
FLOAT A9, A10, B9, B10, A11, A12, B11, B12;
FLOAT A13, A14, B13, B14, A15, A16, B15, B16;
-
+
a--;
k1 --;
@@ -93,7 +93,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
ip1 = *ipiv;
a1 = a + k1 + 1;
b1 = a + ip1;
-
+
if(a1 == b1) return 0;
for(j=0; j<n; j++){
@@ -118,7 +118,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 = a + k2;
#endif
-
+
a3 = a1 + 1 * lda;
a5 = a1 + 2 * lda;
a7 = a1 + 3 * lda;
@@ -131,10 +131,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
@@ -164,7 +164,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
B6 = *b6;
B7 = *b7;
B8 = *b8;
-
+
B9 = *b9;
B10 = *b10;
B11 = *b11;
@@ -173,7 +173,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
B14 = *b14;
B15 = *b15;
B16 = *b16;
-
+
A1 = *a1;
A2 = *a2;
A3 = *a3;
@@ -196,7 +196,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
@@ -215,7 +215,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a14 = A13;
*a15 = A16;
*a16 = A15;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
@@ -235,7 +235,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a16 = B16;
*b16 = A16;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -311,7 +311,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a15 = A16;
*a16 = B15;
*b15 = A15;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
@@ -330,7 +330,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b13 = A13;
*a15 = B15;
*b15 = A15;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -393,17 +393,17 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b16 = A16;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
b6 = b2 + 2 * lda;
b7 = b1 + 3 * lda;
b8 = b2 + 3 * lda;
-
+
b9 = b1 + 4 * lda;
b10 = b2 + 4 * lda;
b11 = b1 + 5 * lda;
@@ -443,7 +443,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
B6 = *b6;
B7 = *b7;
B8 = *b8;
-
+
B9 = *b9;
B10 = *b10;
B11 = *b11;
@@ -452,7 +452,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
B14 = *b14;
B15 = *b15;
B16 = *b16;
-
+
A1 = *a1;
A2 = *a2;
A3 = *a3;
@@ -488,7 +488,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a14 = A13;
*a15 = A16;
*a16 = A15;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
@@ -508,7 +508,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a16 = B16;
*b16 = A16;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -584,7 +584,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a15 = A16;
*a16 = B15;
*b15 = A15;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
@@ -603,7 +603,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b13 = A13;
*a15 = B15;
*b15 = A15;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -666,7 +666,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b16 = A16;
}
}
-
+
#ifndef MINUS
a1 += 2;
@@ -686,10 +686,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a11 -= 2;
a13 -= 2;
a15 -= 2;
-#endif
+#endif
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
@@ -697,7 +697,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
b5 = b1 + 2 * lda;
b7 = b1 + 3 * lda;
-
+
b9 = b1 + 4 * lda;
b11 = b1 + 5 * lda;
b13 = b1 + 6 * lda;
@@ -740,9 +740,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a15 = B15;
*b15 = A15;
}
-
+
a += 8 * lda;
-
+
j --;
} while (j > 0);
}
@@ -755,19 +755,19 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 = a + k2;
#endif
-
+
a3 = a1 + 1 * lda;
a5 = a1 + 2 * lda;
a7 = a1 + 3 * lda;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
@@ -787,7 +787,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
A6 = *a6;
A7 = *a7;
A8 = *a8;
-
+
B1 = *b1;
B2 = *b2;
B3 = *b3;
@@ -796,12 +796,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
B6 = *b6;
B7 = *b7;
B8 = *b8;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
@@ -812,7 +812,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a6 = A5;
*a7 = A8;
*a8 = A7;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
@@ -823,7 +823,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a8 = B8;
*b8 = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -864,7 +864,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a7 = A8;
*a8 = B7;
*b7 = A7;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
@@ -874,7 +874,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b5 = A5;
*a7 = B7;
*b7 = A7;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -907,17 +907,17 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b8 = A8;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
b6 = b2 + 2 * lda;
b7 = b1 + 3 * lda;
b8 = b2 + 3 * lda;
-
+
#ifndef MINUS
a1 += 2;
a3 += 2;
@@ -959,7 +959,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a6 = A5;
*a7 = A8;
*a8 = A7;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
@@ -970,7 +970,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a8 = B8;
*b8 = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -1011,7 +1011,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a7 = A8;
*a8 = B7;
*b7 = A7;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
@@ -1021,7 +1021,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b5 = A5;
*a7 = B7;
*b7 = A7;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -1068,7 +1068,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
@@ -1094,7 +1094,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a7 = B7;
*b7 = A7;
}
-
+
a += 4 * lda;
}
@@ -1106,20 +1106,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 = a + k2;
#endif
-
+
a3 = a1 + 1 * lda;
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
i = ((rows) >> 1);
i--;
@@ -1128,31 +1128,31 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
A2 = *a2;
A3 = *a3;
A4 = *a4;
-
+
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
*a3 = A4;
*a4 = A3;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -1177,13 +1177,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = A4;
*a4 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -1202,13 +1202,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b4 = A4;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
-
+
#ifndef MINUS
a1 += 2;
a3 += 2;
@@ -1218,13 +1218,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
i --;
}
-
+
//Loop Ending
B1 = *b1;
B2 = *b2;
B3 = *b3;
B4 = *b4;
-
+
A1 = *a1;
A2 = *a2;
A3 = *a3;
@@ -1236,14 +1236,14 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a2 = A1;
*a3 = A4;
*a4 = A3;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
*a4 = B4;
*b4 = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -1268,13 +1268,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = A4;
*a4 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
*a3 = B3;
*b3 = A3;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -1302,7 +1302,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
i = ((rows) & 1);
-
+
if (i > 0) {
ip1 = *piv;
b1 = a + ip1;
@@ -1317,7 +1317,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a3 = B3;
*b3 = A3;
}
-
+
a += 2 * lda;
}
@@ -1329,15 +1329,15 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 = a + k2;
#endif
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((rows) >> 1);
i --;
@@ -1346,22 +1346,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
A2 = *a2;
B1 = *b1;
B2 = *b2;
-
+
ip1 = *piv;
piv += incx;
ip2 = *piv;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -1378,11 +1378,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a1 = A2;
*a2 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -1394,10 +1394,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b2 = A2;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
#ifndef MINUS
a1 += 2;
#else
@@ -1405,7 +1405,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#endif
i --;
}
-
+
//Loop Ending (n=1)
A1 = *a1;
A2 = *a2;
@@ -1415,12 +1415,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
if (b2 == a1) {
*a1 = A2;
*a2 = A1;
- } else
+ } else
if (b2 != a2) {
*a2 = B2;
*b2 = A2;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -1437,11 +1437,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a1 = A2;
*a2 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == a2) {
*a1 = B1;
*b1 = A1;
- } else
+ } else
if (b2 == b1) {
*a1 = B1;
*a2 = A1;
@@ -1459,13 +1459,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
#else
a1 -= 2;
#endif
-
+
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv;
- b1 = a + ip1;
+ b1 = a + ip1;
A1 = *a1;
B1 = *b1;
@@ -1475,5 +1475,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
}
return 0;
-}
+}
diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c
index 7a62dd9..d120477 100644
--- a/lapack/laswp/generic/zlaswp_k_1.c
+++ b/lapack/laswp/generic/zlaswp_k_1.c
@@ -45,8 +45,8 @@
#define a2 (a1 - 2)
#endif
-int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
- FLOAT *a, BLASLONG lda,
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
+ FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2, rows;
@@ -79,7 +79,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif
b1 = a + ip1;
-
+
if(a1 == b1) return 0;
for(j=0; j<n; j++){
@@ -92,7 +92,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
-
+
a1 += lda;
b1 += lda;
}
@@ -114,10 +114,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = ((k2 - k1) >> 1);
i --;
//Loop pipeline
@@ -152,26 +152,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -196,13 +196,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -221,10 +221,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 1) = A4;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
#ifndef MINUS
a1 += 4;
#else
@@ -243,22 +243,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
-
+
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -283,13 +283,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -308,8 +308,8 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 1) = A4;
}
}
-
-
+
+
#ifndef MINUS
a1 += 4;
#else
@@ -318,7 +318,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
@@ -332,13 +332,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b1 + 0) = A1;
*(b1 + 1) = A2;
}
-
+
a += lda;
-
+
j --;
} while (j > 0);
}
return 0;
-}
+}
diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c
index 0fa6858..c18ab4b 100644
--- a/lapack/laswp/generic/zlaswp_k_2.c
+++ b/lapack/laswp/generic/zlaswp_k_2.c
@@ -45,8 +45,8 @@
#define a2 (a1 - 2)
#endif
-int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
- FLOAT *a, BLASLONG lda,
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
+ FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2, rows;
@@ -81,7 +81,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif
b1 = a + ip1;
-
+
if(a1 == b1) return 0;
for(j=0; j<n; j++){
@@ -94,7 +94,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
-
+
a1 += lda;
b1 += lda;
}
@@ -116,10 +116,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = (rows >> 1);
i--;
@@ -154,7 +154,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
B5 = *(b1 + 0 + lda);
B6 = *(b1 + 1 + lda);
B7 = *(b2 + 0 + lda);
@@ -164,7 +164,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
@@ -175,7 +175,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = A5;
*(a2 + 1 + lda) = A6;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
@@ -186,7 +186,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 0 + lda) = A7;
*(b2 + 1 + lda) = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -227,7 +227,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a2 + 1 + lda) = B6;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -237,7 +237,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a1 + 1 + lda) = B6;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -270,10 +270,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 1 + lda) = A8;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
#ifndef MINUS
a1 += 4;
#else
@@ -296,7 +296,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
B5 = *(b1 + 0 + lda);
B6 = *(b1 + 1 + lda);
B7 = *(b2 + 0 + lda);
@@ -312,7 +312,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a1 + 1 + lda) = A8;
*(a2 + 0 + lda) = A5;
*(a2 + 1 + lda) = A6;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
@@ -323,7 +323,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 0 + lda) = A7;
*(b2 + 1 + lda) = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -364,7 +364,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a2 + 1 + lda) = B6;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -374,7 +374,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a1 + 1 + lda) = B6;
*(b1 + 0 + lda) = A5;
*(b1 + 1 + lda) = A6;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -407,9 +407,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 1 + lda) = A8;
}
}
-
-
+
+
#ifndef MINUS
a1 += 4;
#else
@@ -418,7 +418,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
@@ -440,30 +440,30 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b1 + 0 + lda) = A3;
*(b1 + 1 + lda) = A4;
}
-
+
a += 2 * lda;
-
+
j --;
} while (j > 0);
}
if (n & 1) {
piv = ipiv;
-
+
#ifndef MINUS
a1 = a + (k1 + 1) * 2;
#else
a1 = a + k2 * 2;
#endif
-
+
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = (rows >> 1);
i--;
@@ -478,26 +478,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -522,13 +522,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -547,10 +547,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 1) = A4;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
#ifndef MINUS
a1 += 4;
#else
@@ -567,21 +567,21 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -606,13 +606,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -631,16 +631,16 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 1) = A4;
}
}
-
+
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
-
+
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
@@ -657,5 +657,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
}
return 0;
-}
+}
diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c
index c63a8e2..45e1bf0 100644
--- a/lapack/laswp/generic/zlaswp_k_4.c
+++ b/lapack/laswp/generic/zlaswp_k_4.c
@@ -51,8 +51,8 @@
#define a8 (a7 - 2)
#endif
-int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
- FLOAT *a, BLASLONG lda,
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
+ FLOAT *a, BLASLONG lda,
FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
BLASLONG i, j, ip1, ip2, rows;
@@ -89,7 +89,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif
b1 = a + ip1;
-
+
if(a1 == b1) return 0;
for(j=0; j<n; j++){
@@ -102,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
-
+
a1 += lda;
b1 += lda;
}
@@ -128,7 +128,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
@@ -185,7 +185,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
@@ -204,7 +204,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a7 + 1) = A16;
*(a8 + 0) = A13;
*(a8 + 1) = A14;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
@@ -223,7 +223,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b8 + 0) = A15;
*(b8 + 1) = A16;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -297,7 +297,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a8 + 1) = B14;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -315,7 +315,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a7 + 1) = B14;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -377,10 +377,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b8 + 1) = A16;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + 1 * lda;
b4 = b2 + 1 * lda;
b5 = b1 + 2 * lda;
@@ -401,7 +401,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif
i --;
}
-
+
//Loop Ending
A1 = *(a1 + 0);
A2 = *(a1 + 1);
@@ -438,7 +438,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B14 = *(b7 + 1);
B15 = *(b8 + 0);
B16 = *(b8 + 1);
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
@@ -457,7 +457,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a7 + 1) = A16;
*(a8 + 0) = A13;
*(a8 + 1) = A14;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
@@ -476,7 +476,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b8 + 0) = A15;
*(b8 + 1) = A16;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -550,7 +550,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a8 + 1) = B14;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -568,7 +568,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a7 + 1) = B14;
*(b7 + 0) = A13;
*(b7 + 1) = A14;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -630,7 +630,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b8 + 1) = A16;
}
}
-
+
#ifndef MINUS
a1 += 4;
a3 += 4;
@@ -644,7 +644,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
@@ -688,9 +688,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b7 + 0) = A7;
*(b7 + 1) = A8;
}
-
+
a += 4 * lda;
-
+
j --;
} while (j > 0);
}
@@ -705,18 +705,18 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
#endif
a3 = a1 + lda;
-
+
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + lda;
b4 = b2 + lda;
-
+
i = (rows >> 1);
i--;
@@ -727,7 +727,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
-
+
A5 = *(a3 + 0);
A6 = *(a3 + 1);
A7 = *(a4 + 0);
@@ -737,17 +737,17 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
B5 = *(b3 + 0);
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
-
+
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
@@ -758,7 +758,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
@@ -769,7 +769,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -810,7 +810,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -820,7 +820,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -853,13 +853,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b4 + 1) = A8;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
b3 = b1 + lda;
b4 = b2 + lda;
-
+
#ifndef MINUS
a1 += 4;
a3 += 4;
@@ -874,7 +874,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
A2 = *(a1 + 1);
A3 = *(a2 + 0);
A4 = *(a2 + 1);
-
+
A5 = *(a3 + 0);
A6 = *(a3 + 1);
A7 = *(a4 + 0);
@@ -884,13 +884,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
B5 = *(b3 + 0);
B6 = *(b3 + 1);
B7 = *(b4 + 0);
B8 = *(b4 + 1);
-
-
+
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
@@ -901,7 +901,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a3 + 1) = A8;
*(a4 + 0) = A5;
*(a4 + 1) = A6;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
@@ -912,7 +912,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b4 + 0) = A7;
*(b4 + 1) = A8;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -953,7 +953,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a4 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -963,7 +963,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a3 + 1) = B6;
*(b3 + 0) = A5;
*(b3 + 1) = A6;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -996,7 +996,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b4 + 1) = A8;
}
}
-
+
#ifndef MINUS
a1 += 4;
a3 += 4;
@@ -1007,7 +1007,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv * 2;
@@ -1031,28 +1031,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b3 + 0) = A3;
*(b3 + 1) = A4;
}
-
+
a += 2 * lda;
-
+
}
if (n & 1) {
piv = ipiv;
-
+
#ifndef MINUS
a1 = a + (k1 + 1) * 2;
#else
a1 = a + k2 * 2;
#endif
-
+
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
i = (rows >> 1);
i--;
@@ -1067,26 +1067,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
ip1 = *piv * 2;
piv += incx;
ip2 = *piv * 2;
piv += incx;
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -1111,13 +1111,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -1136,10 +1136,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 1) = A4;
}
}
-
+
b1 = a + ip1;
b2 = a + ip2;
-
+
#ifndef MINUS
a1 += 4;
#else
@@ -1156,21 +1156,21 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
B2 = *(b1 + 1);
B3 = *(b2 + 0);
B4 = *(b2 + 1);
-
+
if (b1 == a1) {
if (b2 == a1) {
*(a1 + 0) = A3;
*(a1 + 1) = A4;
*(a2 + 0) = A1;
*(a2 + 1) = A2;
- } else
+ } else
if (b2 != a2) {
*(a2 + 0) = B3;
*(a2 + 1) = B4;
*(b2 + 0) = A3;
*(b2 + 1) = A4;
}
- } else
+ } else
if (b1 == a2) {
if (b2 != a1) {
if (b2 == a2) {
@@ -1195,13 +1195,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(a2 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == a2) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
*(b1 + 0) = A1;
*(b1 + 1) = A2;
- } else
+ } else
if (b2 == b1) {
*(a1 + 0) = B1;
*(a1 + 1) = B2;
@@ -1220,16 +1220,16 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
*(b2 + 1) = A4;
}
}
-
+
#ifndef MINUS
a1 += 4;
#else
a1 -= 4;
#endif
-
+
//Remain
i = (rows & 1);
-
+
if (i > 0) {
ip1 = *piv * 2;
b1 = a + ip1;
@@ -1246,5 +1246,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
}
return 0;
-}
+}
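
Note on the laswp kernels touched above: they apply the row interchanges produced by LU pivoting to a block of columns. A minimal reference version of the complex case is sketched below; it assumes 0-based row indices in ipiv and a forward (non-MINUS) sweep, whereas the zlaswp_k_* kernels pre-scale the pivots into element offsets, pipeline two pivots per iteration and unroll over 1, 2 or 4 columns.

/* Reference row-interchange loop for an n-column complex matrix stored
 * column-major as interleaved re/im pairs.  Illustrative only; the
 * unrolled kernels above compute the same result. */
static void zlaswp_ref(long n, long k1, long k2,
                       double *a, long lda,       /* lda counted in doubles */
                       const long *ipiv)          /* assumed 0-based row indices */
{
  for (long j = 0; j < n; j++) {                  /* each column */
    for (long i = k1; i <= k2; i++) {             /* each pivoted row */
      long ip = ipiv[i];
      if (ip != i) {                              /* swap rows i and ip */
        double *x = a + 2 * i  + j * lda;
        double *y = a + 2 * ip + j * lda;
        double tr = x[0], ti = x[1];
        x[0] = y[0]; x[1] = y[1];
        y[0] = tr;   y[1] = ti;
      }
    }
  }
}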
diff --git a/lapack/lauu2/lauu2_L.c b/lapack/lauu2/lauu2_L.c
index aedb966..ccb299e 100644
--- a/lapack/lauu2/lauu2_L.c
+++ b/lapack/lauu2/lauu2_L.c
@@ -52,7 +52,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -61,13 +61,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
for (i = 0; i < n; i++) {
SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i, lda, NULL, 0, NULL, 0);
-
+
if (i < n - 1) {
aii = DOTU_K(n - i - 1, a + i + 1 + i * lda, 1, a + i + 1 + i * lda, 1);
-
+
*(a + i + i * lda) += aii;
-
- GEMV_T(n - i - 1, i, 0, dp1,
+
+ GEMV_T(n - i - 1, i, 0, dp1,
a + (i + 1) , lda,
a + (i + 1) + i * lda, 1,
a + i , lda, sb);
diff --git a/lapack/lauu2/lauu2_U.c b/lapack/lauu2/lauu2_U.c
index f9a7186..c097c81 100644
--- a/lapack/lauu2/lauu2_U.c
+++ b/lapack/lauu2/lauu2_U.c
@@ -52,7 +52,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -61,13 +61,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
for (i = 0; i < n; i++) {
SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i * lda, 1, NULL, 0, NULL, 0);
-
+
if (i < n - 1) {
aii = DOTU_K(n - i - 1, a + i + (i + 1)* lda, lda, a + i + (i + 1) * lda, lda);
-
+
*(a + i + i * lda) += aii;
-
- GEMV_N(i, n - i - 1, 0, dp1,
+
+ GEMV_N(i, n - i - 1, 0, dp1,
a + (i + 1) * lda, lda,
a + i + (i + 1) * lda, lda,
a + i * lda, 1, sb);
diff --git a/lapack/lauu2/zlauu2_L.c b/lapack/lauu2/zlauu2_L.c
index 8a892d9..84baeca 100644
--- a/lapack/lauu2/zlauu2_L.c
+++ b/lapack/lauu2/zlauu2_L.c
@@ -52,7 +52,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -62,16 +62,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
SCAL_K(i + 1, 0, 0, *(a + (i + i * lda) * COMPSIZE + 0), ZERO,
a + i * COMPSIZE, lda, NULL, 0, NULL, 0);
-
+
if (i < n - 1) {
temp[0] = DOTC_K(n - i - 1,
a + (i + 1 + i * lda) * COMPSIZE, 1,
a + (i + 1 + i * lda) * COMPSIZE, 1);
GET_IMAGE(temp[1]);
-
+
*(a + (i + i * lda) * COMPSIZE + 0) += temp[0];
*(a + (i + i * lda) * COMPSIZE + 1) = ZERO;
-
+
GEMV_U(n - i - 1, i, 0, dp1, ZERO,
a + ((i + 1) ) * COMPSIZE, lda,
a + ((i + 1) + i * lda) * COMPSIZE, 1,
diff --git a/lapack/lauu2/zlauu2_U.c b/lapack/lauu2/zlauu2_U.c
index b20ea99..fd0a15f 100644
--- a/lapack/lauu2/zlauu2_U.c
+++ b/lapack/lauu2/zlauu2_U.c
@@ -52,24 +52,24 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
}
for (i = 0; i < n; i++) {
- SCAL_K(i + 1, 0, 0,
+ SCAL_K(i + 1, 0, 0,
*(a + (i + i * lda) * COMPSIZE + 0), ZERO,
a + i * lda * COMPSIZE, 1, NULL, 0, NULL, 0);
-
+
if (i < n - 1) {
temp[0] = DOTC_K(n - i - 1, a + (i + (i + 1) * lda) * COMPSIZE, lda, a + (i + (i + 1) * lda) * COMPSIZE, lda);
GET_IMAGE(temp[1]);
-
+
*(a + (i + i * lda) * COMPSIZE + 0) += temp[0];
*(a + (i + i * lda) * COMPSIZE + 1) = ZERO;
-
+
GEMV_O(i, n - i - 1, 0, dp1, ZERO,
a + ( (i + 1) * lda) * COMPSIZE, lda,
a + (i + (i + 1) * lda) * COMPSIZE, lda,
diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c
index 8d9cde9..c93c4a8 100644
--- a/lapack/lauum/lauum_L_parallel.c
+++ b/lapack/lauum/lauum_L_parallel.c
@@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -62,11 +62,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
if (args -> nthreads == 1) {
- LAUUM_L_SINGLE(args, NULL, NULL, sa, sb, 0);
+ LAUUM_L_SINGLE(args, NULL, NULL, sa, sb, 0);
return 0;
}
@@ -87,7 +87,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.alpha = alpha;
newarg.beta = NULL;
newarg.nthreads = args -> nthreads;
-
+
blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (blocking > GEMM_Q) blocking = GEMM_Q;
@@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
bk = n - i;
if (bk > blocking) bk = blocking;
-
+
newarg.n = i;
newarg.k = bk;
newarg.a = a + i * COMPSIZE;
@@ -118,6 +118,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
CNAME(&newarg, NULL, NULL, sa, sb, 0);
}
-
+
return 0;
}
diff --git a/lapack/lauum/lauum_L_single.c b/lapack/lauum/lauum_L_single.c
index 65e8f04..dead857 100644
--- a/lapack/lauum/lauum_L_single.c
+++ b/lapack/lauum/lauum_L_single.c
@@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -107,11 +107,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
for (j = 0; j < n; j += blocking) {
bk = MIN(blocking, n - j);
-
+
if (j > 0 ){
TRMM_ILNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb);
-
+
for (ls = 0; ls < j; ls += REAL_GEMM_R) {
min_l = j - ls;
if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
@@ -127,97 +127,97 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
} else {
aa = sb2;
}
-
+
for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
min_jj = ls + min_l - jjs;
if (min_jj > GEMM_P) min_jj = GEMM_P;
-
+
GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
-
- SYRK_KERNEL(min_i, min_jj, bk, dp1,
- aa,
- sb2 + (jjs - ls) * bk * COMPSIZE,
- a + (ls + jjs * lda) * COMPSIZE, lda,
+
+ SYRK_KERNEL(min_i, min_jj, bk, dp1,
+ aa,
+ sb2 + (jjs - ls) * bk * COMPSIZE,
+ a + (ls + jjs * lda) * COMPSIZE, lda,
ls - jjs);
}
-
+
for(is = ls + min_i; is < j ; is += GEMM_P){
min_i = j - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa);
-
- SYRK_KERNEL(min_i, min_l, bk, dp1,
- sa,
- sb2,
- a + (is + ls * lda) * COMPSIZE, lda,
+
+ SYRK_KERNEL(min_i, min_l, bk, dp1,
+ sa,
+ sb2,
+ a + (is + ls * lda) * COMPSIZE, lda,
is - ls);
}
-
+
for (ks = 0; ks < bk; ks += GEMM_P) {
min_k = bk - ks;
if (min_k > GEMM_P) min_k = GEMM_P;
-
+
TRMM_KERNEL(min_k, min_l, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
sb + ks * bk * COMPSIZE,
sb2,
- a + (ks + j + ls * lda) * COMPSIZE, lda, ks);
+ a + (ks + j + ls * lda) * COMPSIZE, lda, ks);
}
#else
min_i = j - ls;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_INCOPY(bk, min_i, a + (j + ls * lda)* COMPSIZE, lda, sa);
-
+
for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
min_jj = ls + min_l - jjs;
if (min_jj > GEMM_P) min_jj = GEMM_P;
-
+
GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
-
- SYRK_KERNEL(min_i, min_jj, bk, dp1,
- sa,
- sb2 + (jjs - ls) * bk * COMPSIZE,
- a + (ls + jjs * lda) * COMPSIZE, lda,
+
+ SYRK_KERNEL(min_i, min_jj, bk, dp1,
+ sa,
+ sb2 + (jjs - ls) * bk * COMPSIZE,
+ a + (ls + jjs * lda) * COMPSIZE, lda,
ls - jjs);
}
-
+
for(is = ls + min_i; is < j ; is += GEMM_P){
min_i = j - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa);
-
- SYRK_KERNEL(min_i, min_l, bk, dp1,
- sa,
- sb2,
- a + (is + ls * lda) * COMPSIZE, lda,
+
+ SYRK_KERNEL(min_i, min_l, bk, dp1,
+ sa,
+ sb2,
+ a + (is + ls * lda) * COMPSIZE, lda,
is - ls);
}
for (ks = 0; ks < bk; ks += GEMM_P) {
min_k = bk - ks;
if (min_k > GEMM_P) min_k = GEMM_P;
-
+
TRMM_KERNEL(min_k, min_l, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
sb + ks * bk * COMPSIZE,
sb2,
- a + (ks + j + ls * lda) * COMPSIZE, lda, ks);
+ a + (ks + j + ls * lda) * COMPSIZE, lda, ks);
}
#endif
}
}
-
+
if (!range_n) {
range_N[0] = j;
range_N[1] = j + bk;
@@ -225,9 +225,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
range_N[0] = range_n[0] + j;
range_N[1] = range_n[0] + j + bk;
}
-
+
CNAME(args, NULL, range_N, sa, sb, 0);
-
+
}
return 0;
diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c
index d68d12b..e4a2792 100644
--- a/lapack/lauum/lauum_U_parallel.c
+++ b/lapack/lauum/lauum_U_parallel.c
@@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -62,11 +62,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
if (args -> nthreads == 1) {
- LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0);
+ LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0);
return 0;
}
@@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
bk = n - i;
if (bk > blocking) bk = blocking;
-
+
newarg.n = i;
newarg.k = bk;
newarg.a = a + ( i * lda) * COMPSIZE;
@@ -118,6 +118,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
CNAME(&newarg, NULL, NULL, sa, sb, 0);
}
-
+
return 0;
}
diff --git a/lapack/lauum/lauum_U_single.c b/lapack/lauum/lauum_U_single.c
index 14cf0ad..1ce62c0 100644
--- a/lapack/lauum/lauum_U_single.c
+++ b/lapack/lauum/lauum_U_single.c
@@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -117,74 +117,74 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
min_l = j - ls;
#if 0
-
-
+
+
if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
min_i = ls + min_l;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
if (ls > 0) {
GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa);
aa = sa;
} else {
aa = sb2;
}
-
+
for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
min_jj = ls + min_l - jjs;
if (min_jj > GEMM_P) min_jj = GEMM_P;
-
+
GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
-
- SYRK_KERNEL(min_i, min_jj, bk, dp1,
- aa,
- sb2 + (jjs - ls) * bk * COMPSIZE,
+
+ SYRK_KERNEL(min_i, min_jj, bk, dp1,
+ aa,
+ sb2 + (jjs - ls) * bk * COMPSIZE,
a + (jjs * lda) * COMPSIZE, lda, - jjs);
}
-
+
if (ls + REAL_GEMM_R >= j ) {
for (ks = 0; ks < bk; ks += GEMM_P) {
min_k = bk - ks;
if (min_k > GEMM_P) min_k = GEMM_P;
-
+
TRMM_KERNEL(min_i, min_k, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
aa,
sb + ks * bk * COMPSIZE,
- a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
+ a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
}
}
-
+
for(is = min_i; is < ls + min_l ; is += GEMM_P){
min_i = ls + min_l - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
if (is < ls) {
GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
aa = sa;
} else {
aa = sb2 + (is - ls) * bk * COMPSIZE;
}
-
- SYRK_KERNEL(min_i, min_l, bk, dp1,
- aa,
- sb2,
+
+ SYRK_KERNEL(min_i, min_l, bk, dp1,
+ aa,
+ sb2,
a + (is + ls * lda) * COMPSIZE, lda, is - ls);
-
+
if (ls + REAL_GEMM_R >= j ) {
for (ks = 0; ks < bk; ks += GEMM_P) {
min_k = bk - ks;
if (min_k > GEMM_P) min_k = GEMM_P;
-
+
TRMM_KERNEL(min_i, min_k, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
aa,
sb + ks * bk * COMPSIZE,
- a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
+ a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
}
}
}
@@ -198,12 +198,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){
min_jj = ls + min_l - jjs;
if (min_jj > GEMM_P) min_jj = GEMM_P;
-
+
GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE);
-
- SYRK_KERNEL(min_i, min_jj, bk, dp1,
- sa,
- sb2 + (jjs - ls) * bk * COMPSIZE,
+
+ SYRK_KERNEL(min_i, min_jj, bk, dp1,
+ sa,
+ sb2 + (jjs - ls) * bk * COMPSIZE,
a + (jjs * lda) * COMPSIZE, lda, - jjs);
}
@@ -211,40 +211,40 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
for (ks = 0; ks < bk; ks += GEMM_P) {
min_k = bk - ks;
if (min_k > GEMM_P) min_k = GEMM_P;
-
+
TRMM_KERNEL(min_i, min_k, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa,
sb + ks * bk * COMPSIZE,
- a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
+ a + ((ks + j) * lda) * COMPSIZE, lda, -ks);
}
}
for(is = min_i; is < ls + min_l ; is += GEMM_P){
min_i = ls + min_l - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
-
- SYRK_KERNEL(min_i, min_l, bk, dp1,
- sa,
- sb2,
+
+ SYRK_KERNEL(min_i, min_l, bk, dp1,
+ sa,
+ sb2,
a + (is + ls * lda) * COMPSIZE, lda, is - ls);
-
+
if (ls + REAL_GEMM_R >= j ) {
for (ks = 0; ks < bk; ks += GEMM_P) {
min_k = bk - ks;
if (min_k > GEMM_P) min_k = GEMM_P;
-
+
TRMM_KERNEL(min_i, min_k, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa,
sb + ks * bk * COMPSIZE,
- a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
+ a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks);
}
}
}
@@ -259,7 +259,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
range_N[0] = range_n[0] + j;
range_N[1] = range_n[0] + j + bk;
}
-
+
CNAME(args, NULL, range_N, sa, sb, 0);
}
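
The lauu2/lauum files above implement the LAPACK auxiliary step that overwrites a triangular Cholesky factor with U*U**H (upper) or L**H*L (lower), the second half of a Cholesky-based inverse. A plain real-arithmetic sketch of the unblocked lower case, assuming column-major storage with a(r,c) at a[r + c*lda]:

/* Unblocked L**T * L, overwriting the lower triangle of A in place.
 * Reference only; lauu2_L expresses the same recurrence through
 * SCAL_K, DOTU_K and GEMV_T calls. */
static void lauu2_L_ref(int n, double *a, int lda)
{
  for (int i = 0; i < n; i++) {
    double aii = a[i + i * lda];

    /* row i, columns 0..i-1:
       a(i,c) = aii*a(i,c) + sum_{r>i} a(r,i)*a(r,c)     */
    for (int c = 0; c < i; c++) {
      double s = aii * a[i + c * lda];
      for (int r = i + 1; r < n; r++)
        s += a[r + i * lda] * a[r + c * lda];
      a[i + c * lda] = s;
    }

    /* diagonal: a(i,i) = sum_{r>=i} a(r,i)^2             */
    double s = 0.0;
    for (int r = i; r < n; r++)
      s += a[r + i * lda] * a[r + i * lda];
    a[i + i * lda] = s;
  }
}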
diff --git a/lapack/potf2/potf2_L.c b/lapack/potf2/potf2_L.c
index 23aa97c..8cd094a 100644
--- a/lapack/potf2/potf2_L.c
+++ b/lapack/potf2/potf2_L.c
@@ -59,7 +59,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -81,11 +81,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
i = n - j - 1;
if (i > 0) {
- GEMV_N(i, j, 0, dm1,
+ GEMV_N(i, j, 0, dm1,
a + j + 1, lda,
a + j, lda,
aoffset + j + 1, 1, sb);
-
+
SCAL_K(i, 0, 0, dp1 / ajj,
aoffset + j + 1, 1, NULL, 0, NULL, 0);
}
diff --git a/lapack/potf2/potf2_U.c b/lapack/potf2/potf2_U.c
index 755bf8d..9f908c1 100644
--- a/lapack/potf2/potf2_U.c
+++ b/lapack/potf2/potf2_U.c
@@ -58,7 +58,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -78,11 +78,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
i = n - j - 1;
if (i > 0) {
- GEMV_T(j, i, 0, dm1,
+ GEMV_T(j, i, 0, dm1,
a + lda, lda,
a, 1,
a + j + lda, lda, sb);
-
+
SCAL_K(i, 0, 0, dp1 / ajj,
a + j + lda, lda, NULL, 0, NULL, 0);
}
diff --git a/lapack/potf2/zpotf2_L.c b/lapack/potf2/zpotf2_L.c
index 8ce0d4e..33e9b60 100644
--- a/lapack/potf2/zpotf2_L.c
+++ b/lapack/potf2/zpotf2_L.c
@@ -58,7 +58,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -89,7 +89,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
a + (j + 1) * 2, lda,
a + j * 2, lda,
aoffset + (j + 1) * 2, 1, sb);
-
+
SCAL_K(i, 0, 0, ONE / ajj[0], ZERO,
aoffset + (j + 1) * 2, 1, NULL, 0, NULL, 0);
}
diff --git a/lapack/potf2/zpotf2_U.c b/lapack/potf2/zpotf2_U.c
index c1f5156..e0ccd46 100644
--- a/lapack/potf2/zpotf2_U.c
+++ b/lapack/potf2/zpotf2_U.c
@@ -57,7 +57,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -68,7 +68,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
ajj[0] = DOTC_K(j, a, 1, a, 1);
GET_IMAGE(ajj[1]);
- ajj[0] = *(a + j * 2) - ajj[0];
+ ajj[0] = *(a + j * 2) - ajj[0];
if (ajj[0] <= 0){
*(a + j * 2 + 0) = ajj[0];
@@ -87,7 +87,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
a + lda * 2, lda,
a, 1,
a + (j + lda) * 2, lda, sb);
-
+
SCAL_K(i, 0, 0, ONE / ajj[0], ZERO,
a + (j + lda) * 2, lda, NULL, 0, NULL, 0);
}
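
potf2 is the unblocked Cholesky factorization used on the diagonal blocks; the GEMV_N/SCAL_K pair above forms one column of the factor per iteration. A reference version of the lower-triangular case, following LAPACK's dpotf2 info convention, is sketched here:

/* Unblocked lower Cholesky A = L*L**T, overwriting the lower triangle.
 * Returns 0 on success, or j+1 if the order-(j+1) leading minor is not
 * positive definite.  Column major, a(r,c) = a[r + c*lda].  Sketch only. */
#include <math.h>

static int potf2_L_ref(int n, double *a, int lda)
{
  for (int j = 0; j < n; j++) {
    double ajj = a[j + j * lda];
    for (int k = 0; k < j; k++)
      ajj -= a[j + k * lda] * a[j + k * lda];
    if (ajj <= 0.0) return j + 1;           /* not positive definite */
    ajj = sqrt(ajj);
    a[j + j * lda] = ajj;

    /* column below the diagonal:
       a(i,j) = (a(i,j) - sum_k a(i,k)*a(j,k)) / ajj   for i > j */
    for (int i = j + 1; i < n; i++) {
      double s = a[i + j * lda];
      for (int k = 0; k < j; k++)
        s -= a[i + k * lda] * a[j + k * lda];
      a[i + j * lda] = s / ajj;
    }
  }
  return 0;
}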
diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c
index 1ebcad8..52a383a 100644
--- a/lapack/potrf/potrf_L_parallel.c
+++ b/lapack/potrf/potrf_L_parallel.c
@@ -55,7 +55,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -63,11 +63,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
if (args -> nthreads == 1) {
- info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
+ info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
return info;
}
@@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (blocking > GEMM_Q) blocking = GEMM_Q;
-
+
for (i = 0; i < n; i += blocking) {
bk = n - i;
if (bk > blocking) bk = blocking;
@@ -108,15 +108,15 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.n = bk;
newarg.a = a + (i + i * lda) * COMPSIZE;
newarg.b = a + (i + bk + i * lda) * COMPSIZE;
-
+
gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
&newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);
-
+
newarg.n = n - i - bk;
newarg.k = bk;
newarg.a = a + (i + bk + i * lda) * COMPSIZE;
newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;
-
+
#ifndef USE_SIMPLE_THREADED_LEVEL3
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
#else
diff --git a/lapack/potrf/potrf_L_single.c b/lapack/potrf/potrf_L_single.c
index d6d1436..0edadf3 100644
--- a/lapack/potrf/potrf_L_single.c
+++ b/lapack/potrf/potrf_L_single.c
@@ -100,7 +100,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -129,7 +129,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
if (info) return info + j;
if (n - j - bk > 0) {
-
+
TRSM_OLTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb);
/* First tile */
@@ -147,9 +147,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
} else {
aa = sa;
}
-
+
GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, aa);
-
+
TRSM_KERNEL(min_i, bk, bk, dm1,
#ifdef COMPLEX
ZERO,
@@ -157,7 +157,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
aa,
sb,
a + (is + j * lda) * COMPSIZE, lda, 0);
-
+
SYRK_KERNEL_L(min_i, min_j, bk, dm1,
aa,
sb2,
@@ -172,7 +172,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
#ifdef COMPLEX
ZERO,
#endif
-
+
sa,
sb,
a + (is + j * lda) * COMPSIZE, lda, 0);
@@ -188,17 +188,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
is - j - bk);
#endif
}
-
+
for(js = j + bk + min_j; js < n; js += REAL_GEMM_R){
min_j = n - js;
if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R;
GEMM_OTCOPY(bk, min_j, a + (js + j * lda) * COMPSIZE, lda, sb2);
-
+
for (is = js; is < n; is += GEMM_P) {
min_i = n - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
+
#ifdef SHARED_ARRAY
if (is + min_i < js + min_j) {
@@ -207,7 +207,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
aa = sa;
}
-
+
SYRK_KERNEL_L(min_i, min_j, bk, dm1,
aa,
sb2,
@@ -217,7 +217,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
#else
GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
-
+
SYRK_KERNEL_L(min_i, min_j, bk, dm1,
sa,
sb2,
@@ -229,7 +229,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
}
}
-
+
}
return 0;
diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c
index 31da141..d9b7a88 100644
--- a/lapack/potrf/potrf_U_parallel.c
+++ b/lapack/potrf/potrf_U_parallel.c
@@ -55,7 +55,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -63,11 +63,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
if (args -> nthreads == 1) {
- info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
+ info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
return info;
}
@@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (blocking > GEMM_Q) blocking = GEMM_Q;
-
+
for (i = 0; i < n; i += blocking) {
bk = n - i;
if (bk > blocking) bk = blocking;
@@ -108,15 +108,15 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.n = n - i - bk;
newarg.a = a + (i + i * lda) * COMPSIZE;
newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
-
+
gemm_thread_n(mode | BLAS_TRANSA_T,
&newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);
-
+
newarg.n = n - i - bk;
newarg.k = bk;
newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE;
newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;
-
+
#ifndef USE_SIMPLE_THREADED_LEVEL3
HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
#else
diff --git a/lapack/potrf/potrf_U_single.c b/lapack/potrf/potrf_U_single.c
index aa445c5..7bdeb49 100644
--- a/lapack/potrf/potrf_U_single.c
+++ b/lapack/potrf/potrf_U_single.c
@@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
#ifdef SHARED_ARRAY
FLOAT *aa;
#endif
-
+
FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
+ GEMM_OFFSET_B);
@@ -109,14 +109,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
info = POTF2_U(args, NULL, range_n, sa, sb, 0);
return info;
}
-
+
blocking = GEMM_Q;
if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4;
-
+
for (j = 0; j < n; j += blocking) {
bk = n - j;
if (bk > blocking) bk = blocking;
-
+
if (!range_n) {
range_N[0] = j;
range_N[1] = j + bk;
@@ -124,29 +124,29 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
range_N[0] = range_n[0] + j;
range_N[1] = range_n[0] + j + bk;
}
-
+
info = CNAME(args, NULL, range_N, sa, sb, 0);
if (info) return info + j;
-
+
if (n - j - bk > 0) {
-
+
TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb);
-
+
for(js = j + bk; js < n; js += REAL_GEMM_R) {
min_j = n - js;
if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R;
-
+
for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+
GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE);
-
+
for (is = 0; is < bk; is += GEMM_P) {
min_i = bk - is;
if (min_i > GEMM_P) min_i = GEMM_P;
-
- TRSM_KERNEL (min_i, min_jj, bk, dm1,
+
+ TRSM_KERNEL (min_i, min_jj, bk, dm1,
#ifdef COMPLEX
ZERO,
#endif
@@ -158,14 +158,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
for (is = j + bk; is < js + min_j; is += min_i) {
min_i = js + min_j - is;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
#ifdef SHARED_ARRAY
if ((is >= js) && (is + min_i <= js + min_j)) {
aa = sb2 + bk * (is - js) * COMPSIZE;
@@ -176,18 +176,18 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
#else
GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa);
#endif
-
+
SYRK_KERNEL_U(min_i, min_j, bk,
- dm1,
+ dm1,
SA, sb2,
a + (is + js * lda) * COMPSIZE, lda,
is - js);
-
+
}
}
}
-
+
}
-
+
return 0;
}
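
The blocked and parallel potrf drivers above wrap that unblocked kernel in the standard right-looking scheme: factor a bk-wide diagonal block, TRSM the panel beneath it, then update the trailing matrix with a rank-bk SYRK/HERK. A serial sketch of that structure, reusing potf2_L_ref from the note above (the real drivers dispatch the TRSM and HERK stages through gemm_thread_m / gemm_thread_n and the job/buffer machinery instead):

/* Blocked right-looking lower Cholesky skeleton.  Illustrative only;
 * "blocking" plays the role of the GEMM_Q-derived block size above. */
static int potrf_L_blocked_ref(int n, double *a, int lda, int blocking)
{
  for (int i = 0; i < n; i += blocking) {
    int bk = n - i;
    if (bk > blocking) bk = blocking;

    /* 1. factor the diagonal block A(i:i+bk, i:i+bk)               */
    int info = potf2_L_ref(bk, a + i + i * lda, lda);
    if (info) return info + i;

    if (n - i - bk > 0) {
      /* 2. panel solve: A21 := A21 * L11^{-T}, row-wise forward
            substitution against the just-factored block            */
      for (int r = i + bk; r < n; r++)
        for (int c = i; c < i + bk; c++) {
          double s = a[r + c * lda];
          for (int k = i; k < c; k++)
            s -= a[r + k * lda] * a[c + k * lda];
          a[r + c * lda] = s / a[c + c * lda];
        }

      /* 3. trailing update (lower part only): A22 -= A21 * A21^T    */
      for (int c = i + bk; c < n; c++)
        for (int r = c; r < n; r++)
          for (int k = i; k < i + bk; k++)
            a[r + c * lda] -= a[r + k * lda] * a[c + k * lda];
    }
  }
  return 0;
}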
diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c
index 11f7f53..c3a7ced 100644
--- a/lapack/potrf/potrf_parallel.c
+++ b/lapack/potrf/potrf_parallel.c
@@ -42,7 +42,7 @@
#ifndef USE_SIMPLE_THREADED_LEVEL3
//The array of job_t may overflow the stack.
-//Instead, use malloc to alloc job_t.
+//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
@@ -189,19 +189,19 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
}
-
+
#ifndef LOWER
TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif
-
+
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
-
+
for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){
-
+
min_jj = MIN(m_to, xxx + div_n) - jjs;
-
+
#ifndef LOWER
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
#else
@@ -211,7 +211,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#ifndef LOWER
OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);
- TRSM_KERNEL (k, min_jj, k, dm1,
+ TRSM_KERNEL (k, min_jj, k, dm1,
#ifdef COMPLEX
ZERO,
#endif
@@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
a + jjs * COMPSIZE, lda, 0);
#endif
}
-
+
#ifndef LOWER
for (i = 0; i <= mypos; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
@@ -238,25 +238,25 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = mypos; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif
-
+
WMB;
}
-
+
min_i = m_to - m_from;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
#ifndef LOWER
ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#else
OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#endif
-
+
current = mypos;
#ifndef LOWER
@@ -266,47 +266,47 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
{
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
/* thread has to wait */
if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
-
+
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, lda, m_from, xxx);
-
+
if (m_from + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
-
+
#ifndef LOWER
current ++;
#else
current --;
#endif
}
-
+
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
-
+
if (min_i >= GEMM_P * 2) {
min_i = GEMM_P;
- } else
+ } else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
}
-
+
#ifndef LOWER
ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif
-
+
current = mypos;
-
+
#ifndef LOWER
while (current < args -> nthreads)
#else
@@ -314,18 +314,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
{
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
-
+
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
-
+
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, lda, is, xxx);
-
+
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
- }
+ }
#ifndef LOWER
current ++;
#else
@@ -333,7 +333,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
}
}
-
+
for (i = 0; i < args -> nthreads; i++) {
if (i != mypos) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
@@ -341,7 +341,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
}
}
-
+
return 0;
}
@@ -378,7 +378,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
#else
mode = BLAS_SINGLE | BLAS_REAL;
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -389,7 +389,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
-#endif
+#endif
#endif
newarg.m = args -> m;
@@ -409,7 +409,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
#endif
newarg.common = (void *)job;
-
+
n_from = 0;
n_to = args -> m;
@@ -424,17 +424,17 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
dnum = (double)n * (double)n /(double)nthreads;
while (i < n){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)i;
-
+
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
-
+
if (num_cpu == 0) width = n - ((n - width) & ~mask);
-
+
if ((width > n - i) || (width < mask)) width = n - i;
-
+
} else {
width = n - i;
}
@@ -449,7 +449,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
@@ -466,21 +466,21 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
dnum = (double)n * (double)n /(double)nthreads;
while (i < n){
-
+
if (nthreads - num_cpu > 1) {
-
+
double di = (double)i;
-
+
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
-
+
if ((width > n - i) || (width < mask)) width = n - i;
-
+
} else {
width = n - i;
}
range[num_cpu + 1] = range[num_cpu] + width;
-
+
queue[num_cpu].mode = mode;
queue[num_cpu].routine = inner_thread;
queue[num_cpu].args = &newarg;
@@ -489,7 +489,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
-
+
num_cpu ++;
i += width;
}
@@ -507,14 +507,14 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
}
}
}
-
+
queue[0].sa = sa;
queue[0].sb = sb;
queue[num_cpu - 1].next = NULL;
-
+
exec_blas(num_cpu, queue);
}
-
+
#ifdef USE_ALLOC_HEAP
free(job);
#endif
@@ -540,7 +540,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -548,14 +548,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
if (args -> nthreads == 1) {
#ifndef LOWER
- info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
+ info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
#else
- info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
+ info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
#endif
return info;
}
@@ -584,7 +584,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (blocking > GEMM_Q) blocking = GEMM_Q;
-
+
for (i = 0; i < n; i += blocking) {
bk = n - i;
if (bk > blocking) bk = blocking;
@@ -643,7 +643,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.k = bk;
newarg.a = a + (i + bk + i * lda) * COMPSIZE;
newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;
-
+
#if 0
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
#else
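
potrf_parallel.c above also shows how the trailing update is split across threads: column strips are sized so that (i + width)^2 - i^2 stays roughly constant, which balances the roughly triangular amount of work behind each strip. The width computation in isolation, as a standalone simplified sketch (mask stands for MAX(*GEMM_UNROLL_M, *GEMM_UNROLL_N) - 1 as in thread_driver; the num_cpu == 0 adjustment is omitted):

/* Sketch of the strip-width partition of n columns over nthreads
 * workers, simplified from thread_driver above. */
#include <math.h>
#include <stdio.h>

static void partition(long n, int nthreads, long mask)
{
  double dnum = (double)n * (double)n / (double)nthreads;
  long i = 0;
  for (int cpu = 0; i < n; cpu++) {
    long width;
    if (nthreads - cpu > 1) {
      double di = (double)i;
      width = ((long)(sqrt(di * di + dnum) - di) + mask) & ~mask;
      if (width > n - i || width < mask) width = n - i;
    } else {
      width = n - i;                 /* last worker takes the rest */
    }
    printf("cpu %d: columns [%ld, %ld)\n", cpu, i, i + width);
    i += width;
  }
}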
diff --git a/lapack/trti2/trti2_L.c b/lapack/trti2/trti2_L.c
index 47fb53d..f1c0ddf 100644
--- a/lapack/trti2/trti2_L.c
+++ b/lapack/trti2/trti2_L.c
@@ -56,7 +56,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -77,7 +77,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
1, sb);
SCAL_K(n - j - 1, 0, 0,
- -ajj,
+ -ajj,
a + (j + 1) + j * lda, 1,
NULL, 0, NULL, 0);
}
diff --git a/lapack/trti2/trti2_U.c b/lapack/trti2/trti2_U.c
index f43cecd..376be73 100644
--- a/lapack/trti2/trti2_U.c
+++ b/lapack/trti2/trti2_U.c
@@ -56,7 +56,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -72,12 +72,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
#endif
TRMV (j,
- a , lda,
+ a , lda,
a + j * lda, 1,
sb);
- SCAL_K(j, 0, 0,
- -ajj,
+ SCAL_K(j, 0, 0,
+ -ajj,
a + j * lda, 1,
NULL, 0, NULL, 0);
diff --git a/lapack/trti2/ztrti2_L.c b/lapack/trti2/ztrti2_L.c
index fd19be2..819bff2 100644
--- a/lapack/trti2/ztrti2_L.c
+++ b/lapack/trti2/ztrti2_L.c
@@ -59,7 +59,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -92,9 +92,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
ZTRMV (n - j - 1,
a + ((j + 1) + (j + 1) * lda) * COMPSIZE, lda,
- a + ((j + 1) + j * lda) * COMPSIZE, 1,
+ a + ((j + 1) + j * lda) * COMPSIZE, 1,
sb);
-
+
SCAL_K(n - j - 1, 0, 0,
-ajj_r, -ajj_i,
a + ((j + 1) + j * lda) * COMPSIZE, 1,
diff --git a/lapack/trti2/ztrti2_U.c b/lapack/trti2/ztrti2_U.c
index d85b327..972329a 100644
--- a/lapack/trti2/ztrti2_U.c
+++ b/lapack/trti2/ztrti2_U.c
@@ -59,7 +59,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
-
+
if (range_n) {
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
@@ -92,15 +92,15 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
#endif
ZTRMV (j,
- a , lda,
+ a , lda,
a + j * lda * COMPSIZE, 1,
sb);
-
- SCAL_K(j, 0, 0,
+
+ SCAL_K(j, 0, 0,
-ajj_r, -ajj_i,
a + j * lda * COMPSIZE, 1,
NULL, 0, NULL, 0);
-
+
}
return 0;
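
trti2 is the unblocked in-place inversion of a triangular matrix: each iteration turns column j into column j of the inverse using the already-inverted leading block, the recurrence the TRMV/SCAL_K calls above drive. A reference for the upper, non-unit-diagonal case (column major, a(r,c) = a[r + c*lda]):

/* In-place inverse of an upper triangular matrix, unblocked.
 * Illustrative sketch only. */
static void trti2_U_ref(int n, double *a, int lda)
{
  for (int j = 0; j < n; j++) {
    a[j + j * lda] = 1.0 / a[j + j * lda];
    double ajj = -a[j + j * lda];

    /* x := Uinv(0:j-1,0:j-1) * U(0:j-1,j); the leading block already
       holds its own inverse from earlier iterations                  */
    for (int r = 0; r < j; r++) {
      double s = 0.0;
      for (int k = r; k < j; k++)
        s += a[r + k * lda] * a[k + j * lda];
      a[r + j * lda] = s;
    }
    for (int r = 0; r < j; r++)
      a[r + j * lda] *= ajj;               /* scale by -1/u(j,j) */
  }
}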
diff --git a/lapack/trtri/trtri_L_parallel.c b/lapack/trtri/trtri_L_parallel.c
index 5969eb6..5dc60b8 100644
--- a/lapack/trtri/trtri_L_parallel.c
+++ b/lapack/trtri/trtri_L_parallel.c
@@ -67,7 +67,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -75,7 +75,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
n = args -> n;
@@ -99,7 +99,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
for (i = start_i; i >= 0; i -= blocking) {
bk = n - i;
if (bk > blocking) bk = blocking;
-
+
range_N[0] = i;
range_N[1] = i + bk;
@@ -124,7 +124,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.a = a + (i + i * lda) * COMPSIZE;
CNAME (&newarg, NULL, NULL, sa, sb, 0);
-
+
newarg.m = n - bk - i;
newarg.n = i;
newarg.k = bk;
diff --git a/lapack/trtri/trtri_U_parallel.c b/lapack/trtri/trtri_U_parallel.c
index 8761a40..fc48a33 100644
--- a/lapack/trtri/trtri_U_parallel.c
+++ b/lapack/trtri/trtri_U_parallel.c
@@ -67,7 +67,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
-#endif
+#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
@@ -75,7 +75,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
+#endif
#endif
n = args -> n;
@@ -120,7 +120,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
newarg.a = a + (i + i * lda) * COMPSIZE;
CNAME (&newarg, NULL, NULL, sa, sb, 0);
-
+
newarg.m = i;
newarg.n = n - i - bk;
newarg.k = bk;
@@ -142,6 +142,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads);
}
-
+
return 0;
}
diff --git a/make.inc b/make.inc
index affae3a..485cb7d 100644
--- a/make.inc
+++ b/make.inc
@@ -1,7 +1,6 @@
SHELL = /bin/sh
PLAT = _LINUX
DRVOPTS = $(OPTS)
-LOADER = $(FORTRAN) -pthread
ARCHFLAGS= -ru
#RANLIB = ranlib
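
The param.h hunks that follow retune the per-architecture GEMM blocking constants: *GEMM_DEFAULT_P and _Q bound the M- and K-dimension blocks packed into the sa/sb work buffers, *GEMM_DEFAULT_R bounds the N dimension, and *GEMM_DEFAULT_UNROLL_M/_N give the register-tile shape of the micro-kernel. A simplified single-threaded sketch of how such constants are consumed (local names MC/KC are illustrative stand-ins, set here to the new SGEMM values from the hunks below):

/* Cache-blocked C += A*B, column major.  A plain triple loop stands in
 * for the packed UNROLL_M x UNROLL_N micro-kernel of the real code. */
enum { MC = 768 /* ~ SGEMM_DEFAULT_P */, KC = 384 /* ~ SGEMM_DEFAULT_Q */ };

static void gemm_blocked_ref(int m, int n, int k,
                             const float *A, int lda,
                             const float *B, int ldb,
                             float *C, int ldc)
{
  for (int ks = 0; ks < k; ks += KC) {
    int kb = k - ks; if (kb > KC) kb = KC;
    for (int is = 0; is < m; is += MC) {
      int ib = m - is; if (ib > MC) ib = MC;
      /* the real code packs A(is:is+ib, ks:ks+kb) and panels of B here */
      for (int j = 0; j < n; j++)
        for (int i = is; i < is + ib; i++) {
          float s = 0.0f;
          for (int p = ks; p < ks + kb; p++)
            s += A[i + p * lda] * B[p + j * ldb];
          C[i + j * ldc] += s;
        }
    }
  }
}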
diff --git a/param.h b/param.h
index ae40ac1..880219b 100644
--- a/param.h
+++ b/param.h
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DGEMM_DEFAULT_P 224
#define QGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
-#define ZGEMM_DEFAULT_P 112
+#define ZGEMM_DEFAULT_P 112
#define XGEMM_DEFAULT_P 56
#define SGEMM_DEFAULT_Q 224
@@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define QGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
-#define ZGEMM_DEFAULT_P 112
+#define ZGEMM_DEFAULT_P 112
#define XGEMM_DEFAULT_P 56
#if defined(ARCH_X86_64)
@@ -351,7 +351,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#define SGEMM_DEFAULT_P 448
#define DGEMM_DEFAULT_P 480
-#define ZGEMM_DEFAULT_P 112
+#define ZGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
#endif
#define QGEMM_DEFAULT_P 112
@@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_UNROLL_N 1
#else
#define SGEMM_DEFAULT_UNROLL_M 4
-#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 8
-#define DGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 8
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4
@@ -1073,6 +1073,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GETRF_FACTOR 0.72
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
+#define ZGEMM3M_DEFAULT_UNROLL_N 2
+#define ZGEMM3M_DEFAULT_UNROLL_M 8
#endif
@@ -1104,22 +1108,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#else
-#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1
-#define SGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
-#define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 4
#define XGEMM_DEFAULT_UNROLL_N 1
#endif
-#define SGEMM_DEFAULT_P 512
+#define SGEMM_DEFAULT_P 768
#define SGEMM_DEFAULT_R sgemm_r
//#define SGEMM_DEFAULT_R 1024
@@ -1130,7 +1134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define QGEMM_DEFAULT_P 504
#define QGEMM_DEFAULT_R qgemm_r
-#define CGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 384
//#define CGEMM_DEFAULT_R cgemm_r
#define CGEMM_DEFAULT_R 1024
@@ -1141,13 +1145,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_P 252
#define XGEMM_DEFAULT_R xgemm_r
-#define SGEMM_DEFAULT_Q 256
+#define SGEMM_DEFAULT_Q 384
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 256
+#define CGEMM_DEFAULT_Q 192
#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
+#define ZGEMM3M_DEFAULT_UNROLL_N 2
+#define ZGEMM3M_DEFAULT_UNROLL_M 8
+
#define GETRF_FACTOR 0.72
#endif
@@ -1216,7 +1225,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_Q 256
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 256
+#define CGEMM_DEFAULT_Q 256
#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128
@@ -1233,7 +1242,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#define DGEMM_DEFAULT_Q 256
#endif
-#define CGEMM_DEFAULT_Q 192
+#define CGEMM_DEFAULT_Q 192
#define ZGEMM_DEFAULT_Q 128
#define SGEMM_DEFAULT_R sgemm_r
@@ -1817,7 +1826,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif
-#ifdef LOONGSON3A
+#ifdef LOONGSON3A
////Copy from SICORTEX
#define SNUMOPT 2
#define DNUMOPT 2
@@ -1839,7 +1848,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 64
-#define DGEMM_DEFAULT_P 44
+#define DGEMM_DEFAULT_P 44
#define CGEMM_DEFAULT_P 64
#define ZGEMM_DEFAULT_P 32
@@ -1848,8 +1857,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 80
-#define SGEMM_DEFAULT_R 640
-#define DGEMM_DEFAULT_R dgemm_r
+#define SGEMM_DEFAULT_R 640
+#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 640
#define ZGEMM_DEFAULT_R 640
@@ -1890,7 +1899,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_Q 64
#define SGEMM_DEFAULT_R 512
-#define DGEMM_DEFAULT_R 512
+#define DGEMM_DEFAULT_R 512
#define CGEMM_DEFAULT_R 512
#define ZGEMM_DEFAULT_R 512
diff --git a/reference/Makefile b/reference/Makefile
index d6368dc..fb52c86 100644
--- a/reference/Makefile
+++ b/reference/Makefile
@@ -37,7 +37,7 @@ SBLAS2OBJS = \
SBLAS3OBJS = \
sgemmf.$(SUFFIX) ssymmf.$(SUFFIX) strmmf.$(SUFFIX) \
- strsmf.$(SUFFIX) ssyrkf.$(SUFFIX) ssyr2kf.$(SUFFIX)
+ strsmf.$(SUFFIX) ssyrkf.$(SUFFIX) ssyr2kf.$(SUFFIX)
DBLAS1OBJS = \
daxpyf.$(SUFFIX) dswapf.$(SUFFIX) \
@@ -59,7 +59,7 @@ DBLAS2OBJS = \
DBLAS3OBJS = \
dgemmf.$(SUFFIX) dsymmf.$(SUFFIX) dtrmmf.$(SUFFIX) \
- dtrsmf.$(SUFFIX) dsyrkf.$(SUFFIX) dsyr2kf.$(SUFFIX)
+ dtrsmf.$(SUFFIX) dsyrkf.$(SUFFIX) dsyr2kf.$(SUFFIX)
CBLAS1OBJS = \
caxpyf.$(SUFFIX) caxpycf.$(SUFFIX) cswapf.$(SUFFIX) \
@@ -140,7 +140,7 @@ DBLASOBJS += \
dpotf2f.$(SUFFIX) dpotrff.$(SUFFIX) dtrti2f.$(SUFFIX) dtrtrif.$(SUFFIX) \
dlaswpf.$(SUFFIX) dgetrsf.$(SUFFIX) dgesvf.$(SUFFIX) dpotrif.$(SUFFIX) \
-QBLASOBJS +=
+QBLASOBJS +=
# \
qgetf2f.$(SUFFIX) qgetrff.$(SUFFIX) qlauu2f.$(SUFFIX) qlauumf.$(SUFFIX) \
qpotf2f.$(SUFFIX) qpotrff.$(SUFFIX) qtrti2f.$(SUFFIX) qtrtrif.$(SUFFIX) \
@@ -156,7 +156,7 @@ ZBLASOBJS += \
zpotf2f.$(SUFFIX) zpotrff.$(SUFFIX) ztrti2f.$(SUFFIX) ztrtrif.$(SUFFIX) \
zlaswpf.$(SUFFIX) zgetrsf.$(SUFFIX) zgesvf.$(SUFFIX) zpotrif.$(SUFFIX) \
-XBLASOBJS +=
+XBLASOBJS +=
# \
xgetf2f.$(SUFFIX) xgetrff.$(SUFFIX) xlauu2f.$(SUFFIX) xlauumf.$(SUFFIX) \
xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \
diff --git a/reference/cspmvf.f b/reference/cspmvf.f
index 7f357c6..e32b490 100644
--- a/reference/cspmvf.f
+++ b/reference/cspmvf.f
@@ -78,7 +78,7 @@
* supplied as zero then Y need not be set on input.
* Unchanged on exit.
*
-* Y (input/output) COMPLEX array, dimension at least
+* Y (input/output) COMPLEX array, dimension at least
* ( 1 + ( N - 1 )*abs( INCY ) ).
* Before entry, the incremented array Y must contain the n
* element vector y. On exit, Y is overwritten by the updated
diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f
index cd29ec5..3402342 100644
--- a/reference/ctpmvf.f
+++ b/reference/ctpmvf.f
@@ -140,7 +140,7 @@
IF( N.EQ.0 )
$ RETURN
*
- NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' )
+ NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' )
NOUNIT = LSAME( DIAG , 'N' )
*
* Set up the start point in X if the increment is not unity. This
diff --git a/reference/sgetrff.f b/reference/sgetrff.f
index 139e7de..8923869 100644
--- a/reference/sgetrff.f
+++ b/reference/sgetrff.f
@@ -3,7 +3,7 @@
* -- LAPACK routine (version 3.0) --
* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
* Courant Institute, Argonne National Lab, and Rice University
-* March 31, 1993
+* March 31, 1993
*
* .. Scalar Arguments ..
INTEGER INFO, LDA, M, N
diff --git a/reference/sgetrsf.f b/reference/sgetrsf.f
index f009218..0f14aed 100644
--- a/reference/sgetrsf.f
+++ b/reference/sgetrsf.f
@@ -3,7 +3,7 @@
* -- LAPACK routine (version 3.0) --
* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
* Courant Institute, Argonne National Lab, and Rice University
-* March 31, 1993
+* March 31, 1993
*
* .. Scalar Arguments ..
CHARACTER TRANS
diff --git a/reference/spotrff.f b/reference/spotrff.f
index 0a49251..7297c81 100644
--- a/reference/spotrff.f
+++ b/reference/spotrff.f
@@ -3,7 +3,7 @@
* -- LAPACK routine (version 3.0) --
* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
* Courant Institute, Argonne National Lab, and Rice University
-* March 31, 1993
+* March 31, 1993
*
* .. Scalar Arguments ..
CHARACTER UPLO
diff --git a/reference/strtrif.f b/reference/strtrif.f
index 27e3234..39919e9 100644
--- a/reference/strtrif.f
+++ b/reference/strtrif.f
@@ -3,7 +3,7 @@
* -- LAPACK routine (version 3.0) --
* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
* Courant Institute, Argonne National Lab, and Rice University
-* March 31, 1993
+* March 31, 1993
*
* .. Scalar Arguments ..
CHARACTER DIAG, UPLO
diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f
index d050272..7e52ef7 100644
--- a/reference/ztpmvf.f
+++ b/reference/ztpmvf.f
@@ -140,7 +140,7 @@
IF( N.EQ.0 )
$ RETURN
*
- NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' )
+ NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' )
NOUNIT = LSAME( DIAG , 'N' )
*
* Set up the start point in X if the increment is not unity. This
diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f
index db0f9ca..9e4f853 100644
--- a/reference/ztrmvf.f
+++ b/reference/ztrmvf.f
@@ -174,7 +174,7 @@
X( I ) = X( I ) + TEMP*A( I, J )
ELSE
X( I ) = X( I ) + TEMP*DCONJG(A( I, J ))
- ENDIF
+ ENDIF
10 CONTINUE
IF (NOCONJ) THEN
IF( NOUNIT )
diff --git a/segfaults.patch b/segfaults.patch
index 375ab76..0087f77 100644
--- a/segfaults.patch
+++ b/segfaults.patch
@@ -9,4 +9,4 @@ diff -ruN common_linux.h.orig common_linux.h
+ return 0;
#endif
}
-
+
diff --git a/symcopy.h b/symcopy.h
index ed6e5b4..48ccbd3 100644
--- a/symcopy.h
+++ b/symcopy.h
@@ -61,11 +61,11 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda + 2;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 1 * m;
b1 += 2 * m + 2;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 1 * m;
b2 += 2 * m + 2;
@@ -74,9 +74,9 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
-
+
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a21;
@@ -85,7 +85,7 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa2 += 2;
bb1 += 2;
bb2 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
@@ -96,10 +96,10 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
aa1 += 2;
aa2 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
@@ -124,7 +124,7 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (is == 1){
a11 = *(aa1 + 0);
a12 = *(aa2 + 0);
-
+
*(bb1 + 0) = a11;
*(bb2 + 0) = a12;
@@ -132,7 +132,7 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc1 + 1) = a12;
}
}
-
+
if (m - js == 1){
a11 = *(aa1 + 0);
*(bb1 + 0) = a11;
@@ -159,11 +159,11 @@ static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 1 * m;
b1 += 2 * m;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 1 * m;
b2 += 2;
@@ -171,7 +171,7 @@ static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m - js >= 2){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
@@ -179,48 +179,48 @@ static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 += 2;
aa2 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
*(bb2 + 1) = a22;
-
+
*(cc1 + 0) = a11;
*(cc1 + 1) = a12;
*(cc2 + 0) = a21;
*(cc2 + 1) = a22;
-
+
bb1 += 2;
bb2 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
}
a11 = *(aa1 + 0);
-
+
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a12;
*(bb2 + 0) = a12;
*(bb2 + 1) = a22;
}
-
+
if (m - js == 1){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
aa1 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(cc1 + 0) = a11;
*(cc2 + 0) = a21;
bb1 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
}
@@ -252,11 +252,11 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda + 4;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m + 4;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4 * m + 4;
@@ -267,10 +267,10 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
-
+
a12 = *(aa2 + 2);
a22 = *(aa2 + 3);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -285,7 +285,7 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa2 += 4;
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
@@ -301,10 +301,10 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
a42 = *(aa2 + 3);
-
+
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -339,7 +339,7 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
@@ -351,7 +351,7 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc1 + 3) = a22;
}
}
-
+
if (m - js == 1){
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
@@ -382,11 +382,11 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4;
@@ -394,7 +394,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m - js >= 2){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
@@ -407,7 +407,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -417,7 +417,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 1) = a22;
*(bb2 + 2) = a32;
*(bb2 + 3) = a42;
-
+
*(cc1 + 0) = a11;
*(cc1 + 1) = a21;
*(cc1 + 2) = a12;
@@ -427,22 +427,22 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 1) = a41;
*(cc2 + 2) = a32;
*(cc2 + 3) = a42;
-
+
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
-
+
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
a42 = *(aa2 + 3);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a12;
@@ -453,16 +453,16 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 2) = a32;
*(bb2 + 3) = a42;
}
-
+
if (m - js == 1){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
aa1 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -473,7 +473,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 0) = a31;
*(cc2 + 1) = a41;
bb1 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
@@ -506,11 +506,11 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda + 4;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m + 4;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4 * m + 4;
@@ -520,9 +520,9 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a11 = *(aa1 + 0);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
-
+
a12 = *(aa2 + 2);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = 0.;
*(bb1 + 2) = a31;
@@ -537,7 +537,7 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa2 += 4;
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
@@ -553,10 +553,10 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
a42 = *(aa2 + 3);
-
+
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -591,7 +591,7 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
@@ -603,7 +603,7 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc1 + 3) = -a22;
}
}
-
+
if (m - js == 1){
a11 = *(aa1 + 0);
*(bb1 + 0) = a11;
@@ -633,11 +633,11 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4;
@@ -645,7 +645,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m - js >= 2){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
@@ -658,7 +658,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -668,7 +668,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 1) = a22;
*(bb2 + 2) = a32;
*(bb2 + 3) = a42;
-
+
*(cc1 + 0) = a11;
*(cc1 + 1) = -a21;
*(cc1 + 2) = a12;
@@ -678,20 +678,20 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 1) = -a41;
*(cc2 + 2) = a32;
*(cc2 + 3) = -a42;
-
+
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
a11 = *(aa1 + 0);
-
+
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = 0.;
*(bb1 + 2) = a12;
@@ -702,16 +702,16 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 2) = a32;
*(bb2 + 3) = 0.;
}
-
+
if (m - js == 1){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
aa1 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -722,7 +722,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 0) = a31;
*(cc2 + 1) = -a41;
bb1 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
@@ -755,11 +755,11 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda + 4;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m + 4;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4 * m + 4;
@@ -769,9 +769,9 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a11 = *(aa1 + 0);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
-
+
a12 = *(aa2 + 2);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = 0.;
*(bb1 + 2) = a31;
@@ -786,7 +786,7 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa2 += 4;
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
@@ -802,10 +802,10 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
a42 = *(aa2 + 3);
-
+
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = -a21;
*(bb1 + 2) = a31;
@@ -840,7 +840,7 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = -a21;
*(bb2 + 0) = a12;
@@ -852,7 +852,7 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc1 + 3) = a22;
}
}
-
+
if (m - js == 1){
a11 = *(aa1 + 0);
*(bb1 + 0) = a11;
@@ -882,11 +882,11 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4;
@@ -894,7 +894,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m - js >= 2){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
@@ -907,7 +907,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = -a21;
*(bb1 + 2) = a31;
@@ -917,7 +917,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 1) = -a22;
*(bb2 + 2) = a32;
*(bb2 + 3) = -a42;
-
+
*(cc1 + 0) = a11;
*(cc1 + 1) = a21;
*(cc1 + 2) = a12;
@@ -927,20 +927,20 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 1) = a41;
*(cc2 + 2) = a32;
*(cc2 + 3) = a42;
-
+
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
a11 = *(aa1 + 0);
-
+
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = 0.;
*(bb1 + 2) = a12;
@@ -951,16 +951,16 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 2) = a32;
*(bb2 + 3) = 0.;
}
-
+
if (m - js == 1){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
aa1 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = -a21;
*(bb1 + 2) = a31;
@@ -971,7 +971,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 0) = a31;
*(cc2 + 1) = a41;
bb1 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
@@ -1002,11 +1002,11 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda + 2;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 1 * m;
b1 += 2 * m + 2;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 1 * m;
b2 += 2 * m + 2;
@@ -1015,9 +1015,9 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
-
+
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a21;
@@ -1026,7 +1026,7 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa2 += 2;
bb1 += 2;
bb2 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
@@ -1037,10 +1037,10 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
aa1 += 2;
aa2 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
@@ -1065,7 +1065,7 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (is == 1){
a11 = *(aa1 + 0);
a12 = *(aa2 + 0);
-
+
*(bb1 + 0) = a11;
*(bb2 + 0) = a12;
@@ -1073,7 +1073,7 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc1 + 1) = a12;
}
}
-
+
if (m - js == 1){
a11 = *(aa1 + 0);
*(bb1 + 0) = a11;
@@ -1100,11 +1100,11 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda + 2;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 1 * m;
b1 += 2 * m + 2;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 1 * m;
b2 += 2 * m + 2;
@@ -1113,9 +1113,9 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
-
+
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a21;
@@ -1124,7 +1124,7 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa2 += 2;
bb1 += 2;
bb2 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
@@ -1135,10 +1135,10 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
aa1 += 2;
aa2 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
@@ -1163,7 +1163,7 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (is == 1){
a11 = *(aa1 + 0);
a12 = *(aa2 + 0);
-
+
*(bb1 + 0) = a11;
*(bb2 + 0) = a12;
@@ -1171,7 +1171,7 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc1 + 1) = a12;
}
}
-
+
if (m - js == 1){
a11 = *(aa1 + 0);
*(bb1 + 0) = a11;
@@ -1198,11 +1198,11 @@ static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 1 * m;
b1 += 2 * m;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 1 * m;
b2 += 2;
@@ -1210,7 +1210,7 @@ static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m - js >= 2){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
@@ -1218,48 +1218,48 @@ static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 += 2;
aa2 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
*(bb2 + 1) = a22;
-
+
*(cc1 + 0) = a11;
*(cc1 + 1) = a12;
*(cc2 + 0) = a21;
*(cc2 + 1) = a22;
-
+
bb1 += 2;
bb2 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
}
a11 = *(aa1 + 0);
-
+
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a12;
*(bb2 + 0) = a12;
*(bb2 + 1) = a22;
}
-
+
if (m - js == 1){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
aa1 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(cc1 + 0) = a11;
*(cc2 + 0) = a21;
bb1 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
}
@@ -1288,11 +1288,11 @@ static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 1 * m;
b1 += 2 * m;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 1 * m;
b2 += 2;
@@ -1300,7 +1300,7 @@ static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m - js >= 2){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
@@ -1308,48 +1308,48 @@ static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 += 2;
aa2 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
*(bb2 + 1) = a22;
-
+
*(cc1 + 0) = a11;
*(cc1 + 1) = a12;
*(cc2 + 0) = a21;
*(cc2 + 1) = a22;
-
+
bb1 += 2;
bb2 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
}
a11 = *(aa1 + 0);
-
+
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a12;
*(bb2 + 0) = a12;
*(bb2 + 1) = a22;
}
-
+
if (m - js == 1){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
aa1 += 2;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(cc1 + 0) = a11;
*(cc2 + 0) = a21;
bb1 += 2;
-
+
cc1 += 2 * m;
cc2 += 2 * m;
}
@@ -1380,11 +1380,11 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda + 4;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m + 4;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4 * m + 4;
@@ -1395,10 +1395,10 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
-
+
a12 = *(aa2 + 2);
a22 = *(aa2 + 3);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -1413,7 +1413,7 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa2 += 4;
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
@@ -1429,10 +1429,10 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
a42 = *(aa2 + 3);
-
+
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -1467,7 +1467,7 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
@@ -1479,7 +1479,7 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc1 + 3) = a22;
}
}
-
+
if (m - js == 1){
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
@@ -1510,11 +1510,11 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda + 4;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m + 4;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4 * m + 4;
@@ -1525,10 +1525,10 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
-
+
a12 = *(aa2 + 2);
a22 = *(aa2 + 3);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -1543,7 +1543,7 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa2 += 4;
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
@@ -1559,10 +1559,10 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
a42 = *(aa2 + 3);
-
+
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -1597,7 +1597,7 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
a21 = *(aa1 + 1);
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb2 + 0) = a12;
@@ -1609,7 +1609,7 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc1 + 3) = a22;
}
}
-
+
if (m - js == 1){
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
@@ -1640,11 +1640,11 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4;
@@ -1652,7 +1652,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m - js >= 2){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
@@ -1665,7 +1665,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -1675,7 +1675,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 1) = a22;
*(bb2 + 2) = a32;
*(bb2 + 3) = a42;
-
+
*(cc1 + 0) = a11;
*(cc1 + 1) = a21;
*(cc1 + 2) = a12;
@@ -1685,22 +1685,22 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 1) = a41;
*(cc2 + 2) = a32;
*(cc2 + 3) = a42;
-
+
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
-
+
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
a42 = *(aa2 + 3);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a12;
@@ -1711,16 +1711,16 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 2) = a32;
*(bb2 + 3) = a42;
}
-
+
if (m - js == 1){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
aa1 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -1731,7 +1731,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 0) = a31;
*(cc2 + 1) = a41;
bb1 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
@@ -1764,11 +1764,11 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 = a + 0 * lda;
aa2 = a + 1 * lda;
a += 2 * lda;
-
+
bb1 = b1 + 0 * m;
bb2 = b1 + 2 * m;
b1 += 4 * m;
-
+
cc1 = b2 + 0 * m;
cc2 = b2 + 2 * m;
b2 += 4;
@@ -1776,7 +1776,7 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
if (m - js >= 2){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
@@ -1789,7 +1789,7 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
aa1 += 4;
aa2 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -1799,7 +1799,7 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 1) = a22;
*(bb2 + 2) = a32;
*(bb2 + 3) = a42;
-
+
*(cc1 + 0) = a11;
*(cc1 + 1) = a21;
*(cc1 + 2) = a12;
@@ -1809,22 +1809,22 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 1) = a41;
*(cc2 + 2) = a32;
*(cc2 + 3) = a42;
-
+
bb1 += 4;
bb2 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
-
+
a12 = *(aa2 + 0);
a22 = *(aa2 + 1);
a32 = *(aa2 + 2);
a42 = *(aa2 + 3);
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a12;
@@ -1835,16 +1835,16 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(bb2 + 2) = a32;
*(bb2 + 3) = a42;
}
-
+
if (m - js == 1){
for (is = 0; is < js; is += 2){
-
+
a11 = *(aa1 + 0);
a21 = *(aa1 + 1);
a31 = *(aa1 + 2);
a41 = *(aa1 + 3);
aa1 += 4;
-
+
*(bb1 + 0) = a11;
*(bb1 + 1) = a21;
*(bb1 + 2) = a31;
@@ -1855,7 +1855,7 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
*(cc2 + 0) = a31;
*(cc2 + 1) = a41;
bb1 += 4;
-
+
cc1 += 4 * m;
cc2 += 4 * m;
}
diff --git a/test/Makefile b/test/Makefile
index 0bc06e8..801efe2 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -89,7 +89,7 @@ endif
endif
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
-CEXTRALIB =
+CEXTRALIB =
sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME)
diff --git a/test/get_threading_model.c b/test/get_threading_model.c
index 9a6835b..3f34a33 100644
--- a/test/get_threading_model.c
+++ b/test/get_threading_model.c
@@ -1,18 +1,18 @@
-#include "../cblas.h"
+#include "../cblas.h"
int main() {
- int th_model = openblas_get_parallel();
+ int th_model = openblas_get_parallel();
switch(th_model) {
- case OPENBLAS_SEQUENTIAL:
- printf("OpenBLAS is compiled sequentially.\n");
- break;
- case OPENBLAS_THREAD:
- printf("OpenBLAS is compiled using the normal threading model\n");
- break;
- case OPENBLAS_OPENMP:
- printf("OpenBLAS is compiled using OpenMP\n");
- break;
+ case OPENBLAS_SEQUENTIAL:
+ printf("OpenBLAS is compiled sequentially.\n");
+ break;
+ case OPENBLAS_THREAD:
+ printf("OpenBLAS is compiled using the normal threading model\n");
+ break;
+ case OPENBLAS_OPENMP:
+ printf("OpenBLAS is compiled using OpenMP\n");
+ break;
}
- return 0;
+ return 0;
}
diff --git a/test/sblat2.f b/test/sblat2.f
index 057a854..a1074be 100644
--- a/test/sblat2.f
+++ b/test/sblat2.f
@@ -2886,7 +2886,7 @@
WRITE( NOUT, FMT = 9998 )I, YT( I ),
$ YY( 1 + ( I - 1 )*ABS( INCY ) )
ELSE
- WRITE( NOUT, FMT = 9998 )I,
+ WRITE( NOUT, FMT = 9998 )I,
$ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I)
END IF
60 CONTINUE
diff --git a/utest/Makefile b/utest/Makefile
index 38ebb03..fa05458 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -1,15 +1,19 @@
UTEST_CHECK = 1
TOPDIR = ..
-include $(TOPDIR)/Makefile.system
TARGET=openblas_utest
+.PHONY : all
+.NOTPARALLEL : all run_test $(TARGET)
+
CUNIT_URL=http://downloads.sourceforge.net/project/cunit/CUnit/2.1-2/CUnit-2.1-2-src.tar.bz2
CUNIT_DIR=$(CURDIR)/CUnit-2.1-2
CUNIT_LIB=$(CUNIT_DIR)/lib/libcunit.a
-CFLAGS+=-I$(CUNIT_DIR)/include
+CFLAGS +=-I$(CUNIT_DIR)/include
+
+include $(TOPDIR)/Makefile.system
OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o test_amax.o test_fork.o
@@ -45,7 +49,7 @@ run_test: $(TARGET)
./$(TARGET)
clean:
- -rm -f *.o $(TARGET)
+ -rm -f *.o $(TARGET)
-rm -rf $(CUNIT_DIR)
libs:
diff --git a/utest/common_utest.h b/utest/common_utest.h
index f3841c5..51f04ca 100644
--- a/utest/common_utest.h
+++ b/utest/common_utest.h
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
diff --git a/utest/main.c b/utest/main.c
index ca50e47..7fb5811 100644
--- a/utest/main.c
+++ b/utest/main.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -81,29 +81,29 @@ int main()
CU_ErrorCode error;
if (CUE_SUCCESS != CU_initialize_registry())
return CU_get_error();
-
+
error=CU_register_suites(suites);
-
+
if (error != CUE_SUCCESS) {
perror(CU_get_error_msg());
CU_cleanup_registry();
return CU_get_error();
-
+
}
-
-
+
+
printf("Seting OK\n");
fflush(stdout);
-
+
/* Run all tests using the CUnit Basic interface */
CU_basic_set_mode(CU_BRM_VERBOSE);
-
+
CU_basic_run_tests();
-
+
CU_cleanup_registry();
-
+
return CU_get_error();
-
+
}
diff --git a/utest/test_amax.c b/utest/test_amax.c
index 8d16385..fcc9343 100644
--- a/utest/test_amax.c
+++ b/utest/test_amax.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -41,6 +41,6 @@ void test_samax()
te_max=BLASFUNC(samax)(&N, x, &inc);
tr_max=BLASFUNC_REF(samax)(&N, x, &inc);
-
+
CU_ASSERT_DOUBLE_EQUAL(te_max, tr_max, CHECK_EPS);
}
diff --git a/utest/test_axpy.c b/utest/test_axpy.c
index a141d7a..0355973 100644
--- a/utest/test_axpy.c
+++ b/utest/test_axpy.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
diff --git a/utest/test_dotu.c b/utest/test_dotu.c
index 60bb3a6..aef1005 100644
--- a/utest/test_dotu.c
+++ b/utest/test_dotu.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -50,7 +50,7 @@ void test_zdotu_n_1(void)
CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS);
CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS);
// printf("\%lf,%lf\n",creal(result1),cimag(result1));
-
+
}
void test_zdotu_offset_1(void)
@@ -70,6 +70,6 @@ void test_zdotu_offset_1(void)
CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS);
CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS);
// printf("\%lf,%lf\n",creal(result1),cimag(result1));
-
+
}
diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c
index 8df7380..41b62c2 100644
--- a/utest/test_dsdot.c
+++ b/utest/test_dsdot.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -39,12 +39,12 @@ void test_dsdot_n_1()
int incx=1;
int incy=1;
int n=1;
-
+
double res1=0.0f, res2=0.0f;
res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy);
res2=BLASFUNC_REF(dsdot)(&n, &x, &incx, &y, &incy);
CU_ASSERT_DOUBLE_EQUAL(res1, res2, CHECK_EPS);
-
+
}
diff --git a/utest/test_fork.c b/utest/test_fork.c
index 1d8804a..6e99d14 100644
--- a/utest/test_fork.c
+++ b/utest/test_fork.c
@@ -61,20 +61,20 @@ void test_fork_safety(void)
{
int n = 1000;
int i;
-
+
double *a, *b, *c, *d;
size_t n_bytes;
-
+
pid_t fork_pid;
pid_t fork_pid_nested;
n_bytes = sizeof(*a) * n * n;
-
+
a = xmalloc(n_bytes);
b = xmalloc(n_bytes);
c = xmalloc(n_bytes);
d = xmalloc(n_bytes);
-
+
// Put ones in a and b
for(i = 0; i < n * n; ++i) {
a[i] = 1;
diff --git a/utest/test_rot.c b/utest/test_rot.c
index f5332d4..988f54e 100644
--- a/utest/test_rot.c
+++ b/utest/test_rot.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c
index 9a1a3d0..bb03c27 100644
--- a/utest/test_rotmg.c
+++ b/utest/test_rotmg.c
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
@@ -54,7 +54,7 @@ void test_drotmg()
BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
//reference
BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param);
-
+
CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS);
CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS);
CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS);
diff --git a/version.h b/version.h
index 5c621e6..213faae 100644
--- a/version.h
+++ b/version.h
@@ -13,19 +13,19 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the ISCAS nor the names of its contributors may
- be used to endorse or promote products derived from this software
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/